Add support for downloading directly from a pasted URL; the search feature still has an unfixed bug

liyp 2023-07-18 12:49:14 +08:00
parent edd85309db
commit ce7e0042ba
7 changed files with 143 additions and 54 deletions

.gitignore vendored (1 addition)

@@ -162,3 +162,4 @@ cython_debug/
 /test_get_txt.py
 /test_tmp.py
 test_search.py
+test.py

.vscode/settings.json vendored (new file, 6 additions)

@@ -0,0 +1,6 @@
+{
+    "[python]": {
+        "editor.defaultFormatter": "ms-python.autopep8"
+    },
+    "python.formatting.provider": "none"
+}


@@ -40,33 +40,42 @@ def get_user_agent():
 class Download_Novel:

     def search_novel(self):
-        hm_url = f'https://user.bqgso.cc/hm.html?&q={self.name}'
-        result = requests.get(hm_url, headers=get_user_agent()).text
-        # print(result)
-        hm = result[2:-2]
+        # hm_url = f'https://user.bqgso.cc/hm.html?&q={self.name}'
+        # result = requests.get(hm_url, headers=get_user_agent()).text
+        # # print(result)
+        # hm = result[2:-2]
         # print(hm)
+        if self.name != None:
             # send the request and get the response
-            url = f'https://user.bqgso.cc/search.html?&q={self.name}&hm={hm}'
-            response = json.loads(requests.get(url, headers=get_user_agent()).text[1:-1])
+            url = f'https://www.bqg222.com/user/search.html?q={self.name}'
+            # print(url)
+            print(requests.get(
+                url, headers=get_user_agent()).text[1:-1])
+            response = json.loads(requests.get(
+                url, headers=get_user_agent()).text[1:-1])
             # print(type(response))
             for i, book in enumerate(response):
                 # i['url_list'][:9] = 'https://www'
-                trans_url = book['url_list'].replace('https://m', 'https://www')
+                # trans_url = book['url_list'].replace('https://m', 'https://www')
+                print(type(book['url_list']))
+                trans_url = 'https://www.bqg221.com' + str(book['url_list'])
                 response[i]['url_list'] = trans_url
             # return a JSON object
             return response

-    def get_novel_info(self, response):
+    def get_novel_info(self, response=None):
         # define the request interval (seconds)
         interval = 2
         # set request headers to mimic a browser
+        if response != None:
             # homepage URL of the novel to crawl
             url = response['url_list']
+        else:
+            url = self.search_url

         # send the request and get the response
         url_response = requests.get(url, headers=get_user_agent())
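Editor's note: taken together, the two edited methods implement the new flow: search by name against the new endpoint, or skip the search and fetch a book page directly via search_url. A minimal standalone sketch of the search half, under the assumptions visible in the diff (the endpoint, the [1:-1] slice that unwraps the JSON payload, and the site-root prefix; HEADERS stands in for the project's get_user_agent()). Note the committed code queries bqg222.com but prefixes result links with bqg221.com; that mismatch is in the commit itself.

import json
import requests

HEADERS = {'User-Agent': 'Mozilla/5.0'}  # stand-in for get_user_agent()

def search_books(name):
    # The endpoint returns a JSON array wrapped in one extra character
    # on each side, hence the [1:-1] slice before json.loads().
    url = f'https://www.bqg222.com/user/search.html?q={name}'
    raw = requests.get(url, headers=HEADERS).text
    books = json.loads(raw[1:-1])
    for book in books:
        # url_list is now a site-relative path, so prefix the web root.
        book['url_list'] = 'https://www.bqg221.com' + str(book['url_list'])
    return books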
@@ -76,7 +85,7 @@

         # get the novel title
         # title = soup.select_one('.book h1').get_text(strip=True)
-        self.title = response['articlename']
+        self.title = soup.select_one('.book h1').get_text(strip=True)
         print(self.title)

         # get the novel synopsis
@@ -86,7 +95,7 @@
         all_span_tags = div_tag.find_all('span')
         # print(all_span_tags)
         # author = all_span_tags[0].text.strip()[3:]
-        self.author = response['author']
+        self.author = all_span_tags[0].text.strip()[3:]
         self.status = all_span_tags[1].text.strip()
         self.update_time = all_span_tags[2].text.strip()
         self.latest_update = all_span_tags[3].text.strip()
@@ -96,7 +105,7 @@
         print(self.intro)

         # cover = soup.select_one('.cover img')['src']
-        self.cover = response['url_img']
+        self.cover = soup.select_one('.cover img')['src']
         # print(cover)

         # get all chapter links of the novel
         self.chapter_urls = [url + i.get('href').split('/')[-1] for i in soup.select('.listmain a') if
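Editor's note: these three hunks make the same move: title, author, and cover used to be copied out of the search response, but a direct-URL download has no search response, so they are now scraped from the book page itself. A sketch of that scraping, using only the selectors the diff shows (the URL is a placeholder and real pages may lack these nodes):

import requests
from bs4 import BeautifulSoup

HEADERS = {'User-Agent': 'Mozilla/5.0'}  # stand-in for get_user_agent()

def get_book_meta(url):
    # Parse the book's landing page instead of trusting search metadata.
    soup = BeautifulSoup(requests.get(url, headers=HEADERS).text, 'html.parser')
    title = soup.select_one('.book h1').get_text(strip=True)
    cover = soup.select_one('.cover img')['src']
    return title, cover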
@@ -138,7 +147,8 @@ class Download_Novel:
             # if not os.path.exists(file_path):
             #     os.makedirs(file_path)
             #     # print('文件夹不存在,创建文件夹')
-            file_name, status = self.get_multi_txt_file_status(file_name=file_name)
+            file_name, status = self.get_multi_txt_file_status(
+                file_name=file_name)

             if status:
                 print(file_name + ' 已存在,跳过...\n')
@@ -149,15 +159,19 @@ class Download_Novel:
             retry = 8
             while retry > 0:
                 try:
-                    response = requests.get(chapter_url, headers=get_user_agent(), timeout=5)
+                    response = requests.get(
+                        chapter_url, headers=get_user_agent(), timeout=5)
                     soup = BeautifulSoup(response.text, 'html.parser')
-                    chapter_title = soup.select_one('.content h1').get_text()
+                    chapter_title = soup.select_one(
+                        '.content h1').get_text()
                     print(chapter_title)
-                    chapter_content = soup.select_one('div#chaptercontent').get_text().strip()
+                    chapter_content = soup.select_one(
+                        'div#chaptercontent').get_text().strip()
                     # print('before: '+chapter_content)
                     # # replace all <br> tags with newline characters
-                    chapter_content = chapter_content.replace('  ', '\n  ')
+                    chapter_content = chapter_content.replace(
+                        '  ', '\n  ')
                     # chapter_content = chapter_content.replace('<br>', '\n')
                     content = re.sub(r'(第\d+章|请收藏本站|『点此报错).*$', '', chapter_content,
                                      flags=re.MULTILINE)
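Editor's note: this hunk is only autopep8 line wrapping, but the pattern is easier to read in one piece: up to 8 attempts with a 5-second timeout, then the chapter body is cleaned with a multiline regex. A condensed sketch, not the project's exact error handling (chapter_url is a placeholder; selectors and regex come from the diff):

import re
import requests
from bs4 import BeautifulSoup

HEADERS = {'User-Agent': 'Mozilla/5.0'}  # stand-in for get_user_agent()

def fetch_chapter(chapter_url, retries=8):
    while retries > 0:
        try:
            resp = requests.get(chapter_url, headers=HEADERS, timeout=5)
            soup = BeautifulSoup(resp.text, 'html.parser')
            title = soup.select_one('.content h1').get_text()
            body = soup.select_one('div#chaptercontent').get_text().strip()
            # Strip repeated chapter headings and site boilerplate lines.
            body = re.sub(r'(第\d+章|请收藏本站|『点此报错).*$', '', body,
                          flags=re.MULTILINE)
            return title, body
        except requests.RequestException:
            retries -= 1
    return None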
@@ -216,14 +230,16 @@ class Download_Novel:
         convert_status = True
         if convert_type == 0:
             print(self.file_path, self.download_path + self.title + '.txt')
-            convert_status = self.merge_txt_file(self.download_path + self.title + '.txt')
+            convert_status = self.merge_txt_file(
+                self.download_path + self.title + '.txt')

         elif convert_type == 1:
             txt_files = []
             for n in range(0, len(self.chapter_urls)):
                 txt_files.append(self.file_path + str(n) + '.txt')
-            convert_status = self.merge_txt_to_epub(txt_files, self.download_path + self.title + '.epub')
+            convert_status = self.merge_txt_to_epub(
+                txt_files, self.download_path + self.title + '.epub')

         if convert_status:
             print('合并成功!')
@@ -231,8 +247,8 @@ class Download_Novel:
             print('合并失败请删除downloads下面目录后重新运行程序')
             exit(1)

     # merge into a single txt file
     def merge_txt_file(self, merged_file_name=''):
         """
@@ -242,7 +258,8 @@ class Download_Novel:
         # os.chdir(file_path)
         if os.path.exists(merged_file_name):
             os.remove(merged_file_name)
-        print('merge file : ', sorted(os.listdir(self.file_path), key=lambda x: int(x.split('.')[0])))
+        print('merge file : ', sorted(os.listdir(self.file_path),
+                                      key=lambda x: int(x.split('.')[0])))
         time.sleep(self.interval)
         with open(merged_file_name, 'wb') as outfile:
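Editor's note: the only change here is wrapping, but the sort key is the load-bearing detail: chapter files are named 0.txt, 1.txt, ..., and a plain lexicographic sort would put 10.txt before 2.txt. For example:

files = ['0.txt', '10.txt', '2.txt']
print(sorted(files))                                      # ['0.txt', '10.txt', '2.txt']
print(sorted(files, key=lambda x: int(x.split('.')[0])))  # ['0.txt', '2.txt', '10.txt']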
@@ -306,7 +323,8 @@ class Download_Novel:
                 content = file.readlines()
                 try:
                     # replace all newline characters with <br>
-                    content = [s.replace('\n', '') for s in content if len(s.strip()) > 0]
+                    content = [s.replace('\n', '')
+                               for s in content if len(s.strip()) > 0]

                     # get the chapter title
                     chapter_title = content[0]
@@ -319,18 +337,19 @@ class Download_Novel:
                     for j, line in enumerate(content[1:]):
                         content[j + 1] = '<p class="calibre3">' + line + '</p>\n'
                     # content.append('</body></html>')

                 except IndexError as e:
                     print(e)
                     return False

                 # create a chapter object
-                chapter = epub.EpubHtml(title=chapter_title, file_name='text/' + str(i) + '.xhtml')
+                chapter = epub.EpubHtml(
+                    title=chapter_title, file_name='text/' + str(i) + '.xhtml')
                 chapter.content = ''.join(content)  # the whole file content becomes the chapter body
                 # the following attaches the css files to each individual chapter
                 # page_style = open('./css/page_styles.css', 'r', encoding='utf-8').read()
-                page_style1 = open('./css/page_styles1.css', 'r', encoding='utf-8').read()
+                page_style1 = open('./css/page_styles1.css',
+                                   'r', encoding='utf-8').read()
                 style = open('./css/stylesheet.css', 'r', encoding='utf-8').read()
                 # chapter.add_item(
                 #     epub.EpubItem(uid="page_style", file_name="../style/page_styles.css", media_type="text/css",
@@ -345,7 +364,8 @@ class Download_Novel:
                 # add the chapter to the book
                 book.add_item(chapter)
                 book.spine.append(chapter)
-                book.toc.append(epub.Link('text/' + str(i) + '.xhtml', chapter_title, str(i)))
+                book.toc.append(epub.Link('text/' + str(i) +
+                                          '.xhtml', chapter_title, str(i)))
                 # print('xxxxxxxx:','text/' + str(i) + '.xhtml', chapter_title, str(i))

         # add the table of contents to the book
@@ -354,11 +374,11 @@ class Download_Novel:
         book.add_item(epub.EpubNcx())
         book.add_item(epub.EpubNav())

         # set the book's stylesheet files
-        page_style = open('./css/page_styles.css', 'r', encoding='utf-8').read()
-        page_style1 = open('./css/page_styles1.css', 'r', encoding='utf-8').read()
+        page_style = open('./css/page_styles.css',
+                          'r', encoding='utf-8').read()
+        page_style1 = open('./css/page_styles1.css',
+                           'r', encoding='utf-8').read()
         style = open('./css/stylesheet.css', 'r', encoding='utf-8').read()
         book.add_item(
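Editor's note: several of the wrapped lines above belong to one ebooklib recipe that is easier to read whole. A minimal sketch of the assembly, mirroring the EpubHtml/spine/toc/Ncx/Nav calls visible in the diff (title and chapter data are placeholders, not the project's real inputs):

from ebooklib import epub

book = epub.EpubBook()
book.set_title('example')

chapters = [('第一章', '<h1>第一章</h1><p>placeholder body</p>')]
for i, (title, html) in enumerate(chapters):
    ch = epub.EpubHtml(title=title, file_name='text/' + str(i) + '.xhtml')
    ch.content = html
    book.add_item(ch)
    book.spine.append(ch)          # reading order
    book.toc.append(epub.Link('text/' + str(i) + '.xhtml', title, str(i)))

# NCX and Nav give readers a navigable table of contents.
book.add_item(epub.EpubNcx())
book.add_item(epub.EpubNav())
epub.write_epub('example.epub', book, {})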
@@ -374,7 +394,7 @@ class Download_Novel:
         epub.write_epub(epub_file, book, {})
         return True

-    def __init__(self, name):
+    def __init__(self, name=None, search_url=None):
         self.file_path = None
         self.chapter_urls = None
         self.cover = None
@@ -383,6 +403,7 @@ class Download_Novel:
         self.author = None
         self.title = None
         self.name = name
+        self.search_url = search_url
         # define the request interval (seconds)
         self.interval = 2
@@ -392,11 +413,20 @@

 if __name__ == '__main__':

+    search_type = input('请选择你要下载的方式(0 or 1)\n0) 使用名称搜索\n1) 直接输入url(格式如https://www.bqg221.com/biquge/17931/)\n')
+    # if isinstance(search_type, int) and 0 <= search_type <= 1:
+    #     download_novel
+    download_novel = Download_Novel()
+    if search_type == str(0):
         search_name = input('请输入要搜索的书籍名称: ')
-    if search_name:
-        download_novel = Download_Novel(search_name)
+        download_novel.name = search_name
+    else:
+        download_novel.search_url = search_type
     response = download_novel.search_novel()
-    print(response)
+    # print(response)
+    if download_novel.name != None:
         print('搜索到 ' + str(len(response)) + ' 个结果\n')
         print('---------------------------------------\n')
         for i, book in enumerate(reversed(response)):
@@ -404,13 +434,25 @@ if __name__ == '__main__':
                   'intro'] + '...\n')
             print('---------------------------------------')
         print('---------------------------------------\n')
-        select_book = int(input(f'选择要下载的书籍序号(从0-{str(len(response) - 1)}中选择)'))
+        select_book = int(
+            input(f'选择要下载的书籍序号(从0-{str(len(response) - 1)}中选择)'))
         # validate the input
         if isinstance(select_book, int) and 0 <= select_book <= len(response):
             download_novel.get_novel_info(response[select_book])
             download_novel.download_process()
         else:
             print('输入内容不合法!')
     else:
-        exit(0)
+        # print('---------------------------------------\n')
+        # # for i, book in enumerate(reversed(response)):
+        # print(' 书籍名称:' + response['articlename'] + '\n作者' + response['author'] + '\n简介' + response[
+        #     'intro'] + '...\n')
+        # print('---------------------------------------')
+        # print('---------------------------------------\n')
+        download_novel.get_novel_info()
+        download_novel.download_process()
+        # else:
+        #     exit(0)
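Editor's note: the new entry flow in plain terms: prompt once, treat '0' as a name search, and treat any other input as a direct book URL, which is also why a stray typo currently falls through to the URL branch and why search_novel() returns nothing there, matching the "search still has a bug" note in the commit message. A condensed sketch of the control flow, assuming the Download_Novel class above, not a drop-in replacement:

if __name__ == '__main__':
    choice = input('0) search by name, or paste a book URL such as '
                   'https://www.bqg221.com/biquge/17931/\n')
    novel = Download_Novel()
    if choice == '0':
        novel.name = input('book name: ')
        results = novel.search_novel()
        # ... let the user pick results[n], then:
        # novel.get_novel_info(results[n]); novel.download_process()
    else:
        # Anything that is not '0' is treated as a direct URL.
        novel.search_url = choice
        novel.get_novel_info()
        novel.download_process()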


@@ -7,3 +7,4 @@
 - [x] Implement merging into EPUB
 - [x] Implement search
 - [ ] Support multiple book sources
+- [x] Implement downloading directly from a URL


@@ -1,6 +1,23 @@
 # This file is @generated by PDM.
 # It is not intended for manual editing.

+[[package]]
+name = "beautifulsoup4"
+version = "4.12.2"
+requires_python = ">=3.6.0"
+summary = "Screen-scraping library"
+dependencies = [
+    "soupsieve>1.2",
+]
+
+[[package]]
+name = "bs4"
+version = "0.0.1"
+summary = "Screen-scraping library"
+dependencies = [
+    "beautifulsoup4",
+]
+
 [[package]]
 name = "certifi"
 version = "2023.5.7"
@@ -58,6 +75,12 @@ version = "1.16.0"
 requires_python = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*"
 summary = "Python 2 and 3 compatibility utilities"

+[[package]]
+name = "soupsieve"
+version = "2.4.1"
+requires_python = ">=3.7"
+summary = "A modern CSS selector implementation for Beautiful Soup."
+
 [[package]]
 name = "urllib3"
 version = "2.0.3"
@@ -68,9 +91,16 @@ summary = "HTTP library with thread-safe connection pooling, file post, and more
 lock_version = "4.2"
 cross_platform = true
 groups = ["default"]
-content_hash = "sha256:095da8eef1987e0630ebddf4a4513ad826ad8630af33554248065ff297544cb6"
+content_hash = "sha256:cecd9231f4ed5227cc9f2c8a4225b2e84fc47b083042291cb5e26d8bc24a7199"

 [metadata.files]
+"beautifulsoup4 4.12.2" = [
+    {url = "https://files.pythonhosted.org/packages/57/f4/a69c20ee4f660081a7dedb1ac57f29be9378e04edfcb90c526b923d4bebc/beautifulsoup4-4.12.2-py3-none-any.whl", hash = "sha256:bd2520ca0d9d7d12694a53d44ac482d181b4ec1888909b035a3dbf40d0f57d4a"},
+    {url = "https://files.pythonhosted.org/packages/af/0b/44c39cf3b18a9280950ad63a579ce395dda4c32193ee9da7ff0aed547094/beautifulsoup4-4.12.2.tar.gz", hash = "sha256:492bbc69dca35d12daac71c4db1bfff0c876c00ef4a2ffacce226d4638eb72da"},
+]
+"bs4 0.0.1" = [
+    {url = "https://files.pythonhosted.org/packages/10/ed/7e8b97591f6f456174139ec089c769f89a94a1a4025fe967691de971f314/bs4-0.0.1.tar.gz", hash = "sha256:36ecea1fd7cc5c0c6e4a1ff075df26d50da647b75376626cc186e2212886dd3a"},
+]
 "certifi 2023.5.7" = [
     {url = "https://files.pythonhosted.org/packages/93/71/752f7a4dd4c20d6b12341ed1732368546bc0ca9866139fe812f6009d9ac7/certifi-2023.5.7.tar.gz", hash = "sha256:0f0d56dc5a6ad56fd4ba36484d6cc34451e1c6548c61daad8c320169f91eddc7"},
     {url = "https://files.pythonhosted.org/packages/9d/19/59961b522e6757f0c9097e4493fa906031b95b3ebe9360b2c3083561a6b4/certifi-2023.5.7-py3-none-any.whl", hash = "sha256:c6c2e98f5c7869efca1f8916fed228dd91539f9f1b444c314c06eef02980c716"},
@@ -250,6 +280,10 @@ content_hash = "sha256:095da8eef1987e0630ebddf4a4513ad826ad8630af33554248065ff29
     {url = "https://files.pythonhosted.org/packages/71/39/171f1c67cd00715f190ba0b100d606d440a28c93c7714febeca8b79af85e/six-1.16.0.tar.gz", hash = "sha256:1e61c37477a1626458e36f7b1d82aa5c9b094fa4802892072e49de9c60c4c926"},
     {url = "https://files.pythonhosted.org/packages/d9/5a/e7c31adbe875f2abbb91bd84cf2dc52d792b5a01506781dbcf25c91daf11/six-1.16.0-py2.py3-none-any.whl", hash = "sha256:8abb2f1d86890a2dfb989f9a77cfcfd3e47c2a354b01111771326f8aa26e0254"},
 ]
+"soupsieve 2.4.1" = [
+    {url = "https://files.pythonhosted.org/packages/47/9e/780779233a615777fbdf75a4dee2af7a345f4bf74b42d4a5f836800b9d91/soupsieve-2.4.1.tar.gz", hash = "sha256:89d12b2d5dfcd2c9e8c22326da9d9aa9cb3dfab0a83a024f05704076ee8d35ea"},
+    {url = "https://files.pythonhosted.org/packages/49/37/673d6490efc51ec46d198c75903d99de59baffdd47aea3d071b80a9e4e89/soupsieve-2.4.1-py3-none-any.whl", hash = "sha256:1c1bfee6819544a3447586c889157365a27e10d88cde3ad3da0cf0ddf646feb8"},
+]
 "urllib3 2.0.3" = [
     {url = "https://files.pythonhosted.org/packages/8a/03/ad9306a50d05c166e3456fe810f33cee2b8b2a7a6818ec5d4908c4ec6b36/urllib3-2.0.3-py3-none-any.whl", hash = "sha256:48e7fafa40319d358848e1bc6809b208340fafe2096f1725d05d67443d0483d1"},
     {url = "https://files.pythonhosted.org/packages/d6/af/3b4cfedd46b3addab52e84a71ab26518272c23c77116de3c61ead54af903/urllib3-2.0.3.tar.gz", hash = "sha256:bee28b5e56addb8226c96f7f13ac28cb4c301dd5ea8a6ca179c0b9835e032825"},


@@ -10,6 +10,7 @@ dependencies = [
     "requests>=2.31.0",
     "ebooklib>=0.18",
     "setuptools>=68.0.0",
+    "bs4>=0.0.1",
 ]
 requires-python = ">=3.11"
 license = {text = "MIT"}
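Editor's note: the bs4 package on PyPI is only a shim that pulls in beautifulsoup4 (the lock file above records both, plus the soupsieve selector backend). The import path is the same either way:

from bs4 import BeautifulSoup

soup = BeautifulSoup('<p>hello</p>', 'html.parser')
print(soup.p.get_text())  # hello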


@@ -35,19 +35,23 @@ class Download_Novel:

     def search_novel(self):
         # define the request interval (seconds)
         interval = 2
-        hm_url = f'https://user.bqgso.cc/hm.html?&q={self.name}'
-        result = requests.get(hm_url, headers=self.get_user_agent()).text
+        # hm_url = f'https://user.bqgso.cc/hm.html?&q={self.name}'
+        # result = requests.get(hm_url, headers=self.get_user_agent()).text
         # print(result)
-        hm = result[2:-2]
+        # hm = result[2:-2]
         # print(hm)
         # send the request and get the response
-        url = f'https://user.bqgso.cc/search.html?&q={self.name}&hm={hm}'
+        url = f'https://www.bqg221.com/user/search.html?q={self.name}'
+        print(url)
+        print('响应内容:', requests.get(url, headers=self.get_user_agent()).text)
         response = json.loads(requests.get(url, headers=self.get_user_agent()).text[1:-1])
+        print(response)
         # print(type(response))
         for i, book in enumerate(response):
             # i['url_list'][:9] = 'https://www'
-            trans_url = book['url_list'].replace('https://m', 'https://www')
+            # trans_url = book['url_list'].replace('https://m', 'https://www')
+            print(type(book['url_list']))
+            trans_url = 'https://www.bqg221.com' + str(book['url_list'])
             response[i]['url_list'] = trans_url
         # return a JSON object