添加直接输入网址下载功能;搜索功能仍有 bug 尚未修复
This commit is contained in:
parent
edd85309db
commit
ce7e0042ba
7 changed files with 143 additions and 54 deletions
1
.gitignore
vendored
1
.gitignore
vendored
|
@ -162,3 +162,4 @@ cython_debug/
|
|||
/test_get_txt.py
|
||||
/test_tmp.py
|
||||
test_search.py
|
||||
test.py
|
||||
|
|
6
.vscode/settings.json
vendored
Normal file
6
.vscode/settings.json
vendored
Normal file
|
@ -0,0 +1,6 @@
|
|||
{
|
||||
"[python]": {
|
||||
"editor.defaultFormatter": "ms-python.autopep8"
|
||||
},
|
||||
"python.formatting.provider": "none"
|
||||
}
|
|
@ -40,33 +40,42 @@ def get_user_agent():
|
|||
class Download_Novel:
|
||||
|
||||
def search_novel(self):
|
||||
hm_url = f'https://user.bqgso.cc/hm.html?&q={self.name}'
|
||||
result = requests.get(hm_url, headers=get_user_agent()).text
|
||||
# print(result)
|
||||
hm = result[2:-2]
|
||||
# hm_url = f'https://user.bqgso.cc/hm.html?&q={self.name}'
|
||||
# result = requests.get(hm_url, headers=get_user_agent()).text
|
||||
# # print(result)
|
||||
# hm = result[2:-2]
|
||||
# print(hm)
|
||||
if self.name!=None:
|
||||
# 发起请求并获取响应
|
||||
url = f'https://user.bqgso.cc/search.html?&q={self.name}&hm={hm}'
|
||||
url = f'https://www.bqg222.com/user/search.html?q={self.name}'
|
||||
# print(url)
|
||||
print(requests.get(
|
||||
url, headers=get_user_agent()).text[1:-1])
|
||||
|
||||
response = json.loads(requests.get(url, headers=get_user_agent()).text[1:-1])
|
||||
# print(type(response))
|
||||
for i, book in enumerate(response):
|
||||
# i['url_list'][:9] = 'https://www'
|
||||
trans_url = book['url_list'].replace('https://m', 'https://www')
|
||||
response[i]['url_list'] = trans_url
|
||||
response = json.loads(requests.get(
|
||||
url, headers=get_user_agent()).text[1:-1])
|
||||
# print(type(response))
|
||||
for i, book in enumerate(response):
|
||||
# i['url_list'][:9] = 'https://www'
|
||||
# trans_url = book['url_list'].replace('https://m', 'https://www')
|
||||
print(type(book['url_list']))
|
||||
trans_url ='https://www.bqg221.com' + str(book['url_list'])
|
||||
response[i]['url_list'] = trans_url
|
||||
|
||||
# 返回一个json对象
|
||||
return response
|
||||
# 返回一个json对象
|
||||
return response
|
||||
|
||||
def get_novel_info(self, response):
|
||||
def get_novel_info(self, response=None):
|
||||
|
||||
# 定义请求间隔时间(秒)
|
||||
interval = 2
|
||||
|
||||
# 设置请求头,模拟浏览器访问
|
||||
|
||||
if response!=None:
|
||||
# 要爬取的小说主页链接
|
||||
url = response['url_list']
|
||||
url = response['url_list']
|
||||
else:
|
||||
url = self.search_url
|
||||
|
||||
# 发起请求并获取响应
|
||||
url_response = requests.get(url, headers=get_user_agent())
|
||||
|
@ -76,7 +85,7 @@ class Download_Novel:
|
|||
|
||||
# 获取小说名字
|
||||
# title = soup.select_one('.book h1').get_text(strip=True)
|
||||
self.title = response['articlename']
|
||||
self.title = soup.select_one('.book h1').get_text(strip=True)
|
||||
print(self.title)
|
||||
|
||||
# 获取小说简介
|
||||
|
@ -86,7 +95,7 @@ class Download_Novel:
|
|||
all_span_tags = div_tag.find_all('span')
|
||||
# print(all_span_tags)
|
||||
# author = all_span_tags[0].text.strip()[3:]
|
||||
self.author = response['author']
|
||||
self.author = all_span_tags[0].text.strip()[3:]
|
||||
self.status = all_span_tags[1].text.strip()
|
||||
self.update_time = all_span_tags[2].text.strip()
|
||||
self.latest_update = all_span_tags[3].text.strip()
|
||||
|
@ -96,7 +105,7 @@ class Download_Novel:
|
|||
print(self.intro)
|
||||
|
||||
# cover = soup.select_one('.cover img')['src']
|
||||
self.cover = response['url_img']
|
||||
self.cover = soup.select_one('.cover img')['src']
|
||||
# print(cover)
|
||||
# 获取小说所有章节链接
|
||||
self.chapter_urls = [url + i.get('href').split('/')[-1] for i in soup.select('.listmain a') if
|
||||
|
@ -138,7 +147,8 @@ class Download_Novel:
|
|||
# if not os.path.exists(file_path):
|
||||
# os.makedirs(file_path)
|
||||
# # print('文件夹不存在,创建文件夹')
|
||||
file_name, status = self.get_multi_txt_file_status(file_name=file_name)
|
||||
file_name, status = self.get_multi_txt_file_status(
|
||||
file_name=file_name)
|
||||
|
||||
if status:
|
||||
print(file_name + ' 已存在,跳过...\n')
|
||||
|
@ -149,15 +159,19 @@ class Download_Novel:
|
|||
retry = 8
|
||||
while retry > 0:
|
||||
try:
|
||||
response = requests.get(chapter_url, headers=get_user_agent(), timeout=5)
|
||||
response = requests.get(
|
||||
chapter_url, headers=get_user_agent(), timeout=5)
|
||||
soup = BeautifulSoup(response.text, 'html.parser')
|
||||
|
||||
chapter_title = soup.select_one('.content h1').get_text()
|
||||
chapter_title = soup.select_one(
|
||||
'.content h1').get_text()
|
||||
print(chapter_title)
|
||||
chapter_content = soup.select_one('div#chaptercontent').get_text().strip()
|
||||
chapter_content = soup.select_one(
|
||||
'div#chaptercontent').get_text().strip()
|
||||
# print('before: '+chapter_content)
|
||||
# # 将所有的<br>标签替换成换行符\n
|
||||
chapter_content = chapter_content.replace(' ', '\n ')
|
||||
chapter_content = chapter_content.replace(
|
||||
' ', '\n ')
|
||||
# chapter_content = chapter_content.replace('<br>', '\n')
|
||||
content = re.sub(r'(第\d+章|请收藏本站|『点此报错).*$', '', chapter_content,
|
||||
flags=re.MULTILINE)
|
||||
|
@ -216,14 +230,16 @@ class Download_Novel:
|
|||
convert_status = True
|
||||
if convert_type == 0:
|
||||
print(self.file_path, self.download_path + self.title + '.txt')
|
||||
convert_status = self.merge_txt_file(self.download_path + self.title + '.txt')
|
||||
convert_status = self.merge_txt_file(
|
||||
self.download_path + self.title + '.txt')
|
||||
|
||||
elif convert_type == 1:
|
||||
txt_files = []
|
||||
for n in range(0, len(self.chapter_urls)):
|
||||
txt_files.append(self.file_path + str(n) + '.txt')
|
||||
|
||||
convert_status = self.merge_txt_to_epub(txt_files, self.download_path + self.title + '.epub')
|
||||
convert_status = self.merge_txt_to_epub(
|
||||
txt_files, self.download_path + self.title + '.epub')
|
||||
|
||||
if convert_status:
|
||||
print('合并成功!')
|
||||
|
@ -231,8 +247,8 @@ class Download_Novel:
|
|||
print('合并失败!请删除downloads下面目录后重新运行程序!')
|
||||
exit(1)
|
||||
|
||||
|
||||
# 合并为txt文件
|
||||
|
||||
def merge_txt_file(self, merged_file_name=''):
|
||||
"""
|
||||
|
||||
|
@ -242,7 +258,8 @@ class Download_Novel:
|
|||
# os.chdir(file_path)
|
||||
if os.path.exists(merged_file_name):
|
||||
os.remove(merged_file_name)
|
||||
print('merge file : ', sorted(os.listdir(self.file_path), key=lambda x: int(x.split('.')[0])))
|
||||
print('merge file : ', sorted(os.listdir(self.file_path),
|
||||
key=lambda x: int(x.split('.')[0])))
|
||||
time.sleep(self.interval)
|
||||
|
||||
with open(merged_file_name, 'wb') as outfile:
|
||||
|
@ -306,7 +323,8 @@ class Download_Novel:
|
|||
content = file.readlines()
|
||||
try:
|
||||
# 将所有换行符替换为<br>
|
||||
content = [s.replace('\n', '') for s in content if len(s.strip()) > 0]
|
||||
content = [s.replace('\n', '')
|
||||
for s in content if len(s.strip()) > 0]
|
||||
|
||||
# 获取章节标题
|
||||
chapter_title = content[0]
|
||||
|
@ -319,18 +337,19 @@ class Download_Novel:
|
|||
for j, line in enumerate(content[1:]):
|
||||
content[j + 1] = '<p class="calibre3">' + line + '</p>\n'
|
||||
|
||||
|
||||
# content.append('</body></html>')
|
||||
except IndexError as e:
|
||||
print(e)
|
||||
return False
|
||||
# 创建一个章节对象
|
||||
|
||||
chapter = epub.EpubHtml(title=chapter_title, file_name='text/' + str(i) + '.xhtml')
|
||||
chapter = epub.EpubHtml(
|
||||
title=chapter_title, file_name='text/' + str(i) + '.xhtml')
|
||||
chapter.content = ''.join(content) # 将整个文件内容作为章节内容
|
||||
# 下面的是将css文件引用到单个章节里面
|
||||
# page_style = open('./css/page_styles.css', 'r', encoding='utf-8').read()
|
||||
page_style1 = open('./css/page_styles1.css', 'r', encoding='utf-8').read()
|
||||
page_style1 = open('./css/page_styles1.css',
|
||||
'r', encoding='utf-8').read()
|
||||
style = open('./css/stylesheet.css', 'r', encoding='utf-8').read()
|
||||
# chapter.add_item(
|
||||
# epub.EpubItem(uid="page_style", file_name="../style/page_styles.css", media_type="text/css",
|
||||
|
@ -345,7 +364,8 @@ class Download_Novel:
|
|||
# 将章节添加到书籍中
|
||||
book.add_item(chapter)
|
||||
book.spine.append(chapter)
|
||||
book.toc.append(epub.Link('text/' + str(i) + '.xhtml', chapter_title, str(i)))
|
||||
book.toc.append(epub.Link('text/' + str(i) +
|
||||
'.xhtml', chapter_title, str(i)))
|
||||
# print('xxxxxxxx:','text/' + str(i) + '.xhtml', chapter_title, str(i))
|
||||
|
||||
# 将目录添加到书籍中
|
||||
|
@ -354,11 +374,11 @@ class Download_Novel:
|
|||
book.add_item(epub.EpubNcx())
|
||||
book.add_item(epub.EpubNav())
|
||||
|
||||
|
||||
|
||||
# 设置书籍的样式文件
|
||||
page_style = open('./css/page_styles.css', 'r', encoding='utf-8').read()
|
||||
page_style1 = open('./css/page_styles1.css', 'r', encoding='utf-8').read()
|
||||
page_style = open('./css/page_styles.css',
|
||||
'r', encoding='utf-8').read()
|
||||
page_style1 = open('./css/page_styles1.css',
|
||||
'r', encoding='utf-8').read()
|
||||
|
||||
style = open('./css/stylesheet.css', 'r', encoding='utf-8').read()
|
||||
book.add_item(
|
||||
|
@ -374,7 +394,7 @@ class Download_Novel:
|
|||
epub.write_epub(epub_file, book, {})
|
||||
return True
|
||||
|
||||
def __init__(self, name):
|
||||
def __init__(self, name=None,search_url=None):
|
||||
self.file_path = None
|
||||
self.chapter_urls = None
|
||||
self.cover = None
|
||||
|
@ -383,6 +403,7 @@ class Download_Novel:
|
|||
self.author = None
|
||||
self.title = None
|
||||
self.name = name
|
||||
self.search_url=search_url
|
||||
|
||||
# 定义请求间隔时间(秒)
|
||||
self.interval = 2
|
||||
|
@ -392,11 +413,20 @@ class Download_Novel:
|
|||
|
||||
|
||||
if __name__ == '__main__':
|
||||
search_name = input('请输入要搜索的书籍名称: ')
|
||||
if search_name:
|
||||
download_novel = Download_Novel(search_name)
|
||||
response = download_novel.search_novel()
|
||||
print(response)
|
||||
search_type=input('请选择你要下载的方式(0 or 1):\n0) 使用名称搜索\n1) 直接输入url(格式如:https://www.bqg221.com/biquge/17931/)\n')
|
||||
|
||||
# if isinstance(search_type, int) and 0 <= search_type <= 1:
|
||||
# download_novel
|
||||
download_novel = Download_Novel()
|
||||
if search_type == str(0):
|
||||
search_name = input('请输入要搜索的书籍名称: ')
|
||||
download_novel.name=search_name
|
||||
else:
|
||||
download_novel.search_url=search_type
|
||||
|
||||
response = download_novel.search_novel()
|
||||
# print(response)
|
||||
if download_novel.name!=None:
|
||||
print('搜索到 ' + str(len(response)) + ' 个结果\n')
|
||||
print('---------------------------------------\n')
|
||||
for i, book in enumerate(reversed(response)):
|
||||
|
@ -404,13 +434,25 @@ if __name__ == '__main__':
|
|||
'intro'] + '...\n')
|
||||
print('---------------------------------------')
|
||||
print('---------------------------------------\n')
|
||||
select_book = int(input(f'选择要下载的书籍序号(从0-{str(len(response) - 1)}中选择):'))
|
||||
select_book = int(
|
||||
input(f'选择要下载的书籍序号(从0-{str(len(response) - 1)}中选择):'))
|
||||
# 判断输入合法
|
||||
if isinstance(select_book, int) and 0 <= select_book <= len(response):
|
||||
download_novel.get_novel_info(response[select_book])
|
||||
download_novel.download_process()
|
||||
else:
|
||||
print('输入内容不合法!')
|
||||
|
||||
else:
|
||||
exit(0)
|
||||
# print('---------------------------------------\n')
|
||||
# # for i, book in enumerate(reversed(response)):
|
||||
# print( ' 书籍名称:' + response['articlename'] + '\n作者:' + response['author'] + '\n简介:' + response[
|
||||
# 'intro'] + '...\n')
|
||||
# print('---------------------------------------')
|
||||
# print('---------------------------------------\n')
|
||||
|
||||
download_novel.get_novel_info()
|
||||
download_novel.download_process()
|
||||
|
||||
|
||||
# else:
|
||||
# exit(0)
|
||||
|
|
|
@ -7,3 +7,4 @@
|
|||
- [x] 实现合并为epub
|
||||
- [x] 实现搜索功能
|
||||
- [ ] 实现多个书源替换
|
||||
- [x] 实现直接输入网址下载
|
36
pdm.lock
36
pdm.lock
|
@ -1,6 +1,23 @@
|
|||
# This file is @generated by PDM.
|
||||
# It is not intended for manual editing.
|
||||
|
||||
[[package]]
|
||||
name = "beautifulsoup4"
|
||||
version = "4.12.2"
|
||||
requires_python = ">=3.6.0"
|
||||
summary = "Screen-scraping library"
|
||||
dependencies = [
|
||||
"soupsieve>1.2",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "bs4"
|
||||
version = "0.0.1"
|
||||
summary = "Screen-scraping library"
|
||||
dependencies = [
|
||||
"beautifulsoup4",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "certifi"
|
||||
version = "2023.5.7"
|
||||
|
@ -58,6 +75,12 @@ version = "1.16.0"
|
|||
requires_python = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*"
|
||||
summary = "Python 2 and 3 compatibility utilities"
|
||||
|
||||
[[package]]
|
||||
name = "soupsieve"
|
||||
version = "2.4.1"
|
||||
requires_python = ">=3.7"
|
||||
summary = "A modern CSS selector implementation for Beautiful Soup."
|
||||
|
||||
[[package]]
|
||||
name = "urllib3"
|
||||
version = "2.0.3"
|
||||
|
@ -68,9 +91,16 @@ summary = "HTTP library with thread-safe connection pooling, file post, and more
|
|||
lock_version = "4.2"
|
||||
cross_platform = true
|
||||
groups = ["default"]
|
||||
content_hash = "sha256:095da8eef1987e0630ebddf4a4513ad826ad8630af33554248065ff297544cb6"
|
||||
content_hash = "sha256:cecd9231f4ed5227cc9f2c8a4225b2e84fc47b083042291cb5e26d8bc24a7199"
|
||||
|
||||
[metadata.files]
|
||||
"beautifulsoup4 4.12.2" = [
|
||||
{url = "https://files.pythonhosted.org/packages/57/f4/a69c20ee4f660081a7dedb1ac57f29be9378e04edfcb90c526b923d4bebc/beautifulsoup4-4.12.2-py3-none-any.whl", hash = "sha256:bd2520ca0d9d7d12694a53d44ac482d181b4ec1888909b035a3dbf40d0f57d4a"},
|
||||
{url = "https://files.pythonhosted.org/packages/af/0b/44c39cf3b18a9280950ad63a579ce395dda4c32193ee9da7ff0aed547094/beautifulsoup4-4.12.2.tar.gz", hash = "sha256:492bbc69dca35d12daac71c4db1bfff0c876c00ef4a2ffacce226d4638eb72da"},
|
||||
]
|
||||
"bs4 0.0.1" = [
|
||||
{url = "https://files.pythonhosted.org/packages/10/ed/7e8b97591f6f456174139ec089c769f89a94a1a4025fe967691de971f314/bs4-0.0.1.tar.gz", hash = "sha256:36ecea1fd7cc5c0c6e4a1ff075df26d50da647b75376626cc186e2212886dd3a"},
|
||||
]
|
||||
"certifi 2023.5.7" = [
|
||||
{url = "https://files.pythonhosted.org/packages/93/71/752f7a4dd4c20d6b12341ed1732368546bc0ca9866139fe812f6009d9ac7/certifi-2023.5.7.tar.gz", hash = "sha256:0f0d56dc5a6ad56fd4ba36484d6cc34451e1c6548c61daad8c320169f91eddc7"},
|
||||
{url = "https://files.pythonhosted.org/packages/9d/19/59961b522e6757f0c9097e4493fa906031b95b3ebe9360b2c3083561a6b4/certifi-2023.5.7-py3-none-any.whl", hash = "sha256:c6c2e98f5c7869efca1f8916fed228dd91539f9f1b444c314c06eef02980c716"},
|
||||
|
@ -250,6 +280,10 @@ content_hash = "sha256:095da8eef1987e0630ebddf4a4513ad826ad8630af33554248065ff29
|
|||
{url = "https://files.pythonhosted.org/packages/71/39/171f1c67cd00715f190ba0b100d606d440a28c93c7714febeca8b79af85e/six-1.16.0.tar.gz", hash = "sha256:1e61c37477a1626458e36f7b1d82aa5c9b094fa4802892072e49de9c60c4c926"},
|
||||
{url = "https://files.pythonhosted.org/packages/d9/5a/e7c31adbe875f2abbb91bd84cf2dc52d792b5a01506781dbcf25c91daf11/six-1.16.0-py2.py3-none-any.whl", hash = "sha256:8abb2f1d86890a2dfb989f9a77cfcfd3e47c2a354b01111771326f8aa26e0254"},
|
||||
]
|
||||
"soupsieve 2.4.1" = [
|
||||
{url = "https://files.pythonhosted.org/packages/47/9e/780779233a615777fbdf75a4dee2af7a345f4bf74b42d4a5f836800b9d91/soupsieve-2.4.1.tar.gz", hash = "sha256:89d12b2d5dfcd2c9e8c22326da9d9aa9cb3dfab0a83a024f05704076ee8d35ea"},
|
||||
{url = "https://files.pythonhosted.org/packages/49/37/673d6490efc51ec46d198c75903d99de59baffdd47aea3d071b80a9e4e89/soupsieve-2.4.1-py3-none-any.whl", hash = "sha256:1c1bfee6819544a3447586c889157365a27e10d88cde3ad3da0cf0ddf646feb8"},
|
||||
]
|
||||
"urllib3 2.0.3" = [
|
||||
{url = "https://files.pythonhosted.org/packages/8a/03/ad9306a50d05c166e3456fe810f33cee2b8b2a7a6818ec5d4908c4ec6b36/urllib3-2.0.3-py3-none-any.whl", hash = "sha256:48e7fafa40319d358848e1bc6809b208340fafe2096f1725d05d67443d0483d1"},
|
||||
{url = "https://files.pythonhosted.org/packages/d6/af/3b4cfedd46b3addab52e84a71ab26518272c23c77116de3c61ead54af903/urllib3-2.0.3.tar.gz", hash = "sha256:bee28b5e56addb8226c96f7f13ac28cb4c301dd5ea8a6ca179c0b9835e032825"},
|
||||
|
|
|
@ -10,6 +10,7 @@ dependencies = [
|
|||
"requests>=2.31.0",
|
||||
"ebooklib>=0.18",
|
||||
"setuptools>=68.0.0",
|
||||
"bs4>=0.0.1",
|
||||
]
|
||||
requires-python = ">=3.11"
|
||||
license = {text = "MIT"}
|
||||
|
|
|
@ -35,19 +35,23 @@ class Download_Novel:
|
|||
def search_novel(self):
|
||||
# 定义请求间隔时间(秒)
|
||||
interval = 2
|
||||
hm_url = f'https://user.bqgso.cc/hm.html?&q={self.name}'
|
||||
result = requests.get(hm_url, headers=self.get_user_agent()).text
|
||||
# hm_url = f'https://user.bqgso.cc/hm.html?&q={self.name}'
|
||||
# result = requests.get(hm_url, headers=self.get_user_agent()).text
|
||||
# print(result)
|
||||
hm = result[2:-2]
|
||||
# hm = result[2:-2]
|
||||
# print(hm)
|
||||
# 发起请求并获取响应
|
||||
url = f'https://user.bqgso.cc/search.html?&q={self.name}&hm={hm}'
|
||||
|
||||
url = f'https://www.bqg221.com/user/search.html?q={self.name}'
|
||||
print(url)
|
||||
print('响应内容:',requests.get(url, headers=self.get_user_agent()).text)
|
||||
response = json.loads(requests.get(url, headers=self.get_user_agent()).text[1:-1])
|
||||
print(response)
|
||||
# print(type(response))
|
||||
for i, book in enumerate(response):
|
||||
# i['url_list'][:9] = 'https://www'
|
||||
trans_url = book['url_list'].replace('https://m', 'https://www')
|
||||
# trans_url = book['url_list'].replace('https://m', 'https://www')
|
||||
print(type(book['url_list']))
|
||||
trans_url ='https://www.bqg221.com' + str(book['url_list'])
|
||||
response[i]['url_list'] = trans_url
|
||||
|
||||
# 返回一个json对象
|
||||
|
|
Loading…
Reference in a new issue