Implement search functionality

commit fd3bd8656e
parent 9d75014e58
liyp, 2023-06-21 15:44:42 +08:00
6 changed files with 368 additions and 235 deletions

.gitignore

@@ -161,6 +161,4 @@ cython_debug/
 .pdm-python
 /test_get_txt.py
 /test_tmp.py
-/深空彼岸/
-/深空彼岸.epub
-/斗破苍穹.epub
+test_search.py

@@ -1,14 +1,14 @@
 import io
+import json
 import os
 import random
+import re
 import shutil
 import threading
 import time
-import requests
 from bs4 import BeautifulSoup
-import re
+import requests
 from ebooklib import epub
@@ -37,63 +37,110 @@ def get_user_agent():
     return {'User-Agent': user_agent}


-# Request interval between pages (seconds)
-interval = 2
-
-# Set request headers to mimic a browser
-
-# Homepage URL of the novel to crawl
-url = 'https://www.bqg221.com/xs/17931/'
-
-# Send the request and fetch the response
-response = requests.get(url, headers=get_user_agent())
-# Parse the response into a BeautifulSoup object
-soup = BeautifulSoup(response.text, 'html.parser')
-# Get the novel title
-title = soup.select_one('.book h1').get_text(strip=True)
-print(title)
-# Get the novel synopsis
-# print(soup.select('.small')[0])
-div_tag = soup.find('div', {'class': 'small'})
-# print(div_tag)
-all_span_tags = div_tag.find_all('span')
-# print(all_span_tags)
-author = all_span_tags[0].text.strip()[3:]
-status = all_span_tags[1].text.strip()
-update_time = all_span_tags[2].text.strip()
-latest_update = all_span_tags[3].text.strip()
-# for i in all_span_tags:
-#     print(i.text.strip())
-intro = soup.select_one('.intro').get_text(strip=True)[:-6]
-print(intro)
-cover = soup.select_one('.cover img')['src']
-# print(cover)
-# Collect all chapter links
-chapter_urls = [url + i.get('href').split('/')[-1] for i in soup.select('.listmain a') if
-                i.get('href').split('/')[-1] != 'javascript:dd_show()']
-# print(chapter_urls)
-print('开始下载。。。')
-# Pause for the interval
-time.sleep(interval)
-
-
-# Download the txt chapters with multiple threads
-def download_url(chapter_url, file_name):
-    # Limit the number of download threads
-    with semaphore:
-        file_path = './' + title
-        file_name = file_path + '/' + file_name
-        if not os.path.exists(file_path):
-            os.makedirs(file_path)
-            print('文件夹不存在,创建文件夹')
+class Download_Novel:
+
+    def search_novel(self):
+        hm_url = f'https://user.bqgso.cc/hm.html?&q={self.name}'
+        result = requests.get(hm_url, headers=get_user_agent()).text
+        # print(result)
+        hm = result[2:-2]
+        # print(hm)
+        # Send the search request and fetch the response
+        url = f'https://user.bqgso.cc/search.html?&q={self.name}&hm={hm}'
+        response = json.loads(requests.get(url, headers=get_user_agent()).text[1:-1])
+        # print(type(response))
+        for i, book in enumerate(response):
+            # i['url_list'][:9] = 'https://www'
+            trans_url = book['url_list'].replace('https://m', 'https://www')
+            response[i]['url_list'] = trans_url
+        # Return the parsed JSON result list
+        return response
+
+    def get_novel_info(self, response):
+
+        # Request interval between pages (seconds)
+        interval = 2
+
+        # Set request headers to mimic a browser
+
+        # Homepage URL of the novel to crawl
+        url = response['url_list']
+
+        # Send the request and fetch the response
+        url_response = requests.get(url, headers=get_user_agent())
+        # Parse the response into a BeautifulSoup object
+        soup = BeautifulSoup(url_response.text, 'html.parser')
+        # Get the novel title
+        # title = soup.select_one('.book h1').get_text(strip=True)
+        self.title = response['articlename']
+        print(self.title)
+        # Get the novel synopsis
+        # print(soup.select('.small')[0])
+        div_tag = soup.find('div', {'class': 'small'})
+        # print(div_tag)
+        all_span_tags = div_tag.find_all('span')
+        # print(all_span_tags)
+        # author = all_span_tags[0].text.strip()[3:]
+        self.author = response['author']
+        self.status = all_span_tags[1].text.strip()
+        self.update_time = all_span_tags[2].text.strip()
+        self.latest_update = all_span_tags[3].text.strip()
+        # for i in all_span_tags:
+        #     print(i.text.strip())
+        self.intro = soup.select_one('.intro').get_text(strip=True)[:-6]
+        print(self.intro)
+        # cover = soup.select_one('.cover img')['src']
+        self.cover = response['url_img']
+        # print(cover)
+        # Collect all chapter links
+        self.chapter_urls = [url + i.get('href').split('/')[-1] for i in soup.select('.listmain a') if
+                             i.get('href').split('/')[-1] != 'javascript:dd_show()']
+        # print(chapter_urls)
+        print('开始下载。。。')
+
+        # Directory of the current script file
+        dir_path = os.path.dirname(os.path.abspath(__file__))
+
+        self.download_path = dir_path + '/downloads/'
+        self.file_path = self.download_path + self.title + '/'
+        if not os.path.exists(self.file_path):
+            os.makedirs(self.file_path)
+
+        # Pause for the interval
+        time.sleep(self.interval)
+
+    def get_multi_txt_file_status(self, file_name):
+        file_name = self.file_path + file_name
         if os.path.exists(file_name) and os.path.getsize(file_name) > 0:
+            print(file_name + ' 已存在,跳过...\n')
+            return file_name, True
+        else:
+            return file_name, False
+
+    def download_url(self, chapter_url, file_name):
+        # Limit the number of download threads
+        with self.semaphore:
+            # Directory of the current script file
+            # dir_path = os.path.dirname(os.path.abspath(__file__))
+            #
+            # file_path = dir_path + '/downloads/' + self.title
+            # file_name = file_path + '/' + file_name
+            #
+            # if not os.path.exists(file_path):
+            #     os.makedirs(file_path)
+            #     # print('文件夹不存在,创建文件夹')
+            file_name, status = self.get_multi_txt_file_status(file_name=file_name)
+            if status:
                 print(file_name + ' 已存在,跳过...\n')
                 # success_account =+ 1
             else:
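
The search flow added above takes two round trips: `hm.html` returns a short token (the `[2:-2]` slice strips two framing characters on each side), and `search.html` returns the result list as JSON wrapped in one extra character at each end. A standalone sketch of that flow, assuming the endpoints respond as the slices in the diff imply:

```python
import json
import requests

def search_books(name):
    headers = {'User-Agent': 'Mozilla/5.0'}  # any desktop browser UA
    # Round trip 1: fetch the hm token; [2:-2] mirrors the slice used above
    hm = requests.get(f'https://user.bqgso.cc/hm.html?&q={name}', headers=headers).text[2:-2]
    # Round trip 2: fetch the search results; [1:-1] strips the JSON wrapper
    raw = requests.get(f'https://user.bqgso.cc/search.html?&q={name}&hm={hm}', headers=headers).text
    books = json.loads(raw[1:-1])
    for book in books:
        # Rewrite mobile links to the desktop host, as get_novel_info expects
        book['url_list'] = book['url_list'].replace('https://m', 'https://www')
    return books
```
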
@@ -112,7 +159,8 @@ def download_url(chapter_url, file_name):
                     # # Replace every <br> tag with a newline \n
                     chapter_content = chapter_content.replace('  ', '\n ')
                     # chapter_content = chapter_content.replace('<br>', '\n')
-                    content = re.sub(r'(第\d+章|请收藏本站|『点此报错).*$', '', chapter_content, flags=re.MULTILINE)
+                    content = re.sub(r'(第\d+章|请收藏本站|『点此报错).*$', '', chapter_content,
+                                     flags=re.MULTILINE)
                     # print(content)
                     # Write the cleaned result into the chapter txt file
                     f.write(chapter_title + '\n' + content + '\n\n')
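
The `re.sub` above strips boilerplate line by line: with `re.MULTILINE`, `$` matches at each line end, so everything from a marker (`第\d+章`, `请收藏本站`, `『点此报错`) to the end of that line is removed. A quick illustration with made-up text:

```python
import re

sample = '第12章 开端\n正文第一行。\n请收藏本站:https://example.invalid\n正文第二行。\n'
cleaned = re.sub(r'(第\d+章|请收藏本站|『点此报错).*$', '', sample, flags=re.MULTILINE)
print(cleaned)
# The two body lines survive; the chapter heading and site notice become empty lines.
```
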
@@ -121,13 +169,103 @@ def download_url(chapter_url, file_name):
                 # return True
             except Exception as e:
                 print(e, '\n retry...')
-                time.sleep(interval)
+                time.sleep(self.interval)
                 retry -= 1
                 # return False

+    def multi_thread_download(self):
+        self.threads = []
+        for file_name, chapter_url in enumerate(self.chapter_urls):
+            args = (chapter_url, str(file_name) + '.txt')
+            thread = threading.Thread(target=self.download_url, args=args)
+            self.threads.append(thread)
+            thread.start()
+
+        for thread in self.threads:
+            thread.join()
+
+    def download_process(self):
+        # Cap concurrency at 4 threads; 4 is recommended, more threads tend to cause more errors
+        max_concurrent_threads = 4
+        # Create a Semaphore initialised to max_concurrent_threads
+        self.semaphore = threading.Semaphore(max_concurrent_threads)
+
+        self.multi_thread_download()
+        time.sleep(self.interval)
+        file = 0
+        # Check whether every chapter was downloaded successfully
+        for i in range(0, len(self.chapter_urls)):
+            status = self.get_multi_txt_file_status(str(i) + '.txt')[1]
+            if not status:
+                file += 1
+                break
+        if not file:
+            convert_type = int(input('下载成功!\n请输入要合并的格式:\n0 TxT文件\n1 Epub文件\n'))
+            convert_status = True
+            if convert_type == 0:
+                print(self.file_path, self.download_path + self.title + '.txt')
+                convert_status = self.merge_txt_file(self.download_path + self.title + '.txt')
+            elif convert_type == 1:
+                txt_files = []
+                for n in range(0, len(self.chapter_urls)):
+                    txt_files.append(self.file_path + str(n) + '.txt')
+                # print('txt_files:', txt_files)
+                convert_status = self.merge_txt_to_epub(txt_files, self.download_path + self.title + '.epub')
+
+            if convert_status:
+                print('合并成功!')
+            else:
+                print('合并失败,请删除downloads下面目录后重新运行程序!')
+                exit(1)
+        else:
+            print('部分文件下载失败,限制线程数可以提高下载成功率,是否重新下载个别文件?')
+            download = int(input('0 退出\n1 重试\n'))
+            if download == 0:
+                exit(0)
+            else:
+                self.download_process()
+
+    # Merge the chapters into a single txt file
+    def merge_txt_file(self, merged_file_name=''):
+        """
+        :param merged_file_name: path of the merged output file
+        :returns bool: True on success, False on failure
+        """
+        # os.chdir(file_path)
+        if os.path.exists(merged_file_name):
+            os.remove(merged_file_name)
+        print('merge file : ', sorted(os.listdir(self.file_path), key=lambda x: int(x.split('.')[0])))
+        time.sleep(self.interval)
+        with open(merged_file_name, 'wb') as outfile:
+            try:
+                for filename in sorted(os.listdir(self.file_path), key=lambda x: int(x.split('.')[0])):
+                    print(filename)
+                    if filename.endswith('.txt'):
+                        # Abort the merge if a chapter file is empty
+                        if os.path.exists(self.file_path + '/' + filename) and os.path.getsize(
+                                self.file_path + '/' + filename) > 0:
+                            # print(filename + ' 已存在,跳过...\n')
+                            with open(self.file_path + '/' + filename, 'rb') as infile:
+                                shutil.copyfileobj(infile, outfile)
+                        else:
+                            return False
+            except Exception as e:
+                os.remove(merged_file_name)
+                print(e)
+                return False
+        return True
+
-# Convert the txt files to epub
-def merge_txt_to_epub(txt_files=[], epub_file='', author='', cover='', direction=''):
+    # Convert the txt files to epub
+    def merge_txt_to_epub(self, txt_files, epub_file):
         """
         Convert the txt files into an epub
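
`download_process` above bounds the thread fan-out with a `threading.Semaphore` rather than a pool: every thread starts at once, but only four at a time get past the semaphore into the network I/O. The same pattern in isolation (a sketch; the `sleep` stands in for a chapter download):

```python
import threading
import time

semaphore = threading.Semaphore(4)  # at most 4 workers inside the guarded block

def worker(n):
    with semaphore:          # blocks while 4 downloads are already in flight
        time.sleep(0.1)      # stand-in for the HTTP request and file write
        print(f'chapter {n} done')

threads = [threading.Thread(target=worker, args=(n,)) for n in range(10)]
for t in threads:
    t.start()
for t in threads:
    t.join()
```
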
@@ -135,14 +273,8 @@ def merge_txt_to_epub(txt_files=[], epub_file='', author='', cover='', direction
         epub_file (str): path and name of the epub file that is produced
-        author (str): author name
-        cover (str): cover image URL
-        direction (str): book synopsis
         """
         # Create the EPUB book object
         book = epub.EpubBook()
@@ -150,23 +282,24 @@ def merge_txt_to_epub(txt_files=[], epub_file='', author='', cover='', direction
             os.remove(epub_file)

         # Set the metadata (adjust as needed)
-        book.set_title(title)
+        book.set_title(self.title)
         book.set_language('zh')
-        book.add_author(author)
-        book.set_direction(direction)
+        book.add_author(self.author)
+        book.set_direction(self.intro)

         # Add the cover
         # Fetch the image and turn it into a byte stream
-        response = requests.get(cover)
+        response = requests.get(self.cover)
         stream = io.BytesIO(response.content)
         book.set_cover('cover.jpg', stream.getvalue(), 'image/jpeg')

         print('合并中。。。。。。')
+        # print(txt_files)

         # Book spine
         book_spine = []

         # Walk over all txt files
-        os.chdir(title)
+        # os.chdir(title)
         for i, txt_file in enumerate(txt_files):
             # Read the txt file content
             with open(txt_file, 'r', encoding='utf-8') as file:
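
One caveat in the hunk above: in ebooklib, `set_direction` sets the book's page progression direction, so passing the synopsis (`self.intro`) to it is questionable; `add_metadata('DC', 'description', ...)` is the conventional home for a synopsis. A minimal, self-contained sketch of the ebooklib calls used here (placeholder title, author, and cover file):

```python
from ebooklib import epub

book = epub.EpubBook()
book.set_title('示例小说')                           # placeholder title
book.set_language('zh')
book.add_author('示例作者')                          # placeholder author
book.add_metadata('DC', 'description', '一段简介')   # synopsis as Dublin Core metadata

with open('cover.jpg', 'rb') as f:                   # assumes a local cover image
    book.set_cover('cover.jpg', f.read())

chapter = epub.EpubHtml(title='第1章', file_name='text/0.xhtml')
chapter.content = '<h1>第1章</h1><p>正文……</p>'
book.add_item(chapter)

book.add_item(epub.EpubNcx())                        # legacy table of contents
book.add_item(epub.EpubNav())                        # EPUB3 navigation document
book.toc = [chapter]
book.spine = ['nav', chapter]
epub.write_epub('demo.epub', book, {})
```
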
@@ -184,7 +317,7 @@ def merge_txt_to_epub(txt_files=[], epub_file='', author='', cover='', direction
                 content[0] = f""" <div class="calibre2" id="calibre_pb_0"></div><h1 class="kindle-cn-heading" id="calibre_pb_1">
                 {content[0]} </h1> """
                 for j, line in enumerate(content[1:]):
-                    content[j + 1] = '<p class="calibre3">' + line + '</p>'
+                    content[j + 1] = '<p class="calibre3">' + line + '</p>\n'
                 # content.append('</body></html>')
             except IndexError as e:
                 print(e)
@@ -194,9 +327,9 @@ def merge_txt_to_epub(txt_files=[], epub_file='', author='', cover='', direction
             chapter = epub.EpubHtml(title=chapter_title, file_name='text/' + str(i) + '.xhtml')
             chapter.content = ''.join(content)  # Use the whole file content as the chapter body
             # Attach the css files to this individual chapter
-            page_style = open('../css/page_styles.css', 'r', encoding='utf-8').read()
-            page_style1 = open('../css/page_styles1.css', 'r', encoding='utf-8').read()
-            style = open('../css/stylesheet.css', 'r', encoding='utf-8').read()
+            page_style = open('./css/page_styles.css', 'r', encoding='utf-8').read()
+            page_style1 = open('./css/page_styles1.css', 'r', encoding='utf-8').read()
+            style = open('./css/stylesheet.css', 'r', encoding='utf-8').read()
             chapter.add_item(
                 epub.EpubItem(uid="page_style", file_name="../style/page_styles.css", media_type="text/css",
                               content=page_style))
@@ -219,13 +352,13 @@ def merge_txt_to_epub(txt_files=[], epub_file='', author='', cover='', direction
         book.toc = book_spine

         # Set the book-level style files
-        os.chdir('../')
         page_style = open('./css/page_styles.css', 'r', encoding='utf-8').read()
         page_style1 = open('./css/page_styles1.css', 'r', encoding='utf-8').read()
         style = open('./css/stylesheet.css', 'r', encoding='utf-8').read()
         book.add_item(
-            epub.EpubItem(uid="page_style", file_name="style/page_styles.css", media_type="text/css", content=page_style))
+            epub.EpubItem(uid="page_style", file_name="style/page_styles.css", media_type="text/css",
+                          content=page_style))
         book.add_item(
             epub.EpubItem(uid="page_style1", file_name="style/page_styles1.css", media_type="text/css",
                           content=page_style1))
@@ -233,72 +366,46 @@
             epub.EpubItem(uid="style_default", file_name="style/stylesheet.css", media_type="text/css", content=style))

         # Package the EPUB file
-    epub.write_epub('./' + epub_file, book, {})
+        epub.write_epub(epub_file, book, {})
         return True

+    def __init__(self, name):
+        self.file_path = None
+        self.chapter_urls = None
+        self.cover = None
+        self.intro = None
+        self.status = None
+        self.author = None
+        self.title = None
+        self.name = name
+
+        # Request interval between pages (seconds)
+        self.interval = 2
+
+        # Homepage URL of the novel to crawl
+        # self.novel_url = 'https://www.bqg221.com/xs/'
+
-
-# Merge the chapters into a single txt file
-def merge_txt_file(file_path='', merged_file_name=''):
-    """
-    :param file_path: directory holding the txt files
-    :param merged_file_name: path of the merged output file
-    :returns bool: True on success, False on failure
-    """
-    os.chdir(file_path)
-    if os.path.exists(merged_file_name):
-        os.rmdir(merged_file_name)
-    print('merge file : ', sorted(os.listdir('.'), key=lambda x: int(x.split('.')[0])))
-    with open(merged_file_name, 'wb') as outfile:
-        for filename in sorted(os.listdir('.'), key=lambda x: int(x.split('.')[0])):
-            print(filename)
-            if filename.endswith('.txt'):
-                # Skip empty files
-                if os.path.exists(filename) and os.path.getsize(filename) > 0:
-                    # print(filename + ' 已存在,跳过...\n')
-                    with open(filename, 'rb') as infile:
-                        shutil.copyfileobj(infile, outfile)
-                else:
-                    return False
-    return True
-
-
-def multi_thread_download():
-    threads = []
-    for file_name, chapter_url in enumerate(chapter_urls):
-        args = (chapter_url, str(file_name) + '.txt')
-        thread = threading.Thread(target=download_url, args=args)
-        threads.append(thread)
-        thread.start()
-
-    for thread in threads:
-        thread.join()
-
-
-# Cap concurrency at 4 threads; 4 is recommended, more threads tend to cause more errors
-max_concurrent_threads = 4
-# Create a Semaphore initialised to max_concurrent_threads
-semaphore = threading.Semaphore(max_concurrent_threads)
-
-multi_thread_download()
-time.sleep(interval)
-
-while True:
-    # merge_txt_file('./' + title, '../' + title + '.txt')
-    # Merge the downloaded files
-    txt_files = sorted(os.listdir(title), key=lambda x: int(x.split('.')[0]))
-    epub_file_path = title + '.epub'
-    result = merge_txt_to_epub(txt_files, epub_file_path, author, cover, intro)
-    if not result:
-        print('下载失败:', result, '\t是否重试?')
-        num = int(input('0 重试\n1 退出\n'))
-        if num == 0:
-            multi_thread_download()
-            merge_txt_to_epub(txt_files, epub_file_path, author, cover, intro)
-        else:
-            break
-    else:
-        print('合并成功!')
-        break
+
+if __name__ == '__main__':
+    search_name = input('请输入要搜索的书籍名称: ')
+    if search_name:
+        download_novel = Download_Novel(search_name)
+        response = download_novel.search_novel()
+        print(response)
+        print('搜索到 ' + str(len(response)) + ' 个结果\n')
+        print('---------------------------------------\n')
+        for i, book in enumerate(response):
+            print(str(i) + ' 书籍名称:' + book['articlename'] + '\n作者:' + book['author'] + '\n简介:' + book[
+                'intro'] + '...\n')
+            print('---------------------------------------')
+        print('---------------------------------------\n')
+        select_book = int(input(f'选择要下载的书籍序号(从0-{str(len(response) - 1)}中选择)'))
+        # Validate the input
+        if 0 <= select_book < len(response):
+            download_novel.get_novel_info(response[select_book])
+            download_novel.download_process()
+        else:
+            print('输入内容不合法!')
+    else:
+        exit(0)
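
Two details in `merge_txt_file` are worth keeping in mind: chapter files are ordered by their integer filename prefix (lexicographic order would put `10.txt` before `2.txt`), and they are concatenated with `shutil.copyfileobj`, which streams in chunks instead of loading whole files into memory. The core of the merge, reduced to a standalone sketch:

```python
import os
import shutil

def merge_txt(src_dir, out_path):
    # Sort on the integer prefix: lexicographic order would put '10.txt' before '2.txt'
    names = sorted((n for n in os.listdir(src_dir) if n.endswith('.txt')),
                   key=lambda x: int(x.split('.')[0]))
    with open(out_path, 'wb') as outfile:
        for name in names:
            with open(os.path.join(src_dir, name), 'rb') as infile:
                shutil.copyfileobj(infile, outfile)  # streamed copy, no full read into memory
```
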

README.md (new file)

@@ -0,0 +1,9 @@
+## Novel_Download
+
+Fetches novel content from `https://www.bqg221.com` and merges the crawled chapters into txt or epub format.
+
+Implemented features:
+- [x] Download chapters as txt files
+- [x] Merge the txt files into one
+- [x] Merge into an epub
+- [x] Search for novels
+- [ ] Support switching between multiple book sources
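
A minimal programmatic session, assuming the entry script is saved as `main.py` (the actual filename is not shown in this commit):

```python
from main import Download_Novel  # hypothetical module name

novel = Download_Novel('深空彼岸')   # book name to search for
results = novel.search_novel()       # list of dicts: articlename, author, intro, url_list, url_img
novel.get_novel_info(results[0])     # scrape metadata and the chapter list of the first hit
novel.download_process()             # download chapters, then merge to txt or epub interactively
```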

(css stylesheet)

@@ -2,7 +2,15 @@
     margin-bottom: 5pt;
     margin-top: 5pt
 }
-@font-face {
-    font-family: yinbiao;
-    src: url(styles/XXXXXXXXXXXXXXXX)
-}
+html {
+    overflow: hidden;
+}
+
+body {
+    height: 100%;
+    overflow: auto;
+    -webkit-column-width: 100%;
+    -moz-column-width: 100%;
+    column-width: 100%;
+    page-break-inside: avoid;
+}

pdm.lock

@@ -46,6 +46,12 @@ dependencies = [
     "urllib3<3,>=1.21.1",
 ]

+[[package]]
+name = "setuptools"
+version = "68.0.0"
+requires_python = ">=3.7"
+summary = "Easily download, build, install, upgrade, and uninstall Python packages"
+
 [[package]]
 name = "six"
 version = "1.16.0"
@@ -62,7 +68,7 @@ summary = "HTTP library with thread-safe connection pooling, file post, and more
 lock_version = "4.2"
 cross_platform = true
 groups = ["default"]
-content_hash = "sha256:2fff54024aa639561573351b1841dd1f0084ca345b52d35859bfae33744350e8"
+content_hash = "sha256:095da8eef1987e0630ebddf4a4513ad826ad8630af33554248065ff297544cb6"

 [metadata.files]
 "certifi 2023.5.7" = [
@@ -236,6 +242,10 @@ content_hash = "sha256:2fff54024aa639561573351b1841dd1f0084ca345b52d35859bfae337
     {url = "https://files.pythonhosted.org/packages/70/8e/0e2d847013cb52cd35b38c009bb167a1a26b2ce6cd6965bf26b47bc0bf44/requests-2.31.0-py3-none-any.whl", hash = "sha256:58cd2187c01e70e6e26505bca751777aa9f2ee0b7f4300988b709f44e013003f"},
     {url = "https://files.pythonhosted.org/packages/9d/be/10918a2eac4ae9f02f6cfe6414b7a155ccd8f7f9d4380d62fd5b955065c3/requests-2.31.0.tar.gz", hash = "sha256:942c5a758f98d790eaed1a29cb6eefc7ffb0d1cf7af05c3d2791656dbd6ad1e1"},
 ]
+"setuptools 68.0.0" = [
+    {url = "https://files.pythonhosted.org/packages/c7/42/be1c7bbdd83e1bfb160c94b9cafd8e25efc7400346cf7ccdbdb452c467fa/setuptools-68.0.0-py3-none-any.whl", hash = "sha256:11e52c67415a381d10d6b462ced9cfb97066179f0e871399e006c4ab101fc85f"},
+    {url = "https://files.pythonhosted.org/packages/dc/98/5f896af066c128669229ff1aa81553ac14cfb3e5e74b6b44594132b8540e/setuptools-68.0.0.tar.gz", hash = "sha256:baf1fdb41c6da4cd2eae722e135500da913332ab3f2f5c7d33af9b492acb5235"},
+]
 "six 1.16.0" = [
     {url = "https://files.pythonhosted.org/packages/71/39/171f1c67cd00715f190ba0b100d606d440a28c93c7714febeca8b79af85e/six-1.16.0.tar.gz", hash = "sha256:1e61c37477a1626458e36f7b1d82aa5c9b094fa4802892072e49de9c60c4c926"},
     {url = "https://files.pythonhosted.org/packages/d9/5a/e7c31adbe875f2abbb91bd84cf2dc52d792b5a01506781dbcf25c91daf11/six-1.16.0-py2.py3-none-any.whl", hash = "sha256:8abb2f1d86890a2dfb989f9a77cfcfd3e47c2a354b01111771326f8aa26e0254"},

pyproject.toml

@@ -9,6 +9,7 @@ authors = [
 dependencies = [
     "requests>=2.31.0",
     "ebooklib>=0.18",
+    "setuptools>=68.0.0",
 ]
 requires-python = ">=3.11"
 license = {text = "MIT"}