实现搜索功能

2023-06-21 15:44:42 +08:00 · 2023-06-21 15:44:42 +08:00 · fd3bd8656e
commit fd3bd8656e
parent 9d75014e58
6 changed files with 368 additions and 235 deletions
--- a/.gitignore
+++ b/.gitignore
@ -161,6 +161,4 @@ cython_debug/
 .pdm-python
 /test_get_txt.py
 /test_tmp.py
-/深空彼岸/
-/深空彼岸.epub
-/斗破苍穹.epub
+test_search.py
--- a/Download_Novel.py
+++ b/Download_Novel.py
@ -1,14 +1,14 @@
 import io
+import json
 import os
 import random
+import re
 import shutil
 import threading
 import time

-import requests
 from bs4 import BeautifulSoup
-import re
-
+import requests
 from ebooklib import epub


@ -37,63 +37,110 @@ def get_user_agent():
    return {'User-Agent': user_agent}


-# 定义请求间隔时间（秒）
-interval = 2
+class Download_Novel:

-# 设置请求头，模拟浏览器访问
+    def search_novel(self):
+        hm_url = f'https://user.bqgso.cc/hm.html?&q={self.name}'
+        result = requests.get(hm_url, headers=get_user_agent()).text
+        # print(result)
+        hm = result[2:-2]
+        # print(hm)
+        # 发起请求并获取响应
+        url = f'https://user.bqgso.cc/search.html?&q={self.name}&hm={hm}'

-# 要爬取的小说主页链接
-url = 'https://www.bqg221.com/xs/17931/'
+        response = json.loads(requests.get(url, headers=get_user_agent()).text[1:-1])
+        # print(type(response))
+        for i, book in enumerate(response):
+            # i['url_list'][:9] = 'https://www'
+            trans_url = book['url_list'].replace('https://m', 'https://www')
+            response[i]['url_list'] = trans_url

-# 发起请求并获取响应
-response = requests.get(url, headers=get_user_agent())
+        # 返回一个json对象
+        return response

-# 将响应转换为BeautifulSoup对象
-soup = BeautifulSoup(response.text, 'html.parser')
+    def get_novel_info(self, response):

-# 获取小说名字
-title = soup.select_one('.book h1').get_text(strip=True)
-print(title)
+        # 定义请求间隔时间（秒）
+        interval = 2

-# 获取小说简介
-# print(soup.select('.small')[0])
-div_tag = soup.find('div', {'class': 'small'})
-# print(div_tag)
-all_span_tags = div_tag.find_all('span')
-# print(all_span_tags)
-author = all_span_tags[0].text.strip()[3:]
-status = all_span_tags[1].text.strip()
-update_time = all_span_tags[2].text.strip()
-latest_update = all_span_tags[3].text.strip()
-# for i in all_span_tags:
-#     print(i.text.strip())
-intro = soup.select_one('.intro').get_text(strip=True)[:-6]
-print(intro)
+        # 设置请求头，模拟浏览器访问

-cover = soup.select_one('.cover img')['src']
-# print(cover)
-# 获取小说所有章节链接
-chapter_urls = [url + i.get('href').split('/')[-1] for i in soup.select('.listmain a') if
+        # 要爬取的小说主页链接
+        url = response['url_list']
+
+        # 发起请求并获取响应
+        url_response = requests.get(url, headers=get_user_agent())
+
+        # 将响应转换为BeautifulSoup对象
+        soup = BeautifulSoup(url_response.text, 'html.parser')
+
+        # 获取小说名字
+        # title = soup.select_one('.book h1').get_text(strip=True)
+        self.title = response['articlename']
+        print(self.title)
+
+        # 获取小说简介
+        # print(soup.select('.small')[0])
+        div_tag = soup.find('div', {'class': 'small'})
+        # print(div_tag)
+        all_span_tags = div_tag.find_all('span')
+        # print(all_span_tags)
+        # author = all_span_tags[0].text.strip()[3:]
+        self.author = response['author']
+        self.status = all_span_tags[1].text.strip()
+        self.update_time = all_span_tags[2].text.strip()
+        self.latest_update = all_span_tags[3].text.strip()
+        # for i in all_span_tags:
+        #     print(i.text.strip())
+        self.intro = soup.select_one('.intro').get_text(strip=True)[:-6]
+        print(self.intro)
+
+        # cover = soup.select_one('.cover img')['src']
+        self.cover = response['url_img']
+        # print(cover)
+        # 获取小说所有章节链接
+        self.chapter_urls = [url + i.get('href').split('/')[-1] for i in soup.select('.listmain a') if
                             i.get('href').split('/')[-1] != 'javascript:dd_show()']
-# print(chapter_urls)
+        # print(chapter_urls)

-print('开始下载。。。')
-# 停顿两秒
-time.sleep(interval)
+        print('开始下载。。。')

+        # 获取当前文件所在目录路径
+        dir_path = os.path.dirname(os.path.abspath(__file__))

-# 多线程下载txt
-def download_url(chapter_url, file_name):
-    # 限制下载线程数
-    with semaphore:
-        file_path = './' + title
-        file_name = file_path + '/' + file_name
+        self.download_path = dir_path + '/downloads/'
+        self.file_path = self.download_path + self.title + '/'
+        if not os.path.exists(self.file_path):
+            os.makedirs(self.file_path)

-        if not os.path.exists(file_path):
-            os.makedirs(file_path)
-            print('文件夹不存在，创建文件夹')
+        # 停顿两秒
+        time.sleep(self.interval)
+
+    def get_multi_txt_file_status(self, file_name):
+        """Report whether a chapter file has already been downloaded.
+
+        :param file_name: chapter file name, relative to ``self.file_path``
+        :returns tuple: ``(path, downloaded)`` where ``path`` is
+            ``self.file_path + file_name`` and ``downloaded`` is True only
+            when that file exists and is not empty
+        """
+        file_name = self.file_path + file_name
+
+        # Pure query, no printing: download_url already reports skipped files
+        # itself, and download_process calls this once per chapter while
+        # verifying the download, where a "skipping" message would be wrong.
+        downloaded = os.path.exists(file_name) and os.path.getsize(file_name) > 0
+        return file_name, downloaded
+
+    def download_url(self, chapter_url, file_name):
+        # 限制下载线程数
+        with self.semaphore:
+            # 获取当前文件所在目录路径
+            # dir_path = os.path.dirname(os.path.abspath(__file__))
+            #
+            # file_path = dir_path + '/downloads/' + self.title
+            # file_name = file_path + '/' + file_name
+            #
+            # if not os.path.exists(file_path):
+            #     os.makedirs(file_path)
+            #     # print('文件夹不存在，创建文件夹')
+            file_name, status = self.get_multi_txt_file_status(file_name=file_name)
+
+            if status:
                print(file_name + ' 已存在，跳过...\n')
                # success_account =+ 1
            else:
@ -112,7 +159,8 @@ def download_url(chapter_url, file_name):
                            # # 将所有的<br>标签替换成换行符\n
                            chapter_content = chapter_content.replace('　　', '\n ')
                            # chapter_content = chapter_content.replace('<br>', '\n')
-                        content = re.sub(r'(第\d+章|请收藏本站|『点此报错).*$', '', chapter_content, flags=re.MULTILINE)
+                            content = re.sub(r'(第\d+章|请收藏本站|『点此报错).*$', '', chapter_content,
+                                             flags=re.MULTILINE)
                            # print(content)
                            # 将处理后的结果写入到test.txt文件中
                            f.write(chapter_title + '\n' + content + '\n\n')
@ -121,13 +169,103 @@ def download_url(chapter_url, file_name):
                            # return True
                        except Exception as e:
                            print(e, '\n retry...')
-                        time.sleep(interval)
+                            time.sleep(self.interval)
                            retry -= 1
                # return False

+    def multi_thread_download(self):
+        self.threads = []
+        for file_name, chapter_url in enumerate(self.chapter_urls):
+            args = (chapter_url, str(file_name) + '.txt')
+            thread = threading.Thread(target=self.download_url, args=args)
+            self.threads.append(thread)
+            thread.start()

-# txt转换为epub
-def merge_txt_to_epub(txt_files=[], epub_file='', author='', cover='', direction=''):
+        for thread in self.threads:
+            thread.join()
+
+    def download_process(self):
+        """Download every chapter, verify the result, then merge into one file.
+
+        Runs ``multi_thread_download`` until every ``<index>.txt`` chapter file
+        is present and non-empty (asking the user after each failed attempt
+        whether to retry), then asks for the output format and calls
+        ``merge_txt_file`` or ``merge_txt_to_epub``.
+
+        :raises SystemExit: with code 0 when the user declines to retry,
+            with code 1 when merging fails
+        """
+        # Cap at 4 concurrent downloads; more threads lead to more failures.
+        max_concurrent_threads = 4
+
+        # Retry in a loop rather than by recursion so repeated retries cannot
+        # grow the call stack.
+        while True:
+            # Semaphore initialised to the thread cap; download_url acquires it.
+            self.semaphore = threading.Semaphore(max_concurrent_threads)
+
+            self.multi_thread_download()
+
+            time.sleep(self.interval)
+
+            # The download succeeded only if every chapter file is non-empty.
+            all_downloaded = all(
+                self.get_multi_txt_file_status(str(i) + '.txt')[1]
+                for i in range(len(self.chapter_urls))
+            )
+            if all_downloaded:
+                break
+
+            print('部分文件下载失败，限制线程数可以提高下载成功率，是否重新下载个别文件？')
+            # input() returns a string, so compare with '0', not the int 0.
+            if input('0 退出\n1 重试\n').strip() == '0':
+                raise SystemExit(0)
+
+        # Ask again on anything other than 0 or 1 instead of crashing on
+        # non-numeric input or reporting success without merging.
+        while True:
+            convert_type = input('下载成功！\n请输入要合并的格式：\n0 TxT文件\n1 Epub文件\n').strip()
+            if convert_type in ('0', '1'):
+                break
+            print('输入内容不合法！')
+
+        if convert_type == '0':
+            merged_txt = self.download_path + self.title + '.txt'
+            print(self.file_path, merged_txt)
+            convert_status = self.merge_txt_file(merged_txt)
+        else:
+            txt_files = [self.file_path + str(n) + '.txt' for n in range(len(self.chapter_urls))]
+            convert_status = self.merge_txt_to_epub(txt_files, self.download_path + self.title + '.epub')
+
+        if convert_status:
+            print('合并成功！')
+        else:
+            print('合并失败！请删除downloads下面目录后重新运行程序！')
+            raise SystemExit(1)
+
+    # Merge the chapter files into a single txt file
+    def merge_txt_file(self, merged_file_name=''):
+        """Concatenate the numbered chapter files into one txt file.
+
+        Chapter files are the ``<number>.txt`` entries of ``self.file_path``,
+        merged in ascending numeric order. Any existing file at
+        ``merged_file_name`` is replaced.
+
+        :param merged_file_name: path of the merged output file
+        :returns bool: True on success; False if a chapter file is empty or an
+            I/O error occurs, in which case no partial output file is left
+        """
+        if os.path.exists(merged_file_name):
+            os.remove(merged_file_name)
+
+        # Keep only "<number>.txt" entries so a stray file in the directory
+        # cannot make the numeric sort key raise ValueError.
+        chapter_files = sorted(
+            (name for name in os.listdir(self.file_path)
+             if name.endswith('.txt') and name[:-4].isdecimal()),
+            key=lambda name: int(name[:-4]),
+        )
+        print('merge file : ', chapter_files)
+        time.sleep(self.interval)
+
+        merged_ok = True
+        try:
+            with open(merged_file_name, 'wb') as outfile:
+                for filename in chapter_files:
+                    print(filename)
+                    chapter_path = os.path.join(self.file_path, filename)
+                    # An empty chapter means its download failed: abort.
+                    if os.path.getsize(chapter_path) == 0:
+                        merged_ok = False
+                        break
+                    with open(chapter_path, 'rb') as infile:
+                        shutil.copyfileobj(infile, outfile)
+        except OSError as e:
+            print(e)
+            merged_ok = False
+
+        # Remove a partial result only after the output handle is closed;
+        # deleting a file that is still open fails on Windows.
+        if not merged_ok and os.path.exists(merged_file_name):
+            os.remove(merged_file_name)
+        return merged_ok
+
+    # txt转换为epub
+    def merge_txt_to_epub(self, txt_files, epub_file):
        """
        将txt转换为epub

@ -135,14 +273,8 @@ def merge_txt_to_epub(txt_files=[], epub_file='', author='', cover='', direction

        epub_file (str) ：实际为转换成功的epub文件路径及名称

-    author (str) ：作者
-
-    cover (str) ：封面图片链接
-
-    direction (str) ：书籍简介

        """
-
        # 创建EPUB书籍对象
        book = epub.EpubBook()

@ -150,23 +282,24 @@ def merge_txt_to_epub(txt_files=[], epub_file='', author='', cover='', direction
            os.remove(epub_file)

        # 设置元数据（可根据需要进行调整）
-    book.set_title(title)
+        book.set_title(self.title)
        book.set_language('zh')
-    book.add_author(author)
-    book.set_direction(direction)
+        book.add_author(self.author)
+        book.set_direction(self.intro)
        # 添加封面
        # 获取图片并将其转换为字节流
-    response = requests.get(cover)
+        response = requests.get(self.cover)
        stream = io.BytesIO(response.content)

        book.set_cover('cover.jpg', stream.getvalue(), 'image/jpeg')

        print('合并中。。。。。。')
+        # print(txt_files)

        # 书籍目录
        book_spine = []
        # 遍历所有txt文件
-    os.chdir(title)
+        # os.chdir(title)
        for i, txt_file in enumerate(txt_files):
            # 读取txt文件内容
            with open(txt_file, 'r', encoding='utf-8') as file:
@ -184,7 +317,7 @@ def merge_txt_to_epub(txt_files=[], epub_file='', author='', cover='', direction
                content[0] = f""" <div class="calibre2" id="calibre_pb_0"></div><h1 class="kindle-cn-heading" id="calibre_pb_1">
                             {content[0]} </h1> """
                for j, line in enumerate(content[1:]):
-                content[j + 1] = '<p class="calibre3">' + line + '</p>'
+                    content[j + 1] = '<p class="calibre3">' + line + '</p>\n'
                # content.append('</body></html>')
            except IndexError as e:
                print(e)
@ -194,9 +327,9 @@ def merge_txt_to_epub(txt_files=[], epub_file='', author='', cover='', direction
            chapter = epub.EpubHtml(title=chapter_title, file_name='text/' + str(i) + '.xhtml')
            chapter.content = ''.join(content)  # 将整个文件内容作为章节内容
            # 下面的是将css文件引用到单个章节里面
-        page_style = open('../css/page_styles.css', 'r', encoding='utf-8').read()
-        page_style1 = open('../css/page_styles1.css', 'r', encoding='utf-8').read()
-        style = open('../css/stylesheet.css', 'r', encoding='utf-8').read()
+            page_style = open('./css/page_styles.css', 'r', encoding='utf-8').read()
+            page_style1 = open('./css/page_styles1.css', 'r', encoding='utf-8').read()
+            style = open('./css/stylesheet.css', 'r', encoding='utf-8').read()
            chapter.add_item(
                epub.EpubItem(uid="page_style", file_name="../style/page_styles.css", media_type="text/css",
                              content=page_style))
@ -219,13 +352,13 @@ def merge_txt_to_epub(txt_files=[], epub_file='', author='', cover='', direction
        book.toc = book_spine

        # 设置书籍的样式文件
-    os.chdir('../')
        page_style = open('./css/page_styles.css', 'r', encoding='utf-8').read()
        page_style1 = open('./css/page_styles1.css', 'r', encoding='utf-8').read()

        style = open('./css/stylesheet.css', 'r', encoding='utf-8').read()
        book.add_item(
-        epub.EpubItem(uid="page_style", file_name="style/page_styles.css", media_type="text/css", content=page_style))
+            epub.EpubItem(uid="page_style", file_name="style/page_styles.css", media_type="text/css",
+                          content=page_style))
        book.add_item(
            epub.EpubItem(uid="page_style1", file_name="style/page_styles1.css", media_type="text/css",
                          content=page_style1))
@ -233,72 +366,46 @@ def merge_txt_to_epub(txt_files=[], epub_file='', author='', cover='', direction
            epub.EpubItem(uid="style_default", file_name="style/stylesheet.css", media_type="text/css", content=style))

        # 打包EPUB文件
-    epub.write_epub('./' + epub_file, book, {})
+        epub.write_epub(epub_file, book, {})
        return True

+    def __init__(self, name):
+        self.file_path = None
+        self.chapter_urls = None
+        self.cover = None
+        self.intro = None
+        self.status = None
+        self.author = None
+        self.title = None
+        self.name = name

-# 合并为txt文件
-def merge_txt_file(file_path='', merged_file_name=''):
-    """
+        # 定义请求间隔时间（秒）
+        self.interval = 2

-    :param file_path: txt文件的保存位置
-    :param merged_file_name: 合并后文件保存位置
-    :returns bool: 返回合并成功或者失败状态
-    """
-    os.chdir(file_path)
-    if os.path.exists(merged_file_name):
-        os.rmdir(merged_file_name)
-    print('merge file : ', sorted(os.listdir('.'), key=lambda x: int(x.split('.')[0])))
-    with open(merged_file_name, 'wb') as outfile:
-        for filename in sorted(os.listdir('.'), key=lambda x: int(x.split('.')[0])):
-            print(filename)
-            if filename.endswith('.txt'):
-                # 判断文件是否为空
-                if os.path.exists(filename) and os.path.getsize(filename) > 0:
-                    # print(filename + ' 已存在，跳过...\n')
-                    with open(filename, 'rb') as infile:
-                        shutil.copyfileobj(infile, outfile)
+        # 要爬取的小说主页链接
+        # self.novel_url = 'https://www.bqg221.com/xs/'
+
+
+if __name__ == '__main__':
+    search_name = input('请输入要搜索的书籍名称： ')
+    if search_name:
+        download_novel = Download_Novel(search_name)
+        response = download_novel.search_novel()
+        print(response)
+        print('搜索到 ' + str(len(response)) + ' 个结果\n')
+        print('---------------------------------------\n')
+        for i, book in enumerate(response):
+            print(str(i) + ' 书籍名称：' + book['articlename'] + '\n作者：' + book['author'] + '\n简介：' + book[
+                'intro'] + '...\n')
+            print('---------------------------------------')
+            print('---------------------------------------\n')
+        select_book = int(input(f'选择要下载的书籍序号(从0-{str(len(response) - 1)}中选择)：'))
+        # 判断输入合法
+        if isinstance(select_book, int) and 0 <= select_book <= len(response):
+            download_novel.get_novel_info(response[select_book])
+            download_novel.download_process()
+        else:
+            print('输入内容不合法！')

    else:
-                    return False
-    return True
-
-
-def multi_thread_download():
-    threads = []
-    for file_name, chapter_url in enumerate(chapter_urls):
-        args = (chapter_url, str(file_name) + '.txt')
-        thread = threading.Thread(target=download_url, args=args)
-        threads.append(thread)
-        thread.start()
-
-    for thread in threads:
-        thread.join()
-
-
-# 限制同时4线程，建议使用4线程，过多线程会导致错误增多
-max_concurrent_threads = 4
-# 创建Semaphore对象，并将其初始值设置为max_concurrent_threads
-semaphore = threading.Semaphore(max_concurrent_threads)
-
-multi_thread_download()
-
-time.sleep(interval)
-
-while True:
-    # merge_txt_file('./' + title, '../' + title + '.txt')
-    # 调用函数进行合并
-    txt_files = sorted(os.listdir(title), key=lambda x: int(x.split('.')[0]))
-    epub_file_path = title + '.epub'
-    result = merge_txt_to_epub(txt_files, epub_file_path, author, cover, intro)
-    if not result:
-        print('下载失败：', result, '\t是否重试？')
-        num = int(input('0 重试\n1 退出\n'))
-        if num == 0:
-            multi_thread_download()
-            merge_txt_to_epub(txt_files, epub_file_path, author, cover, intro)
-        else:
-            break
-    else:
-        print('合并成功！')
-        break
+        exit(0)
--- a/README.md
+++ b/README.md
@ -0,0 +1,9 @@
+## Novel_Download
+从 `https://www.bqg221.com` 获取小说内容，将爬取的内容合并为txt或者epub格式。
+
+- 现已实现的功能
+- [x] 实现下载txt文档
+- [x] 实现合并txt
+- [x] 实现合并为epub
+- [x] 实现搜索功能
+- [ ] 实现多个书源替换
--- a/css/page_styles1.css
+++ b/css/page_styles1.css
@ -2,7 +2,15 @@
    margin-bottom: 5pt;
    margin-top: 5pt
    }
-@font-face {
-    font-family: yinbiao;
-    src: url(styles/XXXXXXXXXXXXXXXX)
-    }
+html {
+    overflow: hidden;
+}
+
+body {
+    height: 100%;
+    overflow: auto;
+    -webkit-column-width: 100%;
+    -moz-column-width: 100%;
+    column-width: 100%;
+    page-break-inside: avoid;
+}
--- a/pdm.lock
+++ b/pdm.lock
@ -46,6 +46,12 @@ dependencies = [
    "urllib3<3,>=1.21.1",
 ]

+[[package]]
+name = "setuptools"
+version = "68.0.0"
+requires_python = ">=3.7"
+summary = "Easily download, build, install, upgrade, and uninstall Python packages"
+
 [[package]]
 name = "six"
 version = "1.16.0"
@ -62,7 +68,7 @@ summary = "HTTP library with thread-safe connection pooling, file post, and more
 lock_version = "4.2"
 cross_platform = true
 groups = ["default"]
-content_hash = "sha256:2fff54024aa639561573351b1841dd1f0084ca345b52d35859bfae33744350e8"
+content_hash = "sha256:095da8eef1987e0630ebddf4a4513ad826ad8630af33554248065ff297544cb6"

 [metadata.files]
 "certifi 2023.5.7" = [
@ -236,6 +242,10 @@ content_hash = "sha256:2fff54024aa639561573351b1841dd1f0084ca345b52d35859bfae337
    {url = "https://files.pythonhosted.org/packages/70/8e/0e2d847013cb52cd35b38c009bb167a1a26b2ce6cd6965bf26b47bc0bf44/requests-2.31.0-py3-none-any.whl", hash = "sha256:58cd2187c01e70e6e26505bca751777aa9f2ee0b7f4300988b709f44e013003f"},
    {url = "https://files.pythonhosted.org/packages/9d/be/10918a2eac4ae9f02f6cfe6414b7a155ccd8f7f9d4380d62fd5b955065c3/requests-2.31.0.tar.gz", hash = "sha256:942c5a758f98d790eaed1a29cb6eefc7ffb0d1cf7af05c3d2791656dbd6ad1e1"},
 ]
+"setuptools 68.0.0" = [
+    {url = "https://files.pythonhosted.org/packages/c7/42/be1c7bbdd83e1bfb160c94b9cafd8e25efc7400346cf7ccdbdb452c467fa/setuptools-68.0.0-py3-none-any.whl", hash = "sha256:11e52c67415a381d10d6b462ced9cfb97066179f0e871399e006c4ab101fc85f"},
+    {url = "https://files.pythonhosted.org/packages/dc/98/5f896af066c128669229ff1aa81553ac14cfb3e5e74b6b44594132b8540e/setuptools-68.0.0.tar.gz", hash = "sha256:baf1fdb41c6da4cd2eae722e135500da913332ab3f2f5c7d33af9b492acb5235"},
+]
 "six 1.16.0" = [
    {url = "https://files.pythonhosted.org/packages/71/39/171f1c67cd00715f190ba0b100d606d440a28c93c7714febeca8b79af85e/six-1.16.0.tar.gz", hash = "sha256:1e61c37477a1626458e36f7b1d82aa5c9b094fa4802892072e49de9c60c4c926"},
    {url = "https://files.pythonhosted.org/packages/d9/5a/e7c31adbe875f2abbb91bd84cf2dc52d792b5a01506781dbcf25c91daf11/six-1.16.0-py2.py3-none-any.whl", hash = "sha256:8abb2f1d86890a2dfb989f9a77cfcfd3e47c2a354b01111771326f8aa26e0254"},
--- a/pyproject.toml
+++ b/pyproject.toml
@ -9,6 +9,7 @@ authors = [
 dependencies = [
    "requests>=2.31.0",
    "ebooklib>=0.18",
+    "setuptools>=68.0.0",
 ]
 requires-python = ">=3.11"
 license = {text = "MIT"}