添加直接输入网页下载功能，搜索功能还有bug没修复

2023-07-18 12:49:14 +08:00 · 2023-07-18 12:49:14 +08:00 · ce7e0042ba
commit ce7e0042ba
parent edd85309db
7 changed files with 143 additions and 54 deletions
--- a/.gitignore
+++ b/.gitignore
@@ -162,3 +162,4 @@ cython_debug/
 /test_get_txt.py
 /test_tmp.py
 test_search.py
+test.py
--- a/.vscode/settings.json
+++ b/.vscode/settings.json
@@ -0,0 +1,6 @@
+{
+    "[python]": {
+        "editor.defaultFormatter": "ms-python.autopep8"
+    },
+    "python.formatting.provider": "none"
+}
--- a/Download_Novel.py
+++ b/Download_Novel.py
@@ -40,33 +40,42 @@ def get_user_agent():
 class Download_Novel:

    def search_novel(self):
-        hm_url = f'https://user.bqgso.cc/hm.html?&q={self.name}'
-        result = requests.get(hm_url, headers=get_user_agent()).text
-        # print(result)
-        hm = result[2:-2]
+        # hm_url = f'https://user.bqgso.cc/hm.html?&q={self.name}'
+        # result = requests.get(hm_url, headers=get_user_agent()).text
+        # # print(result)
+        # hm = result[2:-2]
        # print(hm)
+        if self.name!=None:
        # 发起请求并获取响应
-        url = f'https://user.bqgso.cc/search.html?&q={self.name}&hm={hm}'
+            url = f'https://www.bqg222.com/user/search.html?q={self.name}'
+            # print(url)
+            print(requests.get(
+                url, headers=get_user_agent()).text[1:-1])

-        response = json.loads(requests.get(url, headers=get_user_agent()).text[1:-1])
-        # print(type(response))
-        for i, book in enumerate(response):
-            # i['url_list'][:9] = 'https://www'
-            trans_url = book['url_list'].replace('https://m', 'https://www')
-            response[i]['url_list'] = trans_url
+            response = json.loads(requests.get(
+                url, headers=get_user_agent()).text[1:-1])
+            # print(type(response))
+            for i, book in enumerate(response):
+                # i['url_list'][:9] = 'https://www'
+                # trans_url = book['url_list'].replace('https://m', 'https://www')
+                print(type(book['url_list']))
+                trans_url ='https://www.bqg221.com' + str(book['url_list'])
+                response[i]['url_list'] = trans_url

-        # 返回一个json对象
-        return response
+            # 返回一个json对象
+            return response

-    def get_novel_info(self, response):
+    def get_novel_info(self, response=None):

        # 定义请求间隔时间（秒）
        interval = 2

        # 设置请求头，模拟浏览器访问
-
+        if response!=None:
        # 要爬取的小说主页链接
-        url = response['url_list']
+            url = response['url_list']
+        else:
+            url = self.search_url

        # 发起请求并获取响应
        url_response = requests.get(url, headers=get_user_agent())
@@ -76,7 +85,7 @@ class Download_Novel:

        # 获取小说名字
        # title = soup.select_one('.book h1').get_text(strip=True)
-        self.title = response['articlename']
+        self.title = soup.select_one('.book h1').get_text(strip=True)
        print(self.title)

        # 获取小说简介
@@ -86,7 +95,7 @@ class Download_Novel:
        all_span_tags = div_tag.find_all('span')
        # print(all_span_tags)
        # author = all_span_tags[0].text.strip()[3:]
-        self.author = response['author']
+        self.author = all_span_tags[0].text.strip()[3:]
        self.status = all_span_tags[1].text.strip()
        self.update_time = all_span_tags[2].text.strip()
        self.latest_update = all_span_tags[3].text.strip()
@@ -96,7 +105,7 @@ class Download_Novel:
        print(self.intro)

        # cover = soup.select_one('.cover img')['src']
-        self.cover = response['url_img']
+        self.cover = soup.select_one('.cover img')['src']
        # print(cover)
        # 获取小说所有章节链接
        self.chapter_urls = [url + i.get('href').split('/')[-1] for i in soup.select('.listmain a') if
@@ -138,7 +147,8 @@ class Download_Novel:
            # if not os.path.exists(file_path):
            #     os.makedirs(file_path)
            #     # print('文件夹不存在，创建文件夹')
-            file_name, status = self.get_multi_txt_file_status(file_name=file_name)
+            file_name, status = self.get_multi_txt_file_status(
+                file_name=file_name)

            if status:
                print(file_name + ' 已存在，跳过...\n')
@@ -149,15 +159,19 @@ class Download_Novel:
                    retry = 8
                    while retry > 0:
                        try:
-                            response = requests.get(chapter_url, headers=get_user_agent(), timeout=5)
+                            response = requests.get(
+                                chapter_url, headers=get_user_agent(), timeout=5)
                            soup = BeautifulSoup(response.text, 'html.parser')

-                            chapter_title = soup.select_one('.content h1').get_text()
+                            chapter_title = soup.select_one(
+                                '.content h1').get_text()
                            print(chapter_title)
-                            chapter_content = soup.select_one('div#chaptercontent').get_text().strip()
+                            chapter_content = soup.select_one(
+                                'div#chaptercontent').get_text().strip()
                            # print('before: '+chapter_content)
                            # # 将所有的<br>标签替换成换行符\n
-                            chapter_content = chapter_content.replace('　　', '\n ')
+                            chapter_content = chapter_content.replace(
+                                '　　', '\n ')
                            # chapter_content = chapter_content.replace('<br>', '\n')
                            content = re.sub(r'(第\d+章|请收藏本站|『点此报错).*$', '', chapter_content,
                                             flags=re.MULTILINE)
@@ -216,14 +230,16 @@ class Download_Novel:
            convert_status = True
            if convert_type == 0:
                print(self.file_path, self.download_path + self.title + '.txt')
-                convert_status = self.merge_txt_file(self.download_path + self.title + '.txt')
+                convert_status = self.merge_txt_file(
+                    self.download_path + self.title + '.txt')

            elif convert_type == 1:
                txt_files = []
                for n in range(0, len(self.chapter_urls)):
                    txt_files.append(self.file_path + str(n) + '.txt')

-                convert_status = self.merge_txt_to_epub(txt_files, self.download_path + self.title + '.epub')
+                convert_status = self.merge_txt_to_epub(
+                    txt_files, self.download_path + self.title + '.epub')

            if convert_status:
                print('合并成功！')
@@ -231,8 +247,8 @@ class Download_Novel:
                print('合并失败！请删除downloads下面目录后重新运行程序！')
                exit(1)

-
    # 合并为txt文件
+
    def merge_txt_file(self, merged_file_name=''):
        """

@@ -242,7 +258,8 @@ class Download_Novel:
        # os.chdir(file_path)
        if os.path.exists(merged_file_name):
            os.remove(merged_file_name)
-        print('merge file : ', sorted(os.listdir(self.file_path), key=lambda x: int(x.split('.')[0])))
+        print('merge file : ', sorted(os.listdir(self.file_path),
+              key=lambda x: int(x.split('.')[0])))
        time.sleep(self.interval)

        with open(merged_file_name, 'wb') as outfile:
@@ -306,7 +323,8 @@ class Download_Novel:
                content = file.readlines()
            try:
                # 将所有换行符替换为<br>
-                content = [s.replace('\n', '') for s in content if len(s.strip()) > 0]
+                content = [s.replace('\n', '')
+                           for s in content if len(s.strip()) > 0]

                # 获取章节标题
                chapter_title = content[0]
@@ -319,18 +337,19 @@ class Download_Novel:
                for j, line in enumerate(content[1:]):
                    content[j + 1] = '<p class="calibre3">' + line + '</p>\n'

-
                # content.append('</body></html>')
            except IndexError as e:
                print(e)
                return False
            # 创建一个章节对象

-            chapter = epub.EpubHtml(title=chapter_title, file_name='text/' + str(i) + '.xhtml')
+            chapter = epub.EpubHtml(
+                title=chapter_title, file_name='text/' + str(i) + '.xhtml')
            chapter.content = ''.join(content)  # 将整个文件内容作为章节内容
            # 下面的是将css文件引用到单个章节里面
            # page_style = open('./css/page_styles.css', 'r', encoding='utf-8').read()
-            page_style1 = open('./css/page_styles1.css', 'r', encoding='utf-8').read()
+            page_style1 = open('./css/page_styles1.css',
+                               'r', encoding='utf-8').read()
            style = open('./css/stylesheet.css', 'r', encoding='utf-8').read()
            # chapter.add_item(
            #     epub.EpubItem(uid="page_style", file_name="../style/page_styles.css", media_type="text/css",
@@ -345,7 +364,8 @@ class Download_Novel:
            # 将章节添加到书籍中
            book.add_item(chapter)
            book.spine.append(chapter)
-            book.toc.append(epub.Link('text/' + str(i) + '.xhtml', chapter_title, str(i)))
+            book.toc.append(epub.Link('text/' + str(i) +
+                            '.xhtml', chapter_title, str(i)))
            # print('xxxxxxxx:','text/' + str(i) + '.xhtml', chapter_title, str(i))

        # 将目录添加到书籍中
@@ -354,11 +374,11 @@ class Download_Novel:
        book.add_item(epub.EpubNcx())
        book.add_item(epub.EpubNav())

-
-
        # 设置书籍的样式文件
-        page_style = open('./css/page_styles.css', 'r', encoding='utf-8').read()
-        page_style1 = open('./css/page_styles1.css', 'r', encoding='utf-8').read()
+        page_style = open('./css/page_styles.css',
+                          'r', encoding='utf-8').read()
+        page_style1 = open('./css/page_styles1.css',
+                           'r', encoding='utf-8').read()

        style = open('./css/stylesheet.css', 'r', encoding='utf-8').read()
        book.add_item(
@@ -374,7 +394,7 @@ class Download_Novel:
        epub.write_epub(epub_file, book, {})
        return True

-    def __init__(self, name):
+    def __init__(self, name=None,search_url=None):
        self.file_path = None
        self.chapter_urls = None
        self.cover = None
@@ -383,6 +403,7 @@ class Download_Novel:
        self.author = None
        self.title = None
        self.name = name
+        self.search_url=search_url

        # 定义请求间隔时间（秒）
        self.interval = 2
@@ -392,11 +413,20 @@ class Download_Novel:


 if __name__ == '__main__':
-    search_name = input('请输入要搜索的书籍名称： ')
-    if search_name:
-        download_novel = Download_Novel(search_name)
-        response = download_novel.search_novel()
-        print(response)
+    search_type=input('请选择你要下载的方式(0 or 1)：\n0) 使用名称搜索\n1) 直接输入url(格式如：https://www.bqg221.com/biquge/17931/)\n')
+    
+    # if isinstance(search_type, int) and 0 <= search_type <= 1:
+        # download_novel
+    download_novel = Download_Novel()
+    if search_type == str(0):
+        search_name = input('请输入要搜索的书籍名称： ')
+        download_novel.name=search_name
+    else:
+        download_novel.search_url=search_type
+        
+    response = download_novel.search_novel()
+    # print(response)
+    if download_novel.name!=None:
        print('搜索到 ' + str(len(response)) + ' 个结果\n')
        print('---------------------------------------\n')
        for i, book in enumerate(reversed(response)):
@@ -404,13 +434,25 @@ if __name__ == '__main__':
                'intro'] + '...\n')
            print('---------------------------------------')
            print('---------------------------------------\n')
-        select_book = int(input(f'选择要下载的书籍序号(从0-{str(len(response) - 1)}中选择)：'))
+        select_book = int(
+            input(f'选择要下载的书籍序号(从0-{str(len(response) - 1)}中选择)：'))
        # 判断输入合法
        if isinstance(select_book, int) and 0 <= select_book <= len(response):
            download_novel.get_novel_info(response[select_book])
            download_novel.download_process()
        else:
            print('输入内容不合法！')
-
    else:
-        exit(0)
+        # print('---------------------------------------\n')
+        # # for i, book in enumerate(reversed(response)):
+        # print( ' 书籍名称：' + response['articlename'] + '\n作者：' + response['author'] + '\n简介：' + response[
+        #     'intro'] + '...\n')
+        # print('---------------------------------------')
+        # print('---------------------------------------\n')
+
+        download_novel.get_novel_info()
+        download_novel.download_process()
+    
+
+    # else:
+    #     exit(0)
--- a/README.md
+++ b/README.md
@@ -7,3 +7,4 @@
 - [x] 实现合并为epub
 - [x] 实现搜索功能
 - [ ] 实现多个书源替换
+- [x] 实现直接输入网址下载
--- a/pdm.lock
+++ b/pdm.lock
@@ -1,6 +1,23 @@
 # This file is @generated by PDM.
 # It is not intended for manual editing.

+[[package]]
+name = "beautifulsoup4"
+version = "4.12.2"
+requires_python = ">=3.6.0"
+summary = "Screen-scraping library"
+dependencies = [
+    "soupsieve>1.2",
+]
+
+[[package]]
+name = "bs4"
+version = "0.0.1"
+summary = "Screen-scraping library"
+dependencies = [
+    "beautifulsoup4",
+]
+
 [[package]]
 name = "certifi"
 version = "2023.5.7"
@@ -58,6 +75,12 @@ version = "1.16.0"
 requires_python = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*"
 summary = "Python 2 and 3 compatibility utilities"

+[[package]]
+name = "soupsieve"
+version = "2.4.1"
+requires_python = ">=3.7"
+summary = "A modern CSS selector implementation for Beautiful Soup."
+
 [[package]]
 name = "urllib3"
 version = "2.0.3"
@@ -68,9 +91,16 @@ summary = "HTTP library with thread-safe connection pooling, file post, and more
 lock_version = "4.2"
 cross_platform = true
 groups = ["default"]
-content_hash = "sha256:095da8eef1987e0630ebddf4a4513ad826ad8630af33554248065ff297544cb6"
+content_hash = "sha256:cecd9231f4ed5227cc9f2c8a4225b2e84fc47b083042291cb5e26d8bc24a7199"

 [metadata.files]
+"beautifulsoup4 4.12.2" = [
+    {url = "https://files.pythonhosted.org/packages/57/f4/a69c20ee4f660081a7dedb1ac57f29be9378e04edfcb90c526b923d4bebc/beautifulsoup4-4.12.2-py3-none-any.whl", hash = "sha256:bd2520ca0d9d7d12694a53d44ac482d181b4ec1888909b035a3dbf40d0f57d4a"},
+    {url = "https://files.pythonhosted.org/packages/af/0b/44c39cf3b18a9280950ad63a579ce395dda4c32193ee9da7ff0aed547094/beautifulsoup4-4.12.2.tar.gz", hash = "sha256:492bbc69dca35d12daac71c4db1bfff0c876c00ef4a2ffacce226d4638eb72da"},
+]
+"bs4 0.0.1" = [
+    {url = "https://files.pythonhosted.org/packages/10/ed/7e8b97591f6f456174139ec089c769f89a94a1a4025fe967691de971f314/bs4-0.0.1.tar.gz", hash = "sha256:36ecea1fd7cc5c0c6e4a1ff075df26d50da647b75376626cc186e2212886dd3a"},
+]
 "certifi 2023.5.7" = [
    {url = "https://files.pythonhosted.org/packages/93/71/752f7a4dd4c20d6b12341ed1732368546bc0ca9866139fe812f6009d9ac7/certifi-2023.5.7.tar.gz", hash = "sha256:0f0d56dc5a6ad56fd4ba36484d6cc34451e1c6548c61daad8c320169f91eddc7"},
    {url = "https://files.pythonhosted.org/packages/9d/19/59961b522e6757f0c9097e4493fa906031b95b3ebe9360b2c3083561a6b4/certifi-2023.5.7-py3-none-any.whl", hash = "sha256:c6c2e98f5c7869efca1f8916fed228dd91539f9f1b444c314c06eef02980c716"},
@@ -250,6 +280,10 @@ content_hash = "sha256:095da8eef1987e0630ebddf4a4513ad826ad8630af33554248065ff29
    {url = "https://files.pythonhosted.org/packages/71/39/171f1c67cd00715f190ba0b100d606d440a28c93c7714febeca8b79af85e/six-1.16.0.tar.gz", hash = "sha256:1e61c37477a1626458e36f7b1d82aa5c9b094fa4802892072e49de9c60c4c926"},
    {url = "https://files.pythonhosted.org/packages/d9/5a/e7c31adbe875f2abbb91bd84cf2dc52d792b5a01506781dbcf25c91daf11/six-1.16.0-py2.py3-none-any.whl", hash = "sha256:8abb2f1d86890a2dfb989f9a77cfcfd3e47c2a354b01111771326f8aa26e0254"},
 ]
+"soupsieve 2.4.1" = [
+    {url = "https://files.pythonhosted.org/packages/47/9e/780779233a615777fbdf75a4dee2af7a345f4bf74b42d4a5f836800b9d91/soupsieve-2.4.1.tar.gz", hash = "sha256:89d12b2d5dfcd2c9e8c22326da9d9aa9cb3dfab0a83a024f05704076ee8d35ea"},
+    {url = "https://files.pythonhosted.org/packages/49/37/673d6490efc51ec46d198c75903d99de59baffdd47aea3d071b80a9e4e89/soupsieve-2.4.1-py3-none-any.whl", hash = "sha256:1c1bfee6819544a3447586c889157365a27e10d88cde3ad3da0cf0ddf646feb8"},
+]
 "urllib3 2.0.3" = [
    {url = "https://files.pythonhosted.org/packages/8a/03/ad9306a50d05c166e3456fe810f33cee2b8b2a7a6818ec5d4908c4ec6b36/urllib3-2.0.3-py3-none-any.whl", hash = "sha256:48e7fafa40319d358848e1bc6809b208340fafe2096f1725d05d67443d0483d1"},
    {url = "https://files.pythonhosted.org/packages/d6/af/3b4cfedd46b3addab52e84a71ab26518272c23c77116de3c61ead54af903/urllib3-2.0.3.tar.gz", hash = "sha256:bee28b5e56addb8226c96f7f13ac28cb4c301dd5ea8a6ca179c0b9835e032825"},
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -10,6 +10,7 @@ dependencies = [
    "requests>=2.31.0",
    "ebooklib>=0.18",
    "setuptools>=68.0.0",
+    "bs4>=0.0.1",
 ]
 requires-python = ">=3.11"
 license = {text = "MIT"}
--- a/test_search.py
+++ b/test_search.py
@@ -35,19 +35,23 @@ class Download_Novel:
    def search_novel(self):
        # 定义请求间隔时间（秒）
        interval = 2
-        hm_url = f'https://user.bqgso.cc/hm.html?&q={self.name}'
-        result = requests.get(hm_url, headers=self.get_user_agent()).text
+        # hm_url = f'https://user.bqgso.cc/hm.html?&q={self.name}'
+        # result = requests.get(hm_url, headers=self.get_user_agent()).text
        # print(result)
-        hm = result[2:-2]
+        # hm = result[2:-2]
        # print(hm)
        # 发起请求并获取响应
-        url = f'https://user.bqgso.cc/search.html?&q={self.name}&hm={hm}'
-
+        url = f'https://www.bqg221.com/user/search.html?q={self.name}'
+        print(url)
+        print('响应内容：',requests.get(url, headers=self.get_user_agent()).text)
        response = json.loads(requests.get(url, headers=self.get_user_agent()).text[1:-1])
+        print(response)
        # print(type(response))
        for i, book in enumerate(response):
            # i['url_list'][:9] = 'https://www'
-            trans_url = book['url_list'].replace('https://m', 'https://www')
+            # trans_url = book['url_list'].replace('https://m', 'https://www')
+            print(type(book['url_list']))
+            trans_url ='https://www.bqg221.com' + str(book['url_list'])
            response[i]['url_list'] = trans_url

        # 返回一个json对象