diff --git a/.gitignore b/.gitignore
index bd911c8..5d3607c 100644
--- a/.gitignore
+++ b/.gitignore
@@ -162,3 +162,4 @@ cython_debug/
 /test_get_txt.py
 /test_tmp.py
 test_search.py
+test.py
diff --git a/.vscode/settings.json b/.vscode/settings.json
new file mode 100644
index 0000000..9ee86e7
--- /dev/null
+++ b/.vscode/settings.json
@@ -0,0 +1,6 @@
+{
+    "[python]": {
+        "editor.defaultFormatter": "ms-python.autopep8"
+    },
+    "python.formatting.provider": "none"
+}
\ No newline at end of file
diff --git a/Download_Novel.py b/Download_Novel.py
index a3b8a9b..6d40fd1 100644
--- a/Download_Novel.py
+++ b/Download_Novel.py
@@ -40,33 +40,42 @@ def get_user_agent():
 class Download_Novel:
 
     def search_novel(self):
-        hm_url = f'https://user.bqgso.cc/hm.html?&q={self.name}'
-        result = requests.get(hm_url, headers=get_user_agent()).text
-        # print(result)
-        hm = result[2:-2]
+        # hm_url = f'https://user.bqgso.cc/hm.html?&q={self.name}'
+        # result = requests.get(hm_url, headers=get_user_agent()).text
+        # # print(result)
+        # hm = result[2:-2]
         # print(hm)
+        if self.name != None:
             # 发起请求并获取响应
-        url = f'https://user.bqgso.cc/search.html?&q={self.name}&hm={hm}'
+            url = f'https://www.bqg221.com/user/search.html?q={self.name}'
+            # print(url)
+            print(requests.get(
+                url, headers=get_user_agent()).text[1:-1])
 
-        response = json.loads(requests.get(url, headers=get_user_agent()).text[1:-1])
-        # print(type(response))
-        for i, book in enumerate(response):
-            # i['url_list'][:9] = 'https://www'
-            trans_url = book['url_list'].replace('https://m', 'https://www')
-            response[i]['url_list'] = trans_url
+            response = json.loads(requests.get(
+                url, headers=get_user_agent()).text[1:-1])
+            # print(type(response))
+            for i, book in enumerate(response):
+                # i['url_list'][:9] = 'https://www'
+                # trans_url = book['url_list'].replace('https://m', 'https://www')
+                print(type(book['url_list']))
+                trans_url = 'https://www.bqg221.com' + str(book['url_list'])
+                response[i]['url_list'] = trans_url
 
-        # 返回一个json对象
-        return response
+            # 返回一个json对象
+            return response
 
-    def get_novel_info(self, response):
+    def get_novel_info(self, response=None):
         # 定义请求间隔时间(秒)
         interval = 2
 
         # 设置请求头,模拟浏览器访问
-
+        if response != None:
             # 要爬取的小说主页链接
-        url = response['url_list']
+            url = response['url_list']
+        else:
+            url = self.search_url
 
         # 发起请求并获取响应
         url_response = requests.get(url, headers=get_user_agent())
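Reviewer note: the rewritten `search_novel` assumes the new endpoint returns a JSON array whose `url_list` fields are site-relative paths, which is why the loop now prefixes the site root instead of swapping `https://m` for `https://www`. A minimal sketch of that handling under this assumption (the payload values below are invented for illustration; the real response is not captured in this diff):

```python
import json

# Hypothetical payload shape after trimming the wrapping characters with .text[1:-1].
raw = '[{"url_list": "/biquge/17931/", "articlename": "示例书名", "author": "示例作者", "intro": "……"}]'

response = json.loads(raw)
for i, book in enumerate(response):
    # Same prefixing as the + lines above: relative path -> absolute book URL.
    response[i]['url_list'] = 'https://www.bqg221.com' + str(book['url_list'])

print(response[0]['url_list'])  # -> https://www.bqg221.com/biquge/17931/
```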
@@ -76,7 +85,7 @@ class Download_Novel:
 
         # 获取小说名字
         # title = soup.select_one('.book h1').get_text(strip=True)
-        self.title = response['articlename']
+        self.title = soup.select_one('.book h1').get_text(strip=True)
         print(self.title)
 
         # 获取小说简介
@@ -86,7 +95,7 @@ class Download_Novel:
         all_span_tags = div_tag.find_all('span')
         # print(all_span_tags)
         # author = all_span_tags[0].text.strip()[3:]
-        self.author = response['author']
+        self.author = all_span_tags[0].text.strip()[3:]
         self.status = all_span_tags[1].text.strip()
         self.update_time = all_span_tags[2].text.strip()
         self.latest_update = all_span_tags[3].text.strip()
@@ -96,7 +105,7 @@ class Download_Novel:
         print(self.intro)
 
         # cover = soup.select_one('.cover img')['src']
-        self.cover = response['url_img']
+        self.cover = soup.select_one('.cover img')['src']
         # print(cover)
         # 获取小说所有章节链接
         self.chapter_urls = [url + i.get('href').split('/')[-1] for i in soup.select('.listmain a') if
@@ -138,7 +147,8 @@ class Download_Novel:
         # if not os.path.exists(file_path):
         #     os.makedirs(file_path)
         #     # print('文件夹不存在,创建文件夹')
-        file_name, status = self.get_multi_txt_file_status(file_name=file_name)
+        file_name, status = self.get_multi_txt_file_status(
+            file_name=file_name)
 
         if status:
             print(file_name + ' 已存在,跳过...\n')
@@ -149,15 +159,19 @@ class Download_Novel:
             retry = 8
             while retry > 0:
                 try:
-                    response = requests.get(chapter_url, headers=get_user_agent(), timeout=5)
+                    response = requests.get(
+                        chapter_url, headers=get_user_agent(), timeout=5)
                     soup = BeautifulSoup(response.text, 'html.parser')
-                    chapter_title = soup.select_one('.content h1').get_text()
+                    chapter_title = soup.select_one(
+                        '.content h1').get_text()
                     print(chapter_title)
-                    chapter_content = soup.select_one('div#chaptercontent').get_text().strip()
+                    chapter_content = soup.select_one(
+                        'div#chaptercontent').get_text().strip()
                     # print('before: '+chapter_content)
                     # # 将所有的<br/>标签替换成换行符\n
-                    chapter_content = chapter_content.replace('  ', '\n ')
+                    chapter_content = chapter_content.replace(
+                        '  ', '\n ')
                     # chapter_content = chapter_content.replace('<br/>', '\n')
                     content = re.sub(r'(第\d+章|请收藏本站|『点此报错).*$', '', chapter_content, flags=re.MULTILINE)
@@ -216,14 +230,16 @@ class Download_Novel:
 
         convert_status = True
         if convert_type == 0:
             print(self.file_path, self.download_path + self.title + '.txt')
-            convert_status = self.merge_txt_file(self.download_path + self.title + '.txt')
+            convert_status = self.merge_txt_file(
+                self.download_path + self.title + '.txt')
 
         elif convert_type == 1:
             txt_files = []
             for n in range(0, len(self.chapter_urls)):
                 txt_files.append(self.file_path + str(n) + '.txt')
-            convert_status = self.merge_txt_to_epub(txt_files, self.download_path + self.title + '.epub')
+            convert_status = self.merge_txt_to_epub(
+                txt_files, self.download_path + self.title + '.epub')
 
         if convert_status:
             print('合并成功!')
@@ -231,8 +247,8 @@ class Download_Novel:
             print('合并失败!请删除downloads下面目录后重新运行程序!')
             exit(1)
-
     # 合并为txt文件
+
     def merge_txt_file(self, merged_file_name=''):
         """
@@ -242,7 +258,8 @@ class Download_Novel:
         # os.chdir(file_path)
         if os.path.exists(merged_file_name):
             os.remove(merged_file_name)
-        print('merge file : ', sorted(os.listdir(self.file_path), key=lambda x: int(x.split('.')[0])))
+        print('merge file : ', sorted(os.listdir(self.file_path),
+                                      key=lambda x: int(x.split('.')[0])))
         time.sleep(self.interval)
 
         with open(merged_file_name, 'wb') as outfile:
@@ -306,7 +323,8 @@ class Download_Novel:
                 content = file.readlines()
                 try:
                     # 将所有换行符替换为<br/>
-                    content = [s.replace('\n', '') for s in content if len(s.strip()) > 0]
+                    content = [s.replace('\n', '')
+                               for s in content if len(s.strip()) > 0]
 
                     # 获取章节标题
                     chapter_title = content[0]
@@ -319,18 +337,19 @@ class Download_Novel:
                     for j, line in enumerate(content[1:]):
                         content[j + 1] = '<br/><br/>' + line + '<br/><br/>\n'
-                        # content.append('')
 
                 except IndexError as e:
                     print(e)
                     return False
 
                 # 创建一个章节对象
-                chapter = epub.EpubHtml(title=chapter_title, file_name='text/' + str(i) + '.xhtml')
+                chapter = epub.EpubHtml(
+                    title=chapter_title, file_name='text/' + str(i) + '.xhtml')
                 chapter.content = ''.join(content)  # 将整个文件内容作为章节内容
 
                 # 下面的是将css文件引用到单个章节里面
                 # page_style = open('./css/page_styles.css', 'r', encoding='utf-8').read()
-                page_style1 = open('./css/page_styles1.css', 'r', encoding='utf-8').read()
+                page_style1 = open('./css/page_styles1.css',
+                                   'r', encoding='utf-8').read()
                 style = open('./css/stylesheet.css', 'r', encoding='utf-8').read()
                 # chapter.add_item(
                 #     epub.EpubItem(uid="page_style", file_name="../style/page_styles.css", media_type="text/css",
@@ -345,7 +364,8 @@ class Download_Novel:
                 # 将章节添加到书籍中
                 book.add_item(chapter)
                 book.spine.append(chapter)
-                book.toc.append(epub.Link('text/' + str(i) + '.xhtml', chapter_title, str(i)))
+                book.toc.append(epub.Link('text/' + str(i) +
+                                          '.xhtml', chapter_title, str(i)))
                 # print('xxxxxxxx:','text/' + str(i) + '.xhtml', chapter_title, str(i))
 
         # 将目录添加到书籍中
@@ -354,11 +374,11 @@ class Download_Novel:
         book.add_item(epub.EpubNcx())
         book.add_item(epub.EpubNav())
 
-
-
         # 设置书籍的样式文件
-        page_style = open('./css/page_styles.css', 'r', encoding='utf-8').read()
-        page_style1 = open('./css/page_styles1.css', 'r', encoding='utf-8').read()
+        page_style = open('./css/page_styles.css',
+                          'r', encoding='utf-8').read()
+        page_style1 = open('./css/page_styles1.css',
+                           'r', encoding='utf-8').read()
         style = open('./css/stylesheet.css', 'r', encoding='utf-8').read()
 
         book.add_item(
@@ -374,7 +394,7 @@ class Download_Novel:
         epub.write_epub(epub_file, book, {})
         return True
 
-    def __init__(self, name):
+    def __init__(self, name=None, search_url=None):
         self.file_path = None
         self.chapter_urls = None
         self.cover = None
@@ -383,6 +403,7 @@ class Download_Novel:
         self.author = None
         self.title = None
         self.name = name
+        self.search_url = search_url
 
         # 定义请求间隔时间(秒)
         self.interval = 2
@@ -392,11 +413,20 @@ class Download_Novel:
 
 
 if __name__ == '__main__':
 
-    search_name = input('请输入要搜索的书籍名称: ')
-    if search_name:
-        download_novel = Download_Novel(search_name)
-        response = download_novel.search_novel()
-        print(response)
+    search_type = input('请选择你要下载的方式(0 or 1):\n0) 使用名称搜索\n1) 直接输入url(格式如:https://www.bqg221.com/biquge/17931/)\n')
+
+    # if isinstance(search_type, int) and 0 <= search_type <= 1:
+    #     download_novel
+    download_novel = Download_Novel()
+    if search_type == str(0):
+        search_name = input('请输入要搜索的书籍名称: ')
+        download_novel.name = search_name
+    else:
+        download_novel.search_url = search_type
+
+    response = download_novel.search_novel()
+    # print(response)
+    if download_novel.name != None:
         print('搜索到 ' + str(len(response)) + ' 个结果\n')
         print('---------------------------------------\n')
         for i, book in enumerate(reversed(response)):
@@ -404,13 +434,26 @@ if __name__ == '__main__':
             'intro'] + '...\n')
         print('---------------------------------------')
         print('---------------------------------------\n')
-        select_book = int(input(f'选择要下载的书籍序号(从0-{str(len(response) - 1)}中选择):'))
+        select_book = int(
+            input(f'选择要下载的书籍序号(从0-{str(len(response) - 1)}中选择):'))
         # 判断输入合法
        if isinstance(select_book, int) and 0 <= select_book <= len(response):
             download_novel.get_novel_info(response[select_book])
             download_novel.download_process()
         else:
             print('输入内容不合法!')
+        # print('---------------------------------------\n')
+        # # for i, book in enumerate(reversed(response)):
+        # print( ' 书籍名称:' + response['articlename'] + '\n作者:' + response['author'] + '\n简介:' + response[
+        #     'intro'] + '...\n')
+        # print('---------------------------------------')
+        # print('---------------------------------------\n')
-    else:
-        exit(0)
+    else:
+        download_novel.get_novel_info()
+        download_novel.download_process()
+
+    # else:
+    #     exit(0)
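Reviewer note: with `name` and `search_url` both optional on `__init__`, the two entry paths in the revised `__main__` reduce to the sketch below (assuming the script is importable as `from Download_Novel import Download_Novel`; the book title is a placeholder and the URL is the example format from the prompt):

```python
from Download_Novel import Download_Novel

# Mode 0: search by name, then download a chosen result.
novel = Download_Novel(name='示例书名')            # placeholder title
results = novel.search_novel()                     # list of {url_list, articlename, author, intro}
novel.get_novel_info(results[0])                   # scrape title/author/cover/chapter list
novel.download_process()

# Mode 1 (added by this diff): pass the book page URL directly, no search step.
novel = Download_Novel(search_url='https://www.bqg221.com/biquge/17931/')
novel.get_novel_info()                             # response=None -> falls back to self.search_url
novel.download_process()
```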
diff --git a/README.md b/README.md
index af1a6b4..3a10530 100644
--- a/README.md
+++ b/README.md
@@ -6,4 +6,5 @@
 - [x] 实现合并txt
 - [x] 实现合并为epub
 - [x] 实现搜索功能
-- [ ] 实现多个书源替换
\ No newline at end of file
+- [ ] 实现多个书源替换
+- [x] 实现直接输入网址下载
\ No newline at end of file
diff --git a/pdm.lock b/pdm.lock
index f42a77a..dfcda84 100644
--- a/pdm.lock
+++ b/pdm.lock
@@ -1,6 +1,23 @@
 # This file is @generated by PDM.
 # It is not intended for manual editing.
 
+[[package]]
+name = "beautifulsoup4"
+version = "4.12.2"
+requires_python = ">=3.6.0"
+summary = "Screen-scraping library"
+dependencies = [
+    "soupsieve>1.2",
+]
+
+[[package]]
+name = "bs4"
+version = "0.0.1"
+summary = "Screen-scraping library"
+dependencies = [
+    "beautifulsoup4",
+]
+
 [[package]]
 name = "certifi"
 version = "2023.5.7"
@@ -58,6 +75,12 @@ version = "1.16.0"
 requires_python = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*"
 summary = "Python 2 and 3 compatibility utilities"
 
+[[package]]
+name = "soupsieve"
+version = "2.4.1"
+requires_python = ">=3.7"
+summary = "A modern CSS selector implementation for Beautiful Soup."
+
 [[package]]
 name = "urllib3"
 version = "2.0.3"
@@ -68,9 +91,16 @@ summary = "HTTP library with thread-safe connection pooling, file post, and more
 lock_version = "4.2"
 cross_platform = true
 groups = ["default"]
-content_hash = "sha256:095da8eef1987e0630ebddf4a4513ad826ad8630af33554248065ff297544cb6"
+content_hash = "sha256:cecd9231f4ed5227cc9f2c8a4225b2e84fc47b083042291cb5e26d8bc24a7199"
 
 [metadata.files]
+"beautifulsoup4 4.12.2" = [
+    {url = "https://files.pythonhosted.org/packages/57/f4/a69c20ee4f660081a7dedb1ac57f29be9378e04edfcb90c526b923d4bebc/beautifulsoup4-4.12.2-py3-none-any.whl", hash = "sha256:bd2520ca0d9d7d12694a53d44ac482d181b4ec1888909b035a3dbf40d0f57d4a"},
+    {url = "https://files.pythonhosted.org/packages/af/0b/44c39cf3b18a9280950ad63a579ce395dda4c32193ee9da7ff0aed547094/beautifulsoup4-4.12.2.tar.gz", hash = "sha256:492bbc69dca35d12daac71c4db1bfff0c876c00ef4a2ffacce226d4638eb72da"},
+]
+"bs4 0.0.1" = [
+    {url = "https://files.pythonhosted.org/packages/10/ed/7e8b97591f6f456174139ec089c769f89a94a1a4025fe967691de971f314/bs4-0.0.1.tar.gz", hash = "sha256:36ecea1fd7cc5c0c6e4a1ff075df26d50da647b75376626cc186e2212886dd3a"},
+]
 "certifi 2023.5.7" = [
     {url = "https://files.pythonhosted.org/packages/93/71/752f7a4dd4c20d6b12341ed1732368546bc0ca9866139fe812f6009d9ac7/certifi-2023.5.7.tar.gz", hash = "sha256:0f0d56dc5a6ad56fd4ba36484d6cc34451e1c6548c61daad8c320169f91eddc7"},
     {url = "https://files.pythonhosted.org/packages/9d/19/59961b522e6757f0c9097e4493fa906031b95b3ebe9360b2c3083561a6b4/certifi-2023.5.7-py3-none-any.whl", hash = "sha256:c6c2e98f5c7869efca1f8916fed228dd91539f9f1b444c314c06eef02980c716"},
@@ -250,6 +280,10 @@ content_hash = "sha256:095da8eef1987e0630ebddf4a4513ad826ad8630af33554248065ff29
     {url = "https://files.pythonhosted.org/packages/71/39/171f1c67cd00715f190ba0b100d606d440a28c93c7714febeca8b79af85e/six-1.16.0.tar.gz", hash = "sha256:1e61c37477a1626458e36f7b1d82aa5c9b094fa4802892072e49de9c60c4c926"},
     {url = "https://files.pythonhosted.org/packages/d9/5a/e7c31adbe875f2abbb91bd84cf2dc52d792b5a01506781dbcf25c91daf11/six-1.16.0-py2.py3-none-any.whl", hash = "sha256:8abb2f1d86890a2dfb989f9a77cfcfd3e47c2a354b01111771326f8aa26e0254"},
 ]
+"soupsieve 2.4.1" = [
+    {url = "https://files.pythonhosted.org/packages/47/9e/780779233a615777fbdf75a4dee2af7a345f4bf74b42d4a5f836800b9d91/soupsieve-2.4.1.tar.gz", hash = "sha256:89d12b2d5dfcd2c9e8c22326da9d9aa9cb3dfab0a83a024f05704076ee8d35ea"},
+    {url = "https://files.pythonhosted.org/packages/49/37/673d6490efc51ec46d198c75903d99de59baffdd47aea3d071b80a9e4e89/soupsieve-2.4.1-py3-none-any.whl", hash = "sha256:1c1bfee6819544a3447586c889157365a27e10d88cde3ad3da0cf0ddf646feb8"},
+]
 "urllib3 2.0.3" = [
     {url = "https://files.pythonhosted.org/packages/8a/03/ad9306a50d05c166e3456fe810f33cee2b8b2a7a6818ec5d4908c4ec6b36/urllib3-2.0.3-py3-none-any.whl", hash = "sha256:48e7fafa40319d358848e1bc6809b208340fafe2096f1725d05d67443d0483d1"},
     {url = "https://files.pythonhosted.org/packages/d6/af/3b4cfedd46b3addab52e84a71ab26518272c23c77116de3c61ead54af903/urllib3-2.0.3.tar.gz", hash = "sha256:bee28b5e56addb8226c96f7f13ac28cb4c301dd5ea8a6ca179c0b9835e032825"},
diff --git a/pyproject.toml b/pyproject.toml
index 39c2796..15cced7 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -10,6 +10,7 @@ dependencies = [
     "requests>=2.31.0",
     "ebooklib>=0.18",
     "setuptools>=68.0.0",
+    "bs4>=0.0.1",
 ]
 requires-python = ">=3.11"
 license = {text = "MIT"}
diff --git a/test_search.py b/test_search.py
index 251e918..e6d3935 100644
--- a/test_search.py
+++ b/test_search.py
@@ -35,19 +35,23 @@ class Download_Novel:
     def search_novel(self):
         # 定义请求间隔时间(秒)
         interval = 2
-        hm_url = f'https://user.bqgso.cc/hm.html?&q={self.name}'
-        result = requests.get(hm_url, headers=self.get_user_agent()).text
+        # hm_url = f'https://user.bqgso.cc/hm.html?&q={self.name}'
+        # result = requests.get(hm_url, headers=self.get_user_agent()).text
         # print(result)
-        hm = result[2:-2]
+        # hm = result[2:-2]
         # print(hm)
         # 发起请求并获取响应
-        url = f'https://user.bqgso.cc/search.html?&q={self.name}&hm={hm}'
-
+        url = f'https://www.bqg221.com/user/search.html?q={self.name}'
+        print(url)
+        print('响应内容:', requests.get(url, headers=self.get_user_agent()).text)
         response = json.loads(requests.get(url, headers=self.get_user_agent()).text[1:-1])
+        print(response)
         # print(type(response))
         for i, book in enumerate(response):
             # i['url_list'][:9] = 'https://www'
-            trans_url = book['url_list'].replace('https://m', 'https://www')
+            # trans_url = book['url_list'].replace('https://m', 'https://www')
+            print(type(book['url_list']))
+            trans_url = 'https://www.bqg221.com' + str(book['url_list'])
             response[i]['url_list'] = trans_url
 
         # 返回一个json对象
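Dependency note: the `bs4` package on PyPI is only a shim that installs `beautifulsoup4`, which is why the lock file gains `beautifulsoup4`, `bs4`, and `soupsieve` together. A quick sanity check that the import the scraper relies on resolves, assuming BeautifulSoup 4.12 as pinned above:

```python
from bs4 import BeautifulSoup  # actual implementation comes from beautifulsoup4

# Mirrors the selector used in get_novel_info; html.parser needs no extra dependency.
html = '<div class="book"><h1>示例书名</h1></div>'
soup = BeautifulSoup(html, 'html.parser')
print(soup.select_one('.book h1').get_text(strip=True))  # -> 示例书名
```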