diff --git a/.gitignore b/.gitignore
index bd911c8..5d3607c 100644
--- a/.gitignore
+++ b/.gitignore
@@ -162,3 +162,4 @@ cython_debug/
/test_get_txt.py
/test_tmp.py
test_search.py
+test.py
diff --git a/.vscode/settings.json b/.vscode/settings.json
new file mode 100644
index 0000000..9ee86e7
--- /dev/null
+++ b/.vscode/settings.json
@@ -0,0 +1,6 @@
+{
+ "[python]": {
+ "editor.defaultFormatter": "ms-python.autopep8"
+ },
+ "python.formatting.provider": "none"
+}
\ No newline at end of file
diff --git a/Download_Novel.py b/Download_Novel.py
index a3b8a9b..6d40fd1 100644
--- a/Download_Novel.py
+++ b/Download_Novel.py
@@ -40,33 +40,42 @@ def get_user_agent():
class Download_Novel:
def search_novel(self):
- hm_url = f'https://user.bqgso.cc/hm.html?&q={self.name}'
- result = requests.get(hm_url, headers=get_user_agent()).text
- # print(result)
- hm = result[2:-2]
+ # hm_url = f'https://user.bqgso.cc/hm.html?&q={self.name}'
+ # result = requests.get(hm_url, headers=get_user_agent()).text
+ # # print(result)
+ # hm = result[2:-2]
# print(hm)
+ if self.name is not None:
# Send the request and get the response
- url = f'https://user.bqgso.cc/search.html?&q={self.name}&hm={hm}'
+ url = f'https://www.bqg222.com/user/search.html?q={self.name}'
+ # print(url)
+ # print(requests.get(url, headers=get_user_agent()).text[1:-1])
- response = json.loads(requests.get(url, headers=get_user_agent()).text[1:-1])
- # print(type(response))
- for i, book in enumerate(response):
- # i['url_list'][:9] = 'https://www'
- trans_url = book['url_list'].replace('https://m', 'https://www')
- response[i]['url_list'] = trans_url
+ response = json.loads(requests.get(
+ url, headers=get_user_agent()).text[1:-1])
+ # print(type(response))
+ for i, book in enumerate(response):
+ # i['url_list'][:9] = 'https://www'
+ # trans_url = book['url_list'].replace('https://m', 'https://www')
+ # print(type(book['url_list']))
+ trans_url = 'https://www.bqg221.com' + str(book['url_list'])
+ response[i]['url_list'] = trans_url
- # Return a JSON object
- return response
+ # Return a JSON object
+ return response
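For context, a minimal standalone sketch of the new search flow this hunk implements. It assumes the endpoint still wraps its JSON array in one extra character on each side (hence the `text[1:-1]` slice) and that `url_list` now holds a relative path; `search_books` is a hypothetical helper, not part of the diff. Note the search API and the book pages sit on different mirror domains (bqg222 vs. bqg221), exactly as the hunk shows.

```python
import json
import requests

SEARCH_API = 'https://www.bqg222.com/user/search.html'  # search endpoint from this hunk
SITE_ROOT = 'https://www.bqg221.com'  # book pages are served from a mirror domain

def search_books(name, headers=None):
    # The payload arrives wrapped in one extra character on each side,
    # so slice it off before parsing (mirrors text[1:-1] above).
    raw = requests.get(SEARCH_API, params={'q': name},
                       headers=headers, timeout=10).text
    books = json.loads(raw[1:-1])
    for book in books:
        # url_list is a relative path; join it onto the site root.
        book['url_list'] = SITE_ROOT + str(book['url_list'])
    return books
```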
- def get_novel_info(self, response):
+ def get_novel_info(self, response=None):
# Define the request interval (seconds)
interval = 2
# Set request headers to mimic a browser
-
+ if response is not None:
# Homepage URL of the novel to crawl
- url = response['url_list']
+ url = response['url_list']
+ else:
+ url = self.search_url
# Send the request and get the response
url_response = requests.get(url, headers=get_user_agent())
@@ -76,7 +85,7 @@ class Download_Novel:
# Get the novel title
# title = soup.select_one('.book h1').get_text(strip=True)
- self.title = response['articlename']
+ self.title = soup.select_one('.book h1').get_text(strip=True)
print(self.title)
# Get the novel synopsis
@@ -86,7 +95,7 @@ class Download_Novel:
all_span_tags = div_tag.find_all('span')
# print(all_span_tags)
# author = all_span_tags[0].text.strip()[3:]
- self.author = response['author']
+ self.author = all_span_tags[0].text.strip()[3:]
self.status = all_span_tags[1].text.strip()
self.update_time = all_span_tags[2].text.strip()
self.latest_update = all_span_tags[3].text.strip()
@@ -96,7 +105,7 @@ class Download_Novel:
print(self.intro)
# cover = soup.select_one('.cover img')['src']
- self.cover = response['url_img']
+ self.cover = soup.select_one('.cover img')['src']
# print(cover)
# Get links to all of the novel's chapters
self.chapter_urls = [url + i.get('href').split('/')[-1] for i in soup.select('.listmain a') if
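Since this hunk switches the metadata source from the search API's JSON fields to scraping the book page itself, here is a compact sketch of the selector-based extraction. `get_book_meta` is a hypothetical helper that uses only the selectors visible in the hunks above; the author/status spans come from a surrounding div whose selector is not shown, so they are omitted.

```python
import requests
from bs4 import BeautifulSoup

def get_book_meta(url, headers=None):
    # Selectors taken from the hunks above: title from '.book h1',
    # cover image from '.cover img'.
    soup = BeautifulSoup(requests.get(url, headers=headers, timeout=10).text,
                         'html.parser')
    return {
        'title': soup.select_one('.book h1').get_text(strip=True),
        'cover': soup.select_one('.cover img')['src'],
    }
```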
@@ -138,7 +147,8 @@ class Download_Novel:
# if not os.path.exists(file_path):
# os.makedirs(file_path)
# # print('Folder does not exist, creating it')
- file_name, status = self.get_multi_txt_file_status(file_name=file_name)
+ file_name, status = self.get_multi_txt_file_status(
+ file_name=file_name)
if status:
print(file_name + ' already exists, skipping...\n')
@@ -149,15 +159,19 @@ class Download_Novel:
retry = 8
while retry > 0:
try:
- response = requests.get(chapter_url, headers=get_user_agent(), timeout=5)
+ response = requests.get(
+ chapter_url, headers=get_user_agent(), timeout=5)
soup = BeautifulSoup(response.text, 'html.parser')
- chapter_title = soup.select_one('.content h1').get_text()
+ chapter_title = soup.select_one(
+ '.content h1').get_text()
print(chapter_title)
- chapter_content = soup.select_one('div#chaptercontent').get_text().strip()
+ chapter_content = soup.select_one(
+ 'div#chaptercontent').get_text().strip()
# print('before: '+chapter_content)
# # Replace every <br/> tag with a newline (\n)
- chapter_content = chapter_content.replace(' ', '\n ')
+ chapter_content = chapter_content.replace(
+ ' ', '\n ')
# chapter_content = chapter_content.replace('<br/>', '\n')
content = re.sub(r'(第\d+章|请收藏本站|『点此报错).*$', '', chapter_content,
flags=re.MULTILINE)
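The `re.sub` above strips per-line site boilerplate. Because `.` does not cross newlines, `re.MULTILINE` makes `$` match at each line end, so every matching line is blanked individually. A mini-demo with made-up chapter text:

```python
import re

# Hypothetical scraped text: story lines mixed with site boilerplate.
raw = ('第12章 夜行\n'
       '正文第一段。\n'
       '请收藏本站:https://www.bqg221.com\n'
       '『点此报错』')
clean = re.sub(r'(第\d+章|请收藏本站|『点此报错).*$', '', raw, flags=re.MULTILINE)
print(clean)  # the chapter-number header and boilerplate lines are emptied;
              # the story line survives (the title is captured separately from h1)
```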
@@ -216,14 +230,16 @@ class Download_Novel:
convert_status = True
if convert_type == 0:
print(self.file_path, self.download_path + self.title + '.txt')
- convert_status = self.merge_txt_file(self.download_path + self.title + '.txt')
+ convert_status = self.merge_txt_file(
+ self.download_path + self.title + '.txt')
elif convert_type == 1:
txt_files = []
for n in range(0, len(self.chapter_urls)):
txt_files.append(self.file_path + str(n) + '.txt')
- convert_status = self.merge_txt_to_epub(txt_files, self.download_path + self.title + '.epub')
+ convert_status = self.merge_txt_to_epub(
+ txt_files, self.download_path + self.title + '.epub')
if convert_status:
print('Merge succeeded!')
@@ -231,8 +247,8 @@ class Download_Novel:
print('Merge failed! Delete the directory under downloads and rerun the program!')
exit(1)
-
# Merge into a single txt file
+
def merge_txt_file(self, merged_file_name=''):
"""
@@ -242,7 +258,8 @@ class Download_Novel:
# os.chdir(file_path)
if os.path.exists(merged_file_name):
os.remove(merged_file_name)
- print('merge file : ', sorted(os.listdir(self.file_path), key=lambda x: int(x.split('.')[0])))
+ print('merge file : ', sorted(os.listdir(self.file_path),
+ key=lambda x: int(x.split('.')[0])))
time.sleep(self.interval)
with open(merged_file_name, 'wb') as outfile:
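The numeric sort key matters here: chapter files are named 0.txt, 1.txt, …, and a plain lexicographic `os.listdir` order would put 10.txt before 2.txt. A standalone sketch of the merge under that naming assumption (`merge_chapter_files` is hypothetical):

```python
import os

def merge_chapter_files(chapter_dir, merged_file_name):
    # Sort numerically by the integer before '.txt', then concatenate
    # the parts byte-for-byte into the output file.
    parts = sorted(os.listdir(chapter_dir), key=lambda x: int(x.split('.')[0]))
    with open(merged_file_name, 'wb') as outfile:
        for part in parts:
            with open(os.path.join(chapter_dir, part), 'rb') as infile:
                outfile.write(infile.read())
```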
@@ -306,7 +323,8 @@ class Download_Novel:
content = file.readlines()
try:
# Strip all newline characters; each line is wrapped in <p> tags below
- content = [s.replace('\n', '') for s in content if len(s.strip()) > 0]
+ content = [s.replace('\n', '')
+ for s in content if len(s.strip()) > 0]
# Get the chapter title
chapter_title = content[0]
@@ -319,18 +337,19 @@ class Download_Novel:
for j, line in enumerate(content[1:]):
content[j + 1] = '<p>' + line + '</p>\n'
- # content.append('
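The tail of this hunk builds the chapter body for the epub: line 0 is treated as the title and every following line is wrapped in a `<p>` element. A sketch of that transformation as a pure function, assuming the `<p>`-per-line structure reconstructed above (`chapter_to_html` is hypothetical):

```python
def chapter_to_html(lines):
    # Drop blank lines and trailing newlines, as the hunk above does.
    body = [s.replace('\n', '') for s in lines if len(s.strip()) > 0]
    title, paragraphs = body[0], body[1:]
    return '\n'.join([f'<h2>{title}</h2>'] +
                     [f'<p>{line}</p>' for line in paragraphs])
```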