import io
import os
import random
import shutil
import threading
import time
import requests
from bs4 import BeautifulSoup
import re
from ebooklib import epub
def get_user_agent():
    # Several User-Agent pools, one per browser family, each holding a few strings
    user_agents = [
        [
            'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3',
            'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/57.0.2987.133 Safari/537.3',
            'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/51.0.2704.103 Safari/537.36',
        ],
        [
            'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:54.0) Gecko/20100101 Firefox/54.0',
            'Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:56.0) Gecko/20100101 Firefox/56.0',
            'Mozilla/5.0 (Windows NT 10.0; WOW64; rv:53.0) Gecko/20100101 Firefox/53.0',
        ],
        [
            'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_4) AppleWebKit/603.1.30 (KHTML, like Gecko) Version/10.1 Safari/603.1.30',
            'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36',
            'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_6) AppleWebKit/604.1.38 (KHTML, like Gecko) Version/11.0 Safari/604.1.38',
        ],
    ]
    # Pick a random pool, then a random User-Agent string from it,
    # so consecutive requests do not all present the same browser
    user_agent_list = random.choice(user_agents)
    user_agent = random.choice(user_agent_list)
    return {'User-Agent': user_agent}

# Delay between requests, in seconds
interval = 2

# Homepage of the novel to scrape; headers from get_user_agent() mimic a browser
url = 'https://www.bqg221.com/xs/17931/'
# Send the request and fetch the response
response = requests.get(url, headers=get_user_agent())
# Parse the response with BeautifulSoup
soup = BeautifulSoup(response.text, 'html.parser')
# Extract the novel title
title = soup.select_one('.book h1').get_text(strip=True)
print(title)
# Extract the metadata block: author, status, last update time, latest chapter
# print(soup.select('.small')[0])
div_tag = soup.find('div', {'class': 'small'})
# print(div_tag)
all_span_tags = div_tag.find_all('span')
# print(all_span_tags)
author = all_span_tags[0].text.strip()[3:]  # drop the leading '作者:' label
status = all_span_tags[1].text.strip()
update_time = all_span_tags[2].text.strip()
latest_update = all_span_tags[3].text.strip()
# for i in all_span_tags:
#     print(i.text.strip())
# Synopsis; the last six characters are site boilerplate, so drop them
intro = soup.select_one('.intro').get_text(strip=True)[:-6]
print(intro)
# Cover image URL
cover = soup.select_one('.cover img')['src']
# print(cover)
# Collect every chapter link, skipping the 'javascript:dd_show()' expander entry
chapter_urls = [url + i.get('href').split('/')[-1] for i in soup.select('.listmain a')
                if i.get('href').split('/')[-1] != 'javascript:dd_show()']
# print(chapter_urls)
print('Starting download...')
# Pause for two seconds before hitting the chapter pages
time.sleep(interval)

# Download chapters as txt files on multiple threads.
# Assumption: download_url() needs a semaphore that this excerpt never defines;
# the cap of 5 concurrent downloads below is an arbitrary choice.
semaphore = threading.Semaphore(5)

def download_url(chapter_url, file_name):
    # Cap the number of concurrently running download threads
    with semaphore:
        file_path = './' + title
        file_name = file_path + '/' + file_name
        if not os.path.exists(file_path):
            os.makedirs(file_path)
            print('Folder not found, creating it')
        if os.path.exists(file_name) and os.path.getsize(file_name) > 0:
            print(file_name + ' already exists, skipping...\n')
            # success_account += 1
        else:
            print('Downloading: ' + file_name)
            with open(file_name, 'w', encoding='utf-8') as f:
                retry = 8
                while retry > 0:
                    try:
                        response = requests.get(chapter_url, headers=get_user_agent(), timeout=5)
                        soup = BeautifulSoup(response.text, 'html.parser')
                        chapter_title = soup.select_one('.content h1').get_text()
                        print(chapter_title)
                        chapter_content = soup.select_one('div#chaptercontent').get_text().strip()
                        # print('before: ' + chapter_content)
                        # Replace all <br/> tags with newline characters
                        chapter_content = chapter_content.replace(' ', '\n ')
                        # chapter_content = chapter_content.replace('<br/>', '\n')
                        # Strip chapter-number headers and site boilerplate lines
                        content = re.sub(r'(第\d+章|请收藏本站|『点此报错).*$', '', chapter_content, flags=re.MULTILINE)
                        # print(content)
                        # Write the cleaned chapter into its txt file
                        f.write(chapter_title + '\n' + content + '\n\n')
                        # success_account = success_account + 1
                        break
                        # return True
                    except Exception as e:
                        print(e, '\n retry...')
                        time.sleep(interval)
                        retry -= 1
                        # return False
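
# A minimal driver sketch (an assumption -- the original launch code is not shown
# in this excerpt): start one thread per chapter, let download_url() throttle
# concurrency through the semaphore, and wait for all threads before merging.
# The zero-padded file names are a hypothetical choice that keeps chapters sorted.
threads = []
for idx, chapter_url in enumerate(chapter_urls):
    t = threading.Thread(target=download_url, args=(chapter_url, f'{idx:04d}.txt'))
    t.start()
    threads.append(t)
for t in threads:
    t.join()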

# Merge the txt files into an epub
def merge_txt_to_epub(txt_files=[], epub_file='', author='', cover='', direction=''):
    """
    Merge txt files into one epub.

    txt_files (list): paths of the txt files to merge
    epub_file (str): path/name of the epub file to produce
    author (str): author name
    cover (str): cover image URL
    direction (str): book synopsis
    """
    # Create the EPUB book object
    book = epub.EpubBook()
    if os.path.exists(epub_file):
        os.remove(epub_file)
    # Set the metadata (adjust as needed)
    book.set_title(title)
    book.set_language('zh')
    book.add_author(author)
    # The synopsis belongs in the Dublin Core description field
    # (set_direction() sets reading direction, not the blurb)
    book.add_metadata('DC', 'description', direction)
    # Add the cover: fetch the image and hand its bytes to ebooklib
    response = requests.get(cover)
    stream = io.BytesIO(response.content)
    book.set_cover('cover.jpg', stream.getvalue(), 'image/jpeg')
    print('Merging...')
    # Spine (reading order) of the book
    book_spine = []
    # Work inside the download folder and walk through every txt file
    os.chdir(title)
    for i, txt_file in enumerate(txt_files):
        # Read the chapter txt file
        with open(txt_file, 'r', encoding='utf-8') as file:
            content = file.readlines()
        try:
            # Strip the newline characters and drop blank lines
            content = [s.replace('\n', '') for s in content if len(s.strip()) > 0]
            # The first line is the chapter title
            chapter_title = content[0]
            # print(chapter_title)
            # content = ['<p>' + line + '</p>' for line in content]
            # Build the chapter HTML: a heading, then one <p> per paragraph
            content[0] = f"""<h1>{chapter_title}</h1>"""
            content[1:] = ['<p>' + line + '</p>' for line in content[1:]]