1. 主程序
我这里只写了一个 main.py,用一个主函数就解决了。
import requests
import re
from bs4 import BeautifulSoup
import os
if __name__ == '__main__':
    # Catalogue page of the novel being tracked.
    novel_url = "https://www.bige3.com/book/1030/"
    # Number of newly published chapters since the last recorded one (0 = none).
    return_value = is_update(novel_url)
    if return_value == 0:
        print("小说尚未更新!")
    else:
        print("小说已更新" + str(return_value) + "章!")
        print("正在下载已更新的小说......")
        download_novel(return_value)
2. 功能函数
2.1 功能函数is_update()
def is_update(url):
    """Check the novel's catalogue page for newly published chapters.

    Scrapes the last ``<a>`` tag's chapter title from the catalogue page and
    compares it with the title saved in "小说更新记录.txt". On a change, the
    record file is overwritten with the new title.

    :param url: catalogue page URL of the novel
    :return: 0 when there is no update (or the request failed);
        otherwise the number of new chapters (new number - recorded number)
    """
    headers = {
        "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/98.0.4758.102 Safari/537.36"
    }
    try:
        resp = requests.get(url, headers=headers)
        resp.raise_for_status()
        resp.encoding = 'utf-8'
    except requests.RequestException:
        # The original bare except printed a message and then kept using the
        # failed/undefined response, crashing. Report and treat as "no update".
        print("爬取失败")
        return 0
    # Chapter link titles; the last one is the newest chapter.
    chapters = re.findall(r'<a href =.*?>(.*?)</a>', resp.text)
    with open("小说更新记录.txt", "r", encoding='utf-8') as f:
        data = f.read()
    latest = str(chapters[-1])
    if data == latest:
        return 0
    # Pull the chapter numbers out of both titles and diff them.
    data_num = ''.join(re.findall(r'\d+', data))
    resp_num = ''.join(re.findall(r'\d+', latest))
    gap_num = int(resp_num) - int(data_num)
    # Persist the new latest-chapter title for the next run.
    with open("小说更新记录.txt", "w", encoding='utf-8') as f:
        f.write(latest)
    print("writing is ok!")
    return gap_num
2.2 功能函数download_novel(return_value)
def download_novel(return_value, base_url="https://www.bige3.com/book/1030/"):
    """Download the `return_value` most recently updated chapters.

    Reads the latest chapter number from "小说更新记录.txt" (already refreshed
    by ``is_update``) and fetches the newest chapter pages, saving each one
    as a .txt file under ./novel_downloads/.

    :param return_value: number of newly updated chapters to download (>= 1)
    :param base_url: catalogue URL of the novel; defaults to the value the
        original code read from a module-level global, so existing callers
        keep working
    """
    if return_value >= 1:
        # The original required this folder to exist beforehand; create it.
        os.makedirs("novel_downloads", exist_ok=True)
        # The record file does not change inside the loop — read it once.
        with open("小说更新记录.txt", "r", encoding='utf-8') as f:
            data = f.read()
        latest_num = int(''.join(re.findall(r'\d+', data)))
        for i in range(1, return_value + 1):
            print(i)
            # Page number appears offset by one from the chapter number
            # (file below is saved as chapter download_num - 1) — TODO confirm.
            download_num = latest_num + 1 - (i - 1)
            chapter_url = base_url + str(download_num) + '.html'
            print(chapter_url)
            resp = requests.get(chapter_url)
            soup = BeautifulSoup(resp.text, 'lxml')
            page_text = soup.text
            # Chapter body sits between the "下一章" link text and the
            # "『点此报错" footer on the page.
            mytxt = page_text[page_text.find('下一章'):page_text.rfind('『点此报错')]
            mytxt = mytxt[3:].strip()
            # NOTE(review): the replaced run was copied verbatim from the page
            # source — confirm the exact whitespace characters.
            mytxt = mytxt.replace('  ', '\n')
            novel_save_location = "./novel_downloads/逆天邪神第" + str(download_num - 1) + "章.txt"
            with open(novel_save_location, "w", encoding='utf-8') as f:
                f.write(mytxt)
            print("下载完毕!")
    else:
        print("invalid parameter!")
调试时要创建文件夹
novel_downloads
,并在 PyCharm 中标注为 Exclusion,防止 PyCharm 对其自动建立索引而使电脑卡顿。
封装后的main.exe要保证它所在的路径下有两个东西:文件夹
novel_downloads
和文件
小说更新记录.txt
。
初始阶段保证文件
小说更新记录.txt
里有个数字就行,随便啥(1 or 1935等)
全部代码:(直接能爬)
import requests
from lxml import etree
import re
from bs4 import BeautifulSoup
import os
def is_update(url):
    """Check the novel's catalogue page for newly published chapters.

    Scrapes the last ``<a>`` tag's chapter title from the catalogue page and
    compares it with the title saved in "小说更新记录.txt". On a change, the
    record file is overwritten with the new title.

    :param url: catalogue page URL of the novel
    :return: 0 when there is no update (or the request failed);
        otherwise the number of new chapters (new number - recorded number)
    """
    headers = {
        "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/98.0.4758.102 Safari/537.36"
    }
    try:
        resp = requests.get(url, headers=headers)
        resp.raise_for_status()
        resp.encoding = 'utf-8'
    except requests.RequestException:
        # The original bare except printed a message and then kept using the
        # failed/undefined response, crashing. Report and treat as "no update".
        print("爬取失败")
        return 0
    # Chapter link titles; the last one is the newest chapter.
    chapters = re.findall(r'<a href =.*?>(.*?)</a>', resp.text)
    with open("小说更新记录.txt", "r", encoding='utf-8') as f:
        data = f.read()
    latest = str(chapters[-1])
    if data == latest:
        return 0
    # Pull the chapter numbers out of both titles and diff them.
    data_num = ''.join(re.findall(r'\d+', data))
    resp_num = ''.join(re.findall(r'\d+', latest))
    gap_num = int(resp_num) - int(data_num)
    # Persist the new latest-chapter title for the next run.
    with open("小说更新记录.txt", "w", encoding='utf-8') as f:
        f.write(latest)
    print("writing is ok!")
    return gap_num
def download_novel(return_value, base_url="https://www.bige3.com/book/1030/"):
    """Download the `return_value` most recently updated chapters.

    Reads the latest chapter number from "小说更新记录.txt" (already refreshed
    by ``is_update``) and fetches the newest chapter pages, saving each one
    as a .txt file under ./novel_downloads/.

    :param return_value: number of newly updated chapters to download (>= 1)
    :param base_url: catalogue URL of the novel; defaults to the value the
        original code read from a module-level global, so existing callers
        keep working
    """
    if return_value >= 1:
        # The original required this folder to exist beforehand; create it.
        os.makedirs("novel_downloads", exist_ok=True)
        # The record file does not change inside the loop — read it once.
        with open("小说更新记录.txt", "r", encoding='utf-8') as f:
            data = f.read()
        latest_num = int(''.join(re.findall(r'\d+', data)))
        for i in range(1, return_value + 1):
            print(i)
            # Page number appears offset by one from the chapter number
            # (file below is saved as chapter download_num - 1) — TODO confirm.
            download_num = latest_num + 1 - (i - 1)
            chapter_url = base_url + str(download_num) + '.html'
            print(chapter_url)
            resp = requests.get(chapter_url)
            soup = BeautifulSoup(resp.text, 'lxml')
            page_text = soup.text
            # Chapter body sits between the "下一章" link text and the
            # "『点此报错" footer on the page.
            mytxt = page_text[page_text.find('下一章'):page_text.rfind('『点此报错')]
            mytxt = mytxt[3:].strip()
            # NOTE(review): the replaced run was copied verbatim from the page
            # source — confirm the exact whitespace characters.
            mytxt = mytxt.replace('  ', '\n')
            novel_save_location = "./novel_downloads/逆天邪神第" + str(download_num - 1) + "章.txt"
            with open(novel_save_location, "w", encoding='utf-8') as f:
                f.write(mytxt)
            print("下载完毕!")
    else:
        print("invalid parameter!")
if __name__ == '__main__':
    # Catalogue page of the novel being tracked.
    novel_url = "https://www.bige3.com/book/1030/"
    # Number of newly published chapters since the last recorded one (0 = none).
    return_value = is_update(novel_url)
    if return_value == 0:
        print("小说尚未更新!")
    else:
        print("小说已更新" + str(return_value) + "章!")
        print("正在下载已更新的小说......")
        download_novel(return_value)
    # Keep the console window open when run as a packaged .exe (Windows).
    os.system("pause")
缺点:单线程,没有用到异步协程,也没有用线程池实现对小说下载章节数较多时的快速下载优势。之后有空再优化代码,并实现相应的功能。
实现效果:
例如章节是目前是
最新章节为:1936章 灾厄奏鸣 ,我改个数字演示。
不改的话,就没有新章节更新:
改后跑起来,应该是
对应的文件夹里是:
打开后内容是:
Over!!!!!
在pycharm项目路径下打开终端输入:
pip install pyinstaller
cd到项目的.py文件路径下
cd .\study_capture\novel_capture\
执行:
pyinstaller -F .\main.py
项目中用到的知识点:
这里面可以有些在优化程序时被我给去掉了,嘿嘿
请求网页数据
resp = requests.get(url, headers=heards)
python中list与string的转换
data_num = re.findall(r'\d+', data)
data_num = ''.join(data_num)
小说章节数的确认
resp = re.findall(r'<a href =.*?>(.*?)</a>', resp.text)
TXT文本的读取
encoding='utf-8' 是有必要的,不然会报错。
with open("小说更新记录.txt", "r", encoding='utf-8') as f:
data = f.read()
TXT文本的回写
with open("小说更新记录.txt", "w", encoding='utf-8') as f:
f.write(str(resp[-1]))
BS4对HTML进行值的筛选
#表示识别标签
soup = BeautifulSoup(resp.text, 'lxml')
soup.select('#chaptercontent')
取列表元素最后一个
resp[-1]
将列表中的章节数字拿出
data_num = re.findall(r'\d+', data)
python特定位置的字符串截取
soup.text str型
find('下一章') 左边开始第一个索引
rfind('『点此报错') 右边开始第一个索引
mytxt = soup.text[soup.text.find('下一章'):soup.text.rfind('『点此报错')]
字符串的拼接:
novel_save_location = "./novel_downloads/逆天邪神第"+str(download_num-1)+"章.txt"
小说保存时:
1.里面
有空白
,直接用
mytxt = mytxt.strip()
时没有去掉,不知道啥原因。我记得听网课说是:
去掉空格,空白,换行符
,其他好像都去了,最后还剩小说之间一些空白。
解决方式:用 Notepad++ 也没看出那段空白是什么符号,于是直接把它从页面里复制过来用。
mytxt=mytxt.replace(' ', '\n')
感谢观看!!!第一次写,好慢,好菜,回去写作业去了。呜呜呜