Novel Website Crawlers
1. A simple novel-crawler template
import requests
from bs4 import BeautifulSoup
import re
import os

# This script is not universal; analyze each target site individually.
root = ''    # base URL of the article's index page
art_id = ''  # path fragment identifying the article (site-specific)
fname = './txts/' + art_id[5:-1] + '.txt'
encode = 'gbk'

def GetAllSection(root, id):
    """Collect the chapter URLs listed on the article's index page."""
    r = requests.get(root + id)
    r.encoding = encode
    bs = BeautifulSoup(r.text, 'html.parser')
    url_a = bs.find_all('a')
    urls = []
    for u in url_a:
        href = u.get('href')
        # Keep only links whose path starts with the article id.
        if href and re.match(id, href):
            urls.append(root + href)
    del urls[0]  # drop the first match (on this site it is not a chapter link)
    return urls

def SaveArticle(root, id, fname):
    i = 1
    urls = GetAllSection(root, id)
    for url in urls:
        r = requests.get(url)
        r.encoding = encode
        bs = BeautifulSoup(r.text, 'html.parser')
        # On this site the chapter body is the second <p> tag; adjust per site.
        text = bs.find_all('p')[1].text
        text = text.replace('\xa0', ' ')
        text = text.replace('\ufffd', ' ')
        text = text.replace('\n\r\n', '\n')
        with open(fname, 'a', encoding=encode) as file:
            if i == 1:
                # Write the <h1> heading from the first page once, up front.
                file.write(bs.h1.text + '\n')
                i = 0
            file.write(text + '\n')

if __name__ == "__main__":
    os.makedirs('./txts', exist_ok=True)  # make sure the output directory exists
    SaveArticle(root, art_id, fname)
    # print(fname)
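To show how the placeholders might be filled in, here is a hypothetical example; the site, root, and art_id values below are invented for illustration and do not come from any real site:

# Hypothetical fill-in, assuming a site whose chapter links all start
# with the article path; none of these values are from the original post.
root = 'https://www.example-novels.com'  # hypothetical site root
art_id = '/txt/9527/'                    # hypothetical article path
# art_id[5:-1] == '9527', so the output file becomes './txts/9527.txt'
fname = './txts/' + art_id[5:-1] + '.txt'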
2. A Dao De Jing crawler
I wanted to build an Android app for reading classical Chinese texts, so I had to crawl the source material from the web myself. After analyzing the site's URL scheme and the layout of the article pages, the script was easy to write.
The text is saved to the current directory.
import requests
from bs4 import BeautifulSoup

encode = 'utf-8'
cha_num = 81  # the Dao De Jing has 81 chapters

def SaveArticle(fname, urls):
    for url in urls:
        print('-', end='')  # simple progress indicator, one dash per chapter
        r = requests.get(url)
        r.encoding = encode
        bs = BeautifulSoup(r.text, 'html.parser')
        # On this site the chapter text sits in <div class="grap"> blocks.
        text = bs.find_all('div', {"class": "grap"})
        with open(fname, 'a', encoding=encode) as file:
            for t in text:
                # Normalize non-breaking/ideographic spaces and replacement chars.
                te = t.text.replace('\xa0', ' ').replace(
                    '\u3000', ' ').replace('\ufffd', ' ')
                file.write(te + '\n')

if __name__ == "__main__":
    urls = ['https://www.5000yan.com/' + str(i) + '.html'
            for i in range(1, cha_num + 1)]
    SaveArticle('daodejing.txt', urls)
    # print(urls)
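Both scripts fire requests in a tight loop. A small, optional refinement is to pause between requests and retry on transient failures; here is a minimal sketch (the polite_get helper and its parameters are my own invention, not part of the original scripts) that could replace the bare requests.get calls above:

import time
import requests

def polite_get(url, delay=1.0, retries=3):
    """Fetch a URL with a fixed delay before each attempt and simple retries."""
    for attempt in range(retries):
        time.sleep(delay)  # be gentle with the server
        try:
            r = requests.get(url, timeout=10)
            r.raise_for_status()
            return r
        except requests.RequestException:
            if attempt == retries - 1:
                raise  # give up after the last attempt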