
A Grassroots Governance Crawler and Word Cloud

My teacher wanted a word cloud of news about grassroots governance (基层治理), built from articles surfaced by Baidu News.

URL handling

import requests
from bs4 import BeautifulSoup
import csv
import jieba, wordcloud

# Baidu News search for 两邻 + "基层"; the pn offset of each result page is appended to this prefix
base_url = "https://www.baidu.com/s?rtt=1&bsst=1&cl=2&tn=news&ie=utf-8&word=%E4%B8%A4%E9%82%BB+%22%E5%9F%BA%E5%B1%82%22&x_bfe_rqs=03E80&x_bfe_tjscore=0.100000&tngroupname=organic_news&newVideo=12&goods_entry_switch=1&rsv_dl=news_b_pn&pn="
urls = [base_url + str(i) for i in range(0, 91, 10)]

The search for grassroots governance and 两邻 returns 96 items spread over 10 result pages, so range(0, 91, 10) generates the pn offsets that make up the individual search result page URLs.
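For readability, the same ten page URLs can also be assembled with urlencode instead of string concatenation. This is only a sketch that keeps the core query parameters from base_url (the search keywords, the news vertical, the pn offset); the extra tracking parameters are dropped on the assumption that Baidu does not require them.

from urllib.parse import urlencode

params = {
    "rtt": 1, "bsst": 1, "cl": 2, "tn": "news", "ie": "utf-8",
    "word": '两邻 "基层"',      # the keywords that are URL-encoded in base_url
    "rsv_dl": "news_b_pn",
}
# pn = 0, 10, ..., 90 -> 10 result pages of 10 hits each
pages = ["https://www.baidu.com/s?" + urlencode({**params, "pn": i})
         for i in range(0, 91, 10)]
print(len(pages))   # 10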

Fetching every result page

Since the data volume is small, the results are kept in memory as a list of dictionaries.

results = []
for url in urls:
    res = requests.get(url, headers={'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.100 Safari/537.36'})
    soup = BeautifulSoup(res.text, 'lxml')
    # Each headline sits in an <h3> whose <a> carries the full title in aria-label
    for i in soup.find_all("h3"):
        result = {
            "title": i.a["aria-label"],
            "url": i.a["href"]
        }
        results.append(result)
        print(result)
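The loop assumes every <h3> on a result page contains an <a> with an aria-label attribute; if Baidu varies the markup, the attribute lookup raises. A more defensive variant of the inner loop, as an assumption about possible markup variations rather than something observed in this run:

for h3 in soup.find_all("h3"):
    a = h3.find("a")
    if a is None or not a.get("href"):      # skip headings without a usable link
        continue
    results.append({
        "title": a.get("aria-label") or a.get_text(strip=True),
        "url": a["href"],
    })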

Counting news items per website

# Collect the distinct domains that appear in the result URLs
websites = []
for i in results:
    if i["url"].split('/')[2] not in websites:
        websites.append(i["url"].split('/')[2])

# Count how many collected URLs belong to a given domain
def web_count(urls, keyword):
    m = 0
    for j in urls:
        if keyword in j["url"]:
            m += 1
    return m

for i in websites:
    print(i, web_count(results, i))

The four best-represented sites are Baijiahao (47), Tencent News (12), Sohu News (8) and Sina News (6), which together account for 73 of the items.
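The same tally can be produced in one pass with collections.Counter; this sketch simply reproduces the counts printed above.

from collections import Counter

site_counts = Counter(i["url"].split('/')[2] for i in results)
for site, n in site_counts.most_common():
    print(site, n)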

Writing an extraction rule for each website

def get_website_info(url):
    result = []
    p = []
    try:
        res = requests.get(url, headers={'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.100 Safari/537.36'})
        soup = BeautifulSoup(res.text, 'lxml')
    except Exception:
        return ""                             # keep the return type consistent with the success path
    if url.split('/')[2] == "baijiahao.baidu.com":   # 47
        p = soup.find_all("p")
    if url.split('/')[2] == "new.qq.com":            # 12
        p = soup.find_all(class_="one-p")
    if url.split('/')[2] == "finance.sina.com.cn":   # 6
        res.encoding = "utf-8"                       # Sina pages need the encoding forced before re-parsing
        soup = BeautifulSoup(res.text, 'lxml')
        p = soup.find_all("div", class_="article")
    if url.split('/')[2] == "news.sohu.com":         # 4
        p = soup.find_all("article", class_="article")
    if url.split('/')[2] == "www.sohu.com":          # 4
        p = soup.find_all("article", class_="article")
    for i in p:
        if i.text != "":
            result.append(i.text)
    Aresult = "".join(result)
    return Aresult

Since the content we want is plain text and none of these sites has obvious anti-crawling measures, the only real work is locating where the article body sits in each page. Sina pages have an encoding problem, so the response encoding has to be overridden before re-parsing.
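Instead of hard-coding utf-8 for Sina, the charset can also be guessed from the page bytes. A minimal sketch with a hypothetical helper fetch_soup, assuming requests' detection is good enough here; apparent_encoding is slower, so it is only used when the server did not declare a charset (requests then falls back to ISO-8859-1).

def fetch_soup(url):
    # hypothetical helper: fetch a page and re-parse it with a detected charset
    res = requests.get(url, headers={'User-Agent': 'Mozilla/5.0'})
    if res.encoding is None or res.encoding.lower() == 'iso-8859-1':
        res.encoding = res.apparent_encoding   # charset detection on the raw bytes
    return BeautifulSoup(res.text, 'lxml')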

Extraction rules are only written for the top-ranking websites; URLs from any other site fall through the rules and return an empty string, which filters them out downstream.
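The if-chain in get_website_info can also be expressed as a lookup table, which turns adding a new site into a one-line change. This is a sketch only: extract_text is a hypothetical helper, the selectors are copied from the function above, and the Sina re-encoding step is left out.

RULES = {
    "baijiahao.baidu.com": {"name": "p"},
    "new.qq.com":          {"class_": "one-p"},
    "finance.sina.com.cn": {"name": "div", "class_": "article"},
    "news.sohu.com":       {"name": "article", "class_": "article"},
    "www.sohu.com":        {"name": "article", "class_": "article"},
}

def extract_text(url, soup):
    rule = RULES.get(url.split('/')[2])
    if rule is None:                 # unknown site: filtered out downstream
        return ""
    return "".join(tag.text for tag in soup.find_all(**rule) if tag.text)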

Writing to a CSV file

countt = 0
with open("webinfo.csv", "w", newline="", encoding="utf-8-sig") as f:
    writer = csv.writer(f)
    writer.writerow(["title", "url", "content"])
    for i in results:
        c = get_website_info(i["url"])
        if c != "":
            w = {
                "title": i["title"],
                "url": i["url"],
                "content": c
            }
            writer.writerow([w["title"], w["url"], w["content"]])
        countt += 1
        if countt % 10 == 0:
            print("Processed {} items".format(countt))
# the with block already closes the file, so no explicit f.close() is needed
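Since each row is already a dict, the same write step can also use csv.DictWriter; a sketch with the same column names.

with open("webinfo.csv", "w", newline="", encoding="utf-8-sig") as f:
    writer = csv.DictWriter(f, fieldnames=["title", "url", "content"])
    writer.writeheader()
    for i in results:
        c = get_website_info(i["url"])
        if c != "":
            writer.writerow({"title": i["title"], "url": i["url"], "content": c})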

Word segmentation

# Read webinfo.csv back and keep only the content column
p_list = []
with open("webinfo.csv", "r", encoding="utf-8-sig") as f:
    reader = csv.reader(f)
    for i in reader:
        if i[0] == "title":      # skip the header row
            continue
        p_list.append(i[2])

# Segment every article with jieba
p_list_2 = []
for i in p_list:
    seg_list = jieba.cut(i, cut_all=False)   # precise mode: the most accurate segmentation
    p_list_2.append(" ".join(seg_list))

# Read the stopword list and drop stopword tokens from the segmented text
with open("baidu_stopwords.txt", "r", encoding="utf-8-sig") as stopwords:
    stop_words = [i.strip() for i in stopwords.readlines()]
p_list_2 = [" ".join(w for w in article.split() if w not in stop_words)
            for article in p_list_2]

data_1 = " ".join(p_list_2)

Drawing the word cloud

from wordcloud import WordCloud, ImageColorGenerator
import numpy as np
import matplotlib.pyplot as plt
# PIL (Pillow) is a third-party image library; install it if it is not already present
from PIL import Image

font = r'C:\Windows\Fonts\SIMLI.ttf'          # a Chinese font is required to render the words
py_mask = np.array(Image.open('LN6.png'))     # mask image that shapes the cloud
# Take the colour scheme from the mask image
img_colors = ImageColorGenerator(py_mask)
# Generate the word cloud from the segmented text
wc1 = WordCloud(mask=py_mask, font_path=font, stopwords=stop_words,
                background_color="white", width=400, height=400).generate(data_1)
# Recolour the words with the mask image's colours
wc1.recolor(color_func=img_colors)
# Show and save the result
plt.imshow(wc1, interpolation='bilinear')
plt.axis('off')
plt.show()
wc1.to_file('test.png')
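To sanity-check what the cloud is built from, the normalized word frequencies that WordCloud actually used are available in its words_ attribute after generate(); a quick dump of the top 20:

top_words = sorted(wc1.words_.items(), key=lambda kv: kv[1], reverse=True)[:20]
for word, weight in top_words:
    print("{}\t{:.3f}".format(word, weight))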