import requests
from bs4 import BeautifulSoup
import csv
import jieba
import wordcloud
base_url = "https://www.baidu.com/s?rtt=1&bsst=1&cl=2&tn=news&ie=utf-8&word=%E4%B8%A4%E9%82%BB+%22%E5%9F%BA%E5%B1%82%22&x_bfe_rqs=03E80&x_bfe_tjscore=0.100000&tngroupname=organic_news&newVideo=12&goods_entry_switch=1&rsv_dl=news_b_pn&pn="
urls = [base_url + str(i) for i in range(0, 91, 10)]
There are 96 news items on grassroots governance and the "two neighbors" (两邻) topic, spread across 10 result pages; the range call above builds one search-result URL per page (pn = 0, 10, ..., 90).
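A quick illustrative check (not part of the scraper itself) of how the pn offsets map to result pages:

# Illustrative only: each pn step of 10 corresponds to one results page.
for page, pn in enumerate(range(0, 91, 10), start=1):
    print("page {}: {}{}".format(page, base_url, pn))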
Fetch the content of all the result pages.
Since the data volume is small, the results are stored directly as a list of dictionaries.
results = []
for url in urls:
    res = requests.get(url, headers={'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.100 Safari/537.36'})
    soup = BeautifulSoup(res.text, 'lxml')
    for i in soup.find_all("h3"):
        # Each result headline is an <h3> whose <a> carries the title in aria-label.
        result = {
            "title": i.a["aria-label"],
            "url": i.a["href"]
        }
        results.append(result)
        print(result)
Determine how many news items come from each website.
websites = []
for i in results:
    # Collect the distinct domains; the netloc is the third '/'-separated field.
    if i["url"].split('/')[2] not in websites:
        websites.append(i["url"].split('/')[2])
def web_count(urls, keyword):
    # Count how many collected items contain `keyword` in their URL.
    m = 0
    for j in urls:
        if keyword in j["url"]:
            m += 1
    return m
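A hypothetical usage of the two pieces above, printing how many results each domain contributed:

# Hypothetical usage sketch: tally the per-domain result counts.
for site in websites:
    print(site, web_count(results, site))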
def get_website_info(url):
    result = []
    p = []
    try:
        res = requests.get(url, headers={'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.100 Safari/537.36'})
        soup = BeautifulSoup(res.text, 'lxml')
    except:
        return ""  # empty string so the caller's `c != ""` check also covers failures
    # Pick the paragraph container matching each site's page layout.
    if url.split('/')[2] == "baijiahao.baidu.com":  # 47 items
        p = soup.find_all("p")
    if url.split('/')[2] == "new.qq.com":  # 12 items
        p = soup.find_all(class_="one-p")
    if url.split('/')[2] == "finance.sina.com.cn":  # 6 items
        res.encoding = "utf-8"  # Sina pages need the encoding set explicitly
        soup = BeautifulSoup(res.text, 'lxml')
        p = soup.find_all("div", class_="article")
    if url.split('/')[2] == "news.sohu.com":  # 4 items
        p = soup.find_all("article", class_="article")
    if url.split('/')[2] == "www.sohu.com":  # 4 items
        p = soup.find_all("article", class_="article")
    for i in p:
        if i.text != "":
            result.append(i.text)
    return "".join(result)
countt = 0
with open("webinfo.csv", "w", newline="", encoding="utf-8-sig") as f:
    writer = csv.writer(f)
    writer.writerow(["title", "url", "content"])
    for i in results:
        c = get_website_info(i["url"])
        if c != "":
            w = {
                "title": i["title"],
                "url": i["url"],
                "content": c
            }
            writer.writerow([w["title"], w["url"], w["content"]])
            countt += 1
            if countt % 10 == 0:
                print("Finished {} items".format(countt))
# Read webinfo.csv back in, skipping the header row.
p_list = []
with open("webinfo.csv", "r", encoding="utf-8-sig") as f:
    reader = csv.reader(f)
    for i in reader:
        if i[0] == "title":
            continue
        p_list.append(i[2])
# Segment each article into words.
p_list_2 = []
for i in p_list:
    # cut_all=False is jieba's accurate (precise) mode, which keeps
    # meaningful words such as nouns intact.
    seg_list = jieba.cut(i, cut_all=False)
    p_list_2.append(" ".join(seg_list))
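For reference, a small illustration of jieba's two segmentation modes (the sample sentence here is made up):

# Illustration with a made-up sentence: accurate vs. full segmentation.
sample = "社区基层治理离不开邻里互助"
print("/".join(jieba.cut(sample, cut_all=False)))  # accurate mode
print("/".join(jieba.cut(sample, cut_all=True)))   # full mode: every possible word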
# Load the stopword list and filter stopwords out of every segmented article.
with open("baidu_stopwords.txt", "r", encoding="utf-8-sig") as stopwords:
    stop_words = set(i.strip() for i in stopwords.readlines())
p_list_2 = [" ".join(w for w in doc.split() if w not in stop_words)
            for doc in p_list_2]
from wordcloud import WordCloud, ImageColorGenerator
import numpy as np
import matplotlib.pyplot as plt
# The third-party library PIL (Pillow) handles image processing; it often comes
# preinstalled, but install it yourself if it is missing.
from PIL import Image
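With these imports in place, a minimal sketch of the word-cloud step, assuming the segmented, stopword-filtered text from above is still in p_list_2 and that simhei.ttf (an assumed font path here) points to a Chinese-capable font:

# Minimal sketch, not the final code: render a word cloud from the articles.
# font_path is an assumption; without a Chinese-capable font, characters
# render as empty boxes.
text = " ".join(p_list_2)
wc = WordCloud(font_path="simhei.ttf",   # assumed font path
               background_color="white",
               width=800, height=600).generate(text)
plt.imshow(wc, interpolation="bilinear")
plt.axis("off")
plt.show()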