Python 动态网页信息爬取
使用爬虫爬取天气信息
实现了分页爬取,并将数据写入 CSV 文件
#https://tianqi.2345.com/Pc/GetHistory?areaInfo%5BareaId%5D=57516&areaInfo%5BareaType%5D=2&date%5Byear%5D=2018&date%5Bmonth%5D=3
import requests
import json
from bs4 import BeautifulSoup
import csv
def craw_json_html(url):
    """Fetch one month of weather history and hand the rows to ``loadinfo_to_csv``.

    The response body is parsed as JSON and its ``"data"`` entry is parsed as
    HTML. Every ``<tr>`` of the ``<table class="history-table">`` element is
    flattened into the list of its non-empty text lines; the first row is
    dropped, and the first cell of each remaining row is cut at its first
    space before the rows are written out.

    This is a best-effort step: any failure is reported on stdout and the
    month is skipped, so one bad request does not abort a multi-month crawl.

    Args:
        url: Full URL of the history endpoint for a single month.

    Returns:
        None.
    """
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/94.0.4606.71 Safari/537.36'}

    try:
        # A timeout keeps a stalled connection from hanging the crawl forever.
        response = requests.get(url=url, headers=headers, timeout=10)
        print(response.status_code)
        # Fail early on 4xx/5xx instead of trying to parse an error page.
        response.raise_for_status()
        response.encoding = 'utf-8'
        html = json.loads(response.text)["data"]
        soup = BeautifulSoup(html, 'html.parser')
    except (requests.RequestException, ValueError, KeyError, TypeError) as exc:
        print("请求或解析失败,已跳过: {} ({!r})".format(url, exc))
        return

    table = soup.find('table', class_='history-table')
    if table is None:
        # find() returns None when the element is absent from the fragment.
        print("未找到历史天气表格,已跳过: {}".format(url))
        return

    days = []
    # [1:] drops the first table row, as the original code did.
    for tr in table.find_all('tr')[1:]:
        cells = [line for line in str(tr.text).split('\n') if line]
        if not cells:
            # A row without any text cannot be indexed below; ignore it.
            continue
        # Keep only the part of the first cell before the first space.
        cells[0] = cells[0].split(' ')[0]
        days.append(cells)

    if not days:
        print("表格中没有数据行,已跳过: {}".format(url))
        return

    try:
        # Write the collected rows to the CSV file.
        loadinfo_to_csv(days)
    except (OSError, IndexError) as exc:
        print("写入 CSV 失败: {} ({!r})".format(url, exc))
def index_all(year=2018, months=range(1, 4), area_id=57516):
    """Crawl the weather history for several months, one request per month.

    Called without arguments this requests months 1, 2 and 3 of 2018 for
    area 57516, i.e. exactly the URLs the hard-coded version produced.

    Args:
        year: Value placed in the ``date[year]`` query parameter.
        months: Iterable of values placed in ``date[month]``, one request
            per value.
        area_id: Value placed in the ``areaInfo[areaId]`` query parameter.

    Returns:
        None. Each month is processed by ``craw_json_html``.
    """
    # %5B / %5D are the URL-encoded square brackets of the query keys.
    url_template = (
        'https://tianqi.2345.com/Pc/GetHistory'
        '?areaInfo%5BareaId%5D={area_id}'
        '&areaInfo%5BareaType%5D=2'
        '&date%5Byear%5D={year}'
        '&date%5Bmonth%5D={month}'
    )
    for month in months:
        craw_json_html(url_template.format(area_id=area_id, year=year, month=month))
def loadinfo_to_csv(infor):
    """Write the weather rows of one month to a CSV file in the working directory.

    The file is named ``"<prefix>月的天气数据.csv"`` where ``<prefix>`` is the
    first seven characters of the first cell of the first row. A fixed
    six-column header is written first, followed by one line per row.

    Rows are normalised to the header width: extra cells are dropped and
    missing cells are written as empty strings, so a short row no longer
    aborts the export halfway through the file.

    Args:
        infor: Sequence of rows, each a sequence of cell strings whose first
            cell is the date.

    Returns:
        None. Nothing is written when ``infor`` or its first row is empty.
    """
    columns = ["日期", "最高温", "最低温", "天气", "风力风向", "空气质量"]
    if not infor or not infor[0]:
        # Without a first date there is no way to derive the file name.
        return

    file_name = infor[0][0][:7]
    width = len(columns)
    with open("{}月的天气数据.csv".format(file_name), "w", encoding="utf-8", newline="") as f:
        csv_writer = csv.writer(f)
        csv_writer.writerow(columns)
        for row in infor:
            cells = list(row[:width])
            # Pad short rows so every line has the same number of columns.
            cells.extend([""] * (width - len(cells)))
            csv_writer.writerow(cells)
    print("{}月的天气数据写入成功".format(file_name))
if __name__ == "__main__":
    # Script entry point: crawl every configured month.
    # To fetch a single month instead, call craw_json_html() directly with
    # one GetHistory URL.
    index_all()
运行结果示例展示
Skychang22: 求数据集大佬2010435481@qq.com,已经点赞加关注
Staprefer᭄: 求数据集大佬3012529047@qq.com谢谢
超越自我31: 请问数据集用的是哪个
m0_74970777: 兄弟问题解决了吗
l01190119: 作者的识别类型数量num_classes在哪设置的? 这个模型是默认只能训练不超过两种类别吗? 超过两种会报 RuntimeError: CUDA error: device-side assert triggered block: [0,0,0], thread: [11,0,0] Assertion `input_val >= zero && input_val <= one` failed 的错误