Crawling historical weather data for cities across China with Python


  • Crawls daily weather data for cities across China, from 2011 through 2020
  • Fetches pages with requests + BeautifulSoup
  • Crawls with multiple threads
  • Crawls city by city, then saves the results as xls files grouped by province
  • When building the dictionary that maps city names to pinyin, some cities share the same pinyin (see the sketch after this list)
  • The site itself gets some cities' pinyin wrong, so those cities' data cannot be fetched
  • Builds a province-to-city dictionary and archives the output into one folder per province
  • Data source: the Tianqihoubao (天气后报) weather site
  • Complete code and data: project address
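
The pinyin-collision problem mentioned above is easy to reproduce when converting city names with a pinyin library. Below is a minimal sketch, assuming the pypinyin package is installed; the city list and the duplicate check are illustrative, and the dictionary shape (pinyin as key, Chinese name as value) matches what the crawler code further down expects. It is not the project's actual fix:

from pypinyin import lazy_pinyin

names = ['北京', '苏州', '宿州']  # 苏州 and 宿州 both romanize to "suzhou"

# Build pinyin -> city name, the shape the crawler's city_dict uses below.
city_dict = {}
for name in names:
    py = ''.join(lazy_pinyin(name))
    if py in city_dict:
        # Collision: the site's actual slug must be checked by hand and patched in.
        print('duplicate pinyin:', py, '->', city_dict[py], 'and', name)
    city_dict[py] = name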
Analyzing the resource URL

http://www.tianqihoubao.com/lishi/beijing/month/201101.html

From this it is easy to see the pattern: http://www.tianqihoubao.com/lishi/{city pinyin}/month/{YYYYMM}.html
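
Before crawling at scale, the template can be spot-checked. A minimal sketch, assuming requests is installed; "beijing" and "201101" are just the example city and month from above:

import requests

# Spot-check: the template should return HTTP 200 for a known city and month.
url = "http://www.tianqihoubao.com/lishi/{}/month/{}.html".format("beijing", "201101")
r = requests.get(url, timeout=30)
print(r.status_code)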

Main code

import threading
import time

import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup

# Module-level shared state, defined elsewhere in the full project:
#   gLock             - threading.Lock guarding city_dict
#   city_dict         - dict mapping city pinyin -> city name
#   target_year_list  - years to crawl, e.g. ['2011', ..., '2020']
#   target_month_list - months to crawl, e.g. ['01', ..., '12']
#   start             - crawl start time, used for the elapsed-time report


class Crawler(threading.Thread):

    def run(self):
        print("%s is running" % threading.current_thread())
        while True:
            # Lock before touching the shared dict
            gLock.acquire()
            if len(city_dict) == 0:
                gLock.release()
                break  # nothing left to crawl, let this thread finish
            item = city_dict.popitem()  # (city pinyin, city name)
            gLock.release()
            data_ = list()
            urls = self.get_urls(item[0])
            for url in urls:
                try:
                    # Merge lists: collect every month's weather for this city into data_
                    data_.extend(self.get_data(url))
                except Exception as e:
                    print(e)
            self.saveTocsv(data_, item[1])  # save this city's data as xls
            if len(city_dict) == 0:
                end = time.time()
                print("Elapsed time:", end - start)

    # Build the history-weather URLs for one city
    def get_urls(self, city_pinyin):
        urls = []
        for year in target_year_list:
            for month in target_month_list:
                date = year + month
                # e.g. "http://www.tianqihoubao.com/lishi/beijing/month/201812.html"
                urls.append("http://www.tianqihoubao.com/lishi/{}/month/{}.html".format(city_pinyin, date))
        return urls

    def get_soup(self, url):
        try:
            r = requests.get(url, timeout=30)
            r.raise_for_status()  # raise HTTPError if the request failed
            soup = BeautifulSoup(r.text, "html.parser")
            return soup
        except Exception as e:
            print(e)

    # Save the weather data to an xls file (the method name predates the switch to xls)
    def saveTocsv(self, data, city):
        fileName = './weather_data/' + city + '天气.xls'
        # Columns: date, weather condition, temperature, wind force/direction
        result_weather = pd.DataFrame(data, columns=['日期', '天气状况', '气温', '风力风向'])
        # Note: writing .xls needs the xlwt engine on older pandas; newer pandas only writes .xlsx
        result_weather.to_excel(fileName, index=False)
        print('Save all weather success!')
        print('remain {}'.format(len(city_dict)))

    def get_data(self, url):
        print(url)
        try:
            soup = self.get_soup(url)
            all_weather = soup.find('div', class_="wdetail").find('table').find_all("tr")
            data = list()
            for tr in all_weather[1:]:  # skip the table header row
                td_li = tr.find_all("td")
                for td in td_li:
                    s = td.get_text()
                    data.append("".join(s.split()))  # strip all whitespace inside a cell
            res = np.array(data).reshape(-1, 4)  # one row per day: date, weather, temp, wind
            return res
        except Exception as e:
            print(e)
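
The class above depends on the module-level names described in its header comment. Below is a minimal sketch of that setup and of launching the workers, assuming it sits in the same module as Crawler and that city_dict was built as in the earlier pypinyin sketch; the worker count of 8 is an arbitrary choice, not the project's setting:

import os
import time
import threading

gLock = threading.Lock()                                        # guards city_dict
target_year_list = [str(y) for y in range(2011, 2021)]          # '2011' .. '2020'
target_month_list = ['{:02d}'.format(m) for m in range(1, 13)]  # '01' .. '12'
os.makedirs('./weather_data', exist_ok=True)                    # saveTocsv writes here

start = time.time()
threads = [Crawler() for _ in range(8)]  # arbitrary worker count
for t in threads:
    t.start()
for t in threads:
    t.join()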