Python爬虫获取豆瓣top250优秀电影

具体源码


# -*- coding: utf-8 -*-
#@Time : 2020/6/7 9:53
#@Author : 鱼头
#@File : test.py
#@Software : PyCharm


import bs4 #网页解析,获取数据
import re #正则表达式,进行文字匹配
import urllib.request,urllib.error #制定URL,获取网页数据
import xlwt #进行excel操作
import sqlite3 #进行SQLite数据库操作
from bs4 import BeautifulSoup



def main():
    """Scrape the Douban Top 250 movie list and save it to an .xls file."""
    baseurl = "https://movie.douban.com/top250?start="
    datalist = getData(baseurl)
    savepath = "豆瓣电影Top250.xls"
    # saveDate returns None; the original bound it to an unused variable.
    saveDate(datalist, savepath)

#抓取网页
def getData(baseurl):
    """Fetch all 10 pages of the Douban Top 250 list and parse every movie.

    Args:
        baseurl: URL prefix; the page offset (0, 25, ..., 225) is appended.

    Returns:
        A list of 8-element records, one per movie:
        [detail link, image URL, Chinese title, foreign title, rating,
         number of ratings, one-line quote, related info].
    """
    # Compile every extraction pattern once, outside the loops — the
    # original recompiled all seven patterns for each of the 250 items.
    find_link = re.compile(r'<a href="(.*?)">')
    find_img_src = re.compile(r'<img.*src="(.*?)"', re.S)  # re.S: "." matches newlines
    find_title = re.compile(r'<span class="title">(.*)</span>')
    find_rating = re.compile(r'<span class="rating_num" property="v:average">(.*)</span>')
    find_judge = re.compile(r'<span>(\d*)人评价</span>')
    find_inq = re.compile(r'<span class="inq">(.*)</span>')
    find_bd = re.compile(r'<p class="">(.*?)</p>', re.S)

    datalist = []
    for page in range(10):  # 10 pages x 25 movies = 250 entries
        url = baseurl + str(page * 25)
        html = askURL(url)  # raw page source

        soup = BeautifulSoup(html, "html.parser")
        for item in soup.find_all('div', class_="item"):
            item = str(item)
            data = []
            data.append(find_link.findall(item)[0])
            data.append(find_img_src.findall(item)[0])

            titles = find_title.findall(item)
            if len(titles) == 2:
                data.append(titles[0])                   # Chinese title
                data.append(titles[1].replace("/", ""))  # foreign title, "/" stripped
            else:
                # Only one title present; keep the record width constant.
                data.append(titles[0])
                data.append(' ')

            data.append(find_rating.findall(item)[0])
            data.append(find_judge.findall(item)[0])

            # Not every movie has a one-line quote.
            inq = find_inq.findall(item)
            data.append(inq[0].replace("。", "") if inq else " ")

            bd = find_bd.findall(item)[0]
            bd = re.sub(r'<br(\s+)?/>(\s+)?', " ", bd)  # drop <br/> tags (raw string fixed)
            bd = re.sub('/', " ", bd)
            data.append(bd.strip())

            datalist.append(data)
    return datalist


#得到指定一个URL的网页内容
def askURL(url):
    """Fetch one URL and return its body decoded as UTF-8.

    A browser User-Agent header is sent so the server does not reject the
    default urllib client. On URLError the HTTP code/reason is printed and
    an empty string is returned (best-effort; callers tolerate "").
    """
    head = {  # pretend to be a regular browser
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.138 Safari/537.36"
    }
    request = urllib.request.Request(url, headers=head)
    html = ""
    try:
        response = urllib.request.urlopen(request)
        html = response.read().decode("utf-8")
    except urllib.error.URLError as e:
        if hasattr(e, "code"):
            print(e.code)
        if hasattr(e, "reason"):
            # Bug fix: the original printed the tuple (e, "reason")
            # instead of the error's reason attribute.
            print(e.reason)
    return html


#保存数据
def saveDate(datalist, savepath):
    """Write the scraped movie records to an .xls workbook.

    Args:
        datalist: list of 8-column movie records produced by getData().
        savepath: output filename for the Excel file.
    """
    print("save...")
    book = xlwt.Workbook(encoding="utf-8", style_compression=0)
    sheet = book.add_sheet('豆瓣电影Top250', cell_overwrite_ok=True)
    col = ("电影详情链接", "图片链接", "影片中文名", "影片外国名", "评分", "评分人数", "概括", "相关信息")
    for j, header in enumerate(col):
        sheet.write(0, j, header)
    # Iterate over what was actually scraped instead of a hard-coded 250,
    # so a partial scrape no longer raises IndexError.
    for i, data in enumerate(datalist):
        print("第%d条" % (i + 1))
        for j in range(8):
            sheet.write(i + 1, j, data[j])
    book.save(savepath)






# Run the scraper only when executed as a script, not when imported.
if __name__ == "__main__":
    main()