利用python实现新浪微博爬虫_python新浪微博爬虫
本文后面的解决动态加载的程序依然有效
重新编辑了一次,出了点儿问题
第一个模块,模拟登录sina微博,创建weiboLogin.py文件,输入以下代码:
-
#! /usr/bin/env python
-
# -*- coding: utf-8 -*-
-
-
import sys
-
import urllib
-
import urllib2
-
import cookielib
-
import base64
-
import re
-
import json
-
import hashlib
-
-
class weiboLogin:
    """Log in to Sina Weibo through its SSO endpoints (ssologin.js v1.3.18).

    Defining this class installs a process-wide urllib2 opener backed by a
    cookie jar, so cookies set during login() are sent with every later
    urllib2.urlopen() call made by the program.
    """

    # Cookie-aware opener, installed globally as a side effect of the class
    # definition.
    cj = cookielib.LWPCookieJar()
    cookie_support = urllib2.HTTPCookieProcessor(cj)
    opener = urllib2.build_opener(cookie_support, urllib2.HTTPHandler)
    urllib2.install_opener(opener)

    # Template of the login form. login() fills 'servertime', 'nonce', 'su'
    # and 'sp' on a private copy; this dict itself is never modified.
    postdata = {
        'entry': 'weibo',
        'gateway': '1',
        'from': '',
        'savestate': '7',
        'userticket': '1',
        'ssosimplelogin': '1',
        'vsnf': '1',
        'vsnval': '',
        'su': '',
        'service': 'miniblog',
        'servertime': '',
        'nonce': '',
        'pwencode': 'wsse',
        'sp': '',
        'encoding': 'UTF-8',
        'url': 'http://weibo.com/ajaxlogin.php?framelogin=1&callback=parent.sinaSSOController.feedBackUrlCallBack',
        'returntype': 'META',
    }

    @staticmethod
    def _to_bytes(value):
        """Return value as a byte string, UTF-8 encoding it when it is text."""
        if isinstance(value, bytes):
            return value
        return value.encode('utf-8')

    def get_servertime(self):
        """Fetch the prelogin challenge from the SSO server.

        Returns:
            A (servertime, nonce) tuple, with servertime converted to a
            string, or None when the response cannot be parsed.

        Raises:
            Whatever urllib2.urlopen raises when the request itself fails.
        """
        url = 'http://login.sina.com.cn/sso/prelogin.php?entry=weibo&callback=sinaSSOController.preloginCallBack&su=dW5kZWZpbmVk&client=ssologin.js(v1.3.18)&_=1329806375939'
        data = urllib2.urlopen(url).read()
        # The request names a callback, so the JSON payload arrives wrapped
        # in "callback( ... )"; capture what is between the parentheses.
        p = re.compile(r'\((.*)\)')
        try:
            json_data = p.search(data).group(1)
            data = json.loads(json_data)
            servertime = str(data['servertime'])
            nonce = data['nonce']
            return servertime, nonce
        except (AttributeError, KeyError, TypeError, ValueError):
            # AttributeError: no match; ValueError: invalid JSON;
            # KeyError/TypeError: JSON without the expected fields.
            print('Get severtime error!')
            return None

    def get_pwd(self, pwd, servertime, nonce):
        """Return the hashed password sent in the 'sp' form field.

        Computes sha1(sha1(sha1(pwd)) + servertime + nonce) over hex digests.

        Args:
            pwd: Plain-text password (byte or unicode string).
            servertime: Server time string from get_servertime().
            nonce: Nonce string from get_servertime().

        Returns:
            The 40-character hexadecimal digest.
        """
        pwd1 = hashlib.sha1(self._to_bytes(pwd)).hexdigest()
        pwd2 = hashlib.sha1(self._to_bytes(pwd1)).hexdigest()
        salted = pwd2 + servertime + nonce
        return hashlib.sha1(self._to_bytes(salted)).hexdigest()

    def get_user(self, username):
        """Return the user name encoded for the 'su' form field.

        The name is URL-quoted and then base64-encoded. b64encode is used
        instead of encodestring so that long names do not get line breaks
        inserted into the encoded value.
        """
        quoted = urllib.quote(self._to_bytes(username))
        return base64.b64encode(self._to_bytes(quoted))

    def login(self, username, pwd):
        """Log in and leave the session cookies in the installed opener.

        Prints 'Login success!' or an error message; returns None either way.

        Args:
            username: Weibo account name.
            pwd: Plain-text password.
        """
        url = 'http://login.sina.com.cn/sso/login.php?client=ssologin.js(v1.3.18)'
        try:
            challenge = self.get_servertime()
        except Exception:
            # Best effort, as before: a failed prelogin request is reported,
            # not raised.
            challenge = None
        if challenge is None:
            print('get servertime error!')
            return
        servertime, nonce = challenge

        # Fill in a copy so the class-level template stays a dict and
        # login() can be called more than once.
        form = dict(weiboLogin.postdata)
        form['servertime'] = servertime
        form['nonce'] = nonce
        form['su'] = self.get_user(username)
        form['sp'] = self.get_pwd(pwd, servertime, nonce)

        headers = {'User-Agent': 'Mozilla/5.0 (X11; Linux i686; rv:8.0) Gecko/20100101 Firefox/8.0 Chrome/20.0.1132.57 Safari/536.11'}
        req = urllib2.Request(
            url=url,
            data=urllib.urlencode(form),
            headers=headers
        )
        text = urllib2.urlopen(req).read()

        # The response redirects with location.replace('<url>'); requesting
        # that URL completes the login.
        found = re.search(r"location\.replace\('(.*?)'\)", text)
        if found is None:
            print('Login error!')
            return
        try:
            urllib2.urlopen(found.group(1))
        except Exception:
            print('Login error!')
            return
        print('Login success!')
然后创建main.py文件,输入以下代码:
-
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# Entry script: log in to Sina Weibo with the credentials set below.

import weiboLogin
import urllib
import urllib2

# Replace both placeholders with your own account details.
username = '你的微博用户名'
pwd = '你的微博密码'

WBLogin = weiboLogin.weiboLogin()
WBLogin.login(username, pwd)
注意:若登录失败,可能是你的账号在登录的时候需要输入验证码!你可以先在网页上登录你的账号试试看;在账号设置里面可以设置某些地区登录时不需要输入验证码。
接下来,考虑实现抓取微博的内容。
此时遇到一个困难,当抓取指定URL的微博时,初始显示只有15条。后面的是延迟显示的(ajax里面叫lazy load?)。也就是说,当滚动条第一次拖到最下面的时候,会显示第二部分,再拖到最下面,会显示第三部分。此时一个页面的微博才是完整的。所以,要获取一个微博页面的全部微博,需要访问这个页面三次。创建getWeiboPage.py文件,相应代码如下:
-
#!/usr/bin/env python
-
# -*- coding: utf-8 -*-
-
-
import urllib
-
import urllib2
-
import sys
-
import time
-
-
# Python 2 only: make UTF-8 the default codec for implicit str/unicode
# conversions. reload(sys) comes first because sys.setdefaultencoding is
# removed from the module at interpreter start-up.
# NOTE(review): this changes behaviour for the whole interpreter; encoding
# explicitly where text is written out would make it unnecessary.
reload(sys)
sys.setdefaultencoding('utf-8')
-
-
class getWeiboPage:
    """Download one page of a user's Weibo timeline in three requests.

    Each request's raw response is written to ./output/text<N> and a copy
    with its backslash escapes decoded to ./output/result<N>.
    """

    # Default query parameters. Every request works on a copy, so the
    # defaults survive from one call to the next.
    body = {
        '__rnd': '',
        '_k': '',
        '_t': '0',
        'count': '50',
        'end_id': '',
        'max_id': '',
        'page': 1,
        'pagebar': '',
        'pre_page': '0',
        'uid': '',
    }
    uid_list = []
    # Encoding used by writefile() for unicode content.
    charset = 'utf8'

    def get_msg(self, uid):
        """Fetch all three parts of the current page for the user uid."""
        getWeiboPage.body['uid'] = uid
        url = self.get_url(uid)
        self.get_firstpage(url)
        self.get_secondpage(url)
        self.get_thirdpage(url)

    def _request_params(self, **overrides):
        """Return a copy of the default parameters with overrides applied."""
        params = dict(getWeiboPage.body)
        params.update(overrides)
        return params

    @staticmethod
    def _build_url(url, params):
        """Append params to url as a query string with the right separator."""
        query = urllib.urlencode(params)
        if not query:
            return url
        if '?' not in url:
            separator = '?'
        elif url.endswith(('?', '&')):
            separator = ''
        else:
            # url already has a query string (get_url() always adds one).
            separator = '&'
        return url + separator + query

    @staticmethod
    def _decode_escapes(text):
        """Decode backslash escapes such as \\uXXXX in text to unicode.

        NOTE(security): this replaces eval("u'''" + text + "'''"), which
        executed the HTTP response as Python source. The unicode_escape
        codec decodes the escapes without evaluating anything.
        """
        return text.decode('unicode_escape')

    def _fetch_part(self, url, params, index):
        """Request url with params and store the response as part index."""
        result = urllib2.urlopen(urllib2.Request(self._build_url(url, params)))
        text = result.read()
        self.writefile('./output/text%d' % index, text)
        self.writefile('./output/result%d' % index, self._decode_escapes(text))

    def get_firstpage(self, url):
        """Fetch the first part of the page (default count and pagebar)."""
        page = getWeiboPage.body['page']
        self._fetch_part(url, self._request_params(pre_page=page - 1), 1)

    def get_secondpage(self, url):
        """Fetch the second part of the page (count 15, pagebar 0)."""
        page = getWeiboPage.body['page']
        params = self._request_params(count='15', pagebar='0', pre_page=page)
        self._fetch_part(url, params, 2)

    def get_thirdpage(self, url):
        """Fetch the third part of the page (count 15, pagebar 1)."""
        page = getWeiboPage.body['page']
        params = self._request_params(count='15', pagebar='1', pre_page=page)
        self._fetch_part(url, params, 3)

    def get_url(self, uid):
        """Return the profile URL for the user id uid (a string)."""
        url = 'http://weibo.com/' + uid + '?from=otherprofile&wvr=3.6&loc=tagweibo'
        return url

    def get_uid(self, filename):
        """Read user ids from filename, one per line, into uid_list.

        Surrounding whitespace (including the line break) is stripped and
        blank lines are skipped. Each id is printed as it is read.
        """
        with open(filename) as fread:
            for line in fread:
                uid = line.strip()
                if not uid:
                    continue
                getWeiboPage.uid_list.append(uid)
                print(uid)
                # One-second pause per id, kept from the original code.
                time.sleep(1)

    def writefile(self, filename, content):
        """Write content to filename, creating its directory if needed.

        Unicode content is encoded with the class charset; byte strings are
        written unchanged. The file is opened in binary mode so the bytes
        reach disk exactly as given.
        """
        import os

        directory = os.path.dirname(filename)
        if directory and not os.path.isdir(directory):
            os.makedirs(directory)
        if not isinstance(content, bytes):
            content = content.encode(getWeiboPage.charset)
        with open(filename, 'wb') as fw:
            fw.write(content)
在刚刚的main.py中加入相应内容,完整内容为:
-
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# Entry script: log in to Sina Weibo, then download the three parts of one
# profile page.

import weiboLogin
# The page-fetching module is getWeiboPage.py and its class is getWeiboPage;
# the name getWeiboMsg used here previously does not exist.
import getWeiboPage
import urllib
import urllib2

# Replace both placeholders with your own account details.
username = '你的微博用户名'
pwd = '你的微博密码'

WBLogin = weiboLogin.weiboLogin()
WBLogin.login(username, pwd)

WBmsg = getWeiboPage.getWeiboPage()
url = 'http://weibo.com/1624087025?from=otherprofile&wvr=3.6&loc=tagweibo'

WBmsg.get_firstpage(url)
WBmsg.get_secondpage(url)
WBmsg.get_thirdpage(url)
CDA数据分析师考试相关入口一览(建议收藏):
▷ 想报名CDA认证考试,点击>>> “CDA报名” 了解CDA考试详情;
▷ 想加入CDA考试题库,点击>>> “CDA题库” 了解CDA考试详情;
▷ 想学习CDA考试教材,点击>>> “CDA教材” 了解CDA考试详情;
▷ 想查询CDA考试成绩,点击>>> “CDA成绩” 了解CDA考试详情;
▷ 想了解CDA考试含金量,点击>>> “CDA含金量” 了解CDA考试详情;
▷ 想获取CDA考试时间/费用/条件/大纲/通过率,点击 >>>“CDA考试官网” 了解CDA考试详情;