Python Sina Weibo Album Crawler

Sep 14 2013 · Published under Python

It's basically the same as the previous crawler, except that this one requires logging in.

Because of the damn CAPTCHA, there's no web version for now.

[code language="python"]

# -*- coding: utf-8 -*-
import urllib
import urllib2
import re
import cookielib

# Set up cookie handling so the login session persists across requests
cj = cookielib.LWPCookieJar()
cookie_support = urllib2.HTTPCookieProcessor(cj)
opener = urllib2.build_opener(cookie_support, urllib2.HTTPHandler)
urllib2.install_opener(opener)

# Fetch a page's HTML (GET by default, POST when data is given)
def GetHtml(url, data=None):
    req = urllib2.Request(url, data)
    req.add_header('User-Agent', 'Mozilla/5.0 (X11; Linux i686) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/29.0.1547.65')
    return urllib2.urlopen(req).read()

# Read the album URL to crawl and convert it to the equivalent weibo.cn URL
def GetUrl():
    url = raw_input('Enter the album URL: ')
    if re.match(r'http://photo\.weibo\.com/(\d+)/albums/detail/album_id/(\d+)', url):
        Matched = re.search(r'http://photo\.weibo\.com/(\d+)/albums/detail/album_id/(\d+)', url).groups()
        return 'http://weibo.cn/album/' + Matched[1] + '000000' + Matched[0]
    elif re.match(r'http://photo\.weibo\.com/(\d+)/talbum', url):
        return 'http://weibo.cn/album/albummblog?fuid=' + re.search(r'http://photo\.weibo\.com/(\d+)/talbum', url).groups()[0]
    else:
        return None
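# Example of the conversion above (hypothetical IDs, just for illustration):
#   http://photo.weibo.com/1234567890/albums/detail/album_id/3456789012345678
#   -> http://weibo.cn/album/34567890123456780000001234567890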

# Extract the images on a page (really the alphanumeric token from each
# thumbnail URL, whatever that string is properly called)
def GetUrlList(url):
    HTML = GetHtml(url)
    Pattern = r'<img src="http://ww\d\.sinaimg\.cn/square/(\w+?)\.\w{3}"'
    return re.findall(Pattern, HTML, re.S)
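# For example (hypothetical token), a thumbnail such as
#   <img src="http://ww2.sinaimg.cn/square/a1b2c3d4e5.jpg"
# yields 'a1b2c3d4e5', which the list comprehension at the bottom turns into
#   http://ww4.sinaimg.cn/large/a1b2c3d4e5.jpg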

# Get the number of pages in the album (from a hidden form field;
# single-page albums have no such field, so default to 1)
def GetPageCount(url):
    pattern = r'type="hidden" value="(\d+?)"'
    PageNumber = re.findall(pattern, GetHtml(url))
    if PageNumber:
        return PageNumber[0]
    return 1

# Log in, then crawl every page of the album
def GetThingsDone():
    url = GetUrl()
    UrlList, num = [], 1
    # The link text '登录' ("log in") only appears when we are not logged in yet
    if re.match("(.*)<a href='(.*?)'>登录</a>", GetHtml('http://weibo.cn/pub/')):
        # ---------------------- simulate the login flow ----------------------
        # -------------- fill in your own username and password below ---------
        HTML = GetHtml('http://login.weibo.cn/login/')
        action_url = re.findall('<form action="(.+?)" method="post">', HTML)[0]
        passwd_name = re.findall('<input type="password" name="(.+?)" size="30" value=""/>', HTML)[0]
        vk_value = re.findall('<input type="hidden" name="vk" value="(.+?)" />', HTML)[0]
        Data = urllib.urlencode({
            'mobile': 'yourname',
            passwd_name: 'yourpassword',
            'vk': vk_value,
            'backURL': 'http://weibo.cn/',
            'backTitle': '手机新浪网',  # literal form value the server expects; leave as-is
            'remember': 'on',
            'tryCount': '',
            'submit': '登录',  # literal form value the server expects; leave as-is
        })
        GetHtml('http://login.weibo.cn/login/' + action_url, Data)
        # ----------------------------------------------------------------------

    PageCount = int(GetPageCount(url))
    while num <= PageCount:
        UrlList += GetUrlList(url + '&page=' + str(num))
        num += 1
    return ['http://ww4.sinaimg.cn/large/' + i + '.jpg' for i in UrlList]

Images = GetThingsDone()
for image in Images:
    print image

[/code]
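
The script only prints the large-image URLs. To actually save the files, a few extra lines at the end of the script are enough; here is a minimal sketch (Python 2, reusing the GetHtml helper and the Images list from above; naming each file after the last segment of its URL is just my own choice):

[code language="python"]

# Append after the script above: download each image through the same
# cookie-aware opener and name the file after the last URL segment
for image in Images:
    filename = image.split('/')[-1]  # e.g. 'a1b2c3d4e5.jpg' (hypothetical)
    with open(filename, 'wb') as f:
        f.write(GetHtml(image))
    print 'saved', filename

[/code]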
