看了点Python,写个爬虫练手。
[code language="python"]
# -*- coding: UTF-8 -*-
import urllib2
import re
# Get the number of pages in a thread
def GetPageCount(url):
    """Return the page count advertised by the thread page at *url*.

    The page is downloaded and searched for a ``max-page="N"`` attribute.

    Args:
        url: Address of the thread page to download.

    Returns:
        The integer N of the first ``max-page="N"`` attribute found, or 1
        when the page carries no such attribute.
    """
    response = urllib2.urlopen(url)
    try:
        source = response.read()
    finally:
        # Close explicitly instead of waiting for garbage collection.
        response.close()
    # \d+ (not \d): a single-digit pattern cannot match max-page="12", which
    # made every thread with ten or more pages look like a one-page thread.
    found = re.search(r'max-page="(\d+)"', source)
    if found:
        return int(found.group(1))
    return 1
# Download one page and return its HTML re-encoded as UTF-8
def _FetchPageHtml(page_url):
    response = urllib2.urlopen(page_url)
    try:
        raw = response.read()
    finally:
        # Close explicitly instead of waiting for garbage collection.
        response.close()
    # The bytes are treated as GBK; 'replace' keeps a single undecodable
    # byte from aborting the whole crawl.
    return raw.decode('gbk', 'replace').encode('UTF-8')


# Scrape the content of a thread
def Analyze(url, tag):
    """Scrape the posts of a Tieba thread (``see_lz=1`` view).

    Side effect: stores the number of pages crawled in the module-level
    global ``PageCount`` (only when *url* is accepted).

    Args:
        url: Thread address; must start with ``http://tieba.baidu.com/p/``
            followed by a thread id. Anything after the id is discarded.
        tag: ``'all'`` returns the full post bodies; any other value
            returns only the image URLs found in the posts.

    Returns:
        None when *url* is empty or not a thread address. Otherwise, for
        ``tag == 'all'``, one string with every post body joined by
        ``'<br />'``; for any other tag, a list of image URLs.
    """
    global PageCount
    # Test for emptiness first: re.match() raises TypeError on None.
    if not url:
        return None
    # Dots are escaped so they match only a literal '.', and \w+ rejects an
    # address with no thread id.
    thread = re.match(r'http://tieba\.baidu\.com/p/\w+', url)
    if not thread:
        return None
    url = thread.group(0) + '?see_lz=1'
    PageCount = GetPageCount(url)

    # Both modes need the <cc>...</cc> post bodies of every page.
    posts = []
    for page_no in range(1, PageCount + 1):
        html = _FetchPageHtml(url + '&pn=' + str(page_no))
        posts.extend(re.findall('<cc>(.*?)</cc>', html, re.S))

    # With tag 'all', return the full text of the posts
    if tag == 'all':
        # One join over all posts, so the separator also appears between
        # the last post of a page and the first post of the next page.
        return '<br />'.join(posts)

    # Otherwise return only the images in the posts
    sign_pattern = re.compile(r'http://.*/sign=\w*?/(\w*?)\.jpg')
    imgurl = []
    for post in posts:
        for src in re.findall('<img.*?src="(.*?)"', post):
            # Rewrite to the address of the original-size picture
            signed = sign_pattern.match(src)
            if signed:
                src = ('http://imgsrc.baidu.com/forum/pic/item/'
                       + signed.group(1) + '.jpg')
            # Skip emoticons and other editor images
            if src.startswith('http://static.tieba.baidu.com/tb/editor/images/'):
                continue
            imgurl.append(src)
    return imgurl
# Used by the web version to get around image hotlink protection
def ChangeUrl(text):
    """Route every hiphotos.baidu.com ``.jpg`` URL in *text* through a proxy.

    Each matching URL is prefixed with
    ``http://www.beihaiw.com/pic.php?url=``.

    Args:
        text: HTML/text to rewrite; may be empty or None.

    Returns:
        The rewritten text, or None when *text* is empty or None.
    """
    if not text:
        return None
    # Dots are escaped so '.jpg' no longer matches e.g. 'xjpg', and the URL
    # body excludes whitespace, quotes and angle brackets so a match cannot
    # run from one attribute into a later, unrelated '.jpg'.
    return re.sub(
        r'http://hiphotos\.baidu\.com/[^\s"\'<>]*?\.jpg',
        lambda found: 'http://www.beihaiw.com/pic.php?url=' + found.group(0),
        text
    )
[/code]
用Tornado框架写个web版本,架在GAE上。
http://nerdsproject01.appspot.com/tieba
如果上面的地址访问不了,可以试试 http://proj.a-nerd.info/ ,它通过 POST 方式获取 GAE 的结果。
[code language="python"]
# -*- coding: utf-8 -*-
import os
import tornado.wsgi
import tornado.web
import Tieba
# Serve index.html when the site root is visited directly
class MainHandler(tornado.web.RequestHandler):
    """Handler for GET requests on the root URL."""

    def get(self):
        """Render the ``index.html`` template."""
        self.render('index.html')
# Return the image URLs
class Img(tornado.web.RequestHandler):
    """POST handler that lists the image URLs found in a thread."""

    def post(self):
        """Scrape the thread named by the ``url`` argument and list its images.

        Writes a summary followed by one image URL per line, or an error
        message when the address is empty or yields no images.
        """
        url = self.get_argument('url')
        urllist = Tieba.Analyze(url, 'img') if url else None
        # Report the error on every failure path (empty address, rejected
        # address, no images) so the client never gets an empty response.
        if not urllist:
            self.write('<center><p>喵的,请输入正确的贴吧地址(╯‵□′)╯︵┴─┴</p></center>')
            return
        self.write('<p><center>共爬了'+str(Tieba.PageCount)+'页<br>')
        self.write('找到了'+str(len(urllist))+'张图片,累死了喵</center></p>')
        # The URLs come from scraped pages, so escape them before putting
        # them into the HTML response.
        self.write('<br>'.join(tornado.escape.xhtml_escape(i) for i in urllist))
# Return HTML code for the images, for reposting
class ImgHtml(tornado.web.RequestHandler):
    """POST handler that returns ``<img>`` markup for a thread's images."""

    def post(self):
        """Scrape the thread named by the ``url`` argument.

        Writes a summary and a textarea holding one ``<img>`` tag per
        image, or an error message when the address is empty or yields no
        images.
        """
        url = self.get_argument('url')
        urllist = Tieba.Analyze(url, 'img') if url else None
        # Report the error on every failure path (empty address, rejected
        # address, no images) so the client never gets an empty response.
        if not urllist:
            self.write('<center><p>喵的,请输入正确的贴吧地址(╯‵□′)╯︵┴─┴</p></center>')
            return
        self.write('<p><center>共爬了'+str(Tieba.PageCount)+'页<br>')
        self.write('找到了'+str(len(urllist))+'张图片,累死了喵</center></p>')
        self.write('<center><textarea rows="15" cols="111">')
        for i in urllist:
            # Escaped so the textarea displays the markup verbatim and
            # scraped text cannot close the textarea element.
            self.write(tornado.escape.xhtml_escape('<img src="'+i+'"><br>\n'))
        self.write("</textarea></center>")
# Return all posts
class All(tornado.web.RequestHandler):
    """POST handler that renders the full post bodies of a thread."""

    def post(self):
        """Scrape the thread named by the ``url`` argument and render it.

        Writes a summary followed by the post bodies (with image URLs
        passed through ``Tieba.ChangeUrl``), or an error message when the
        address is empty or yields no content.
        """
        url = self.get_argument('url')
        Source = Tieba.ChangeUrl(Tieba.Analyze(url, 'all')) if url else None
        # Report the error on every failure path (empty address, rejected
        # address, no content) so the client never gets an empty response.
        if not Source:
            self.write('<center><p>喵的,请输入正确的贴吧地址(╯‵□′)╯︵┴─┴</p></center>')
            return
        self.write('<p><center>共爬了'+str(Tieba.PageCount)+'页<br>')
        self.write('找到了好多字,累死了喵</center></p>')
        # NOTE(review): the scraped HTML is emitted unescaped on purpose so
        # the posts render; it is third-party markup and is not sanitized.
        self.write(Source)
# Return the HTML code of the posts, for saving and reposting
class AllHtml(tornado.web.RequestHandler):
    """POST handler that returns a thread's post bodies as HTML source."""

    def post(self):
        """Scrape the thread named by the ``url`` argument.

        Writes a summary and a textarea holding the HTML source of the
        posts (with image URLs passed through ``Tieba.ChangeUrl``), or an
        error message when the address is empty or yields no content.
        """
        url = self.get_argument('url')
        Source = Tieba.ChangeUrl(Tieba.Analyze(url, 'all')) if url else None
        # Report the error on every failure path (empty address, rejected
        # address, no content) so the client never gets an empty response.
        if not Source:
            self.write('<center><p>喵的,请输入正确的贴吧地址(╯‵□′)╯︵┴─┴</p></center>')
            return
        self.write('<p><center>共爬了'+str(Tieba.PageCount)+'页<br>')
        self.write('找到了好多字,累死了喵</center></p>')
        self.write("<center><textarea rows='15' cols='111'>")
        # Escaped so the textarea shows the source verbatim and a scraped
        # '</textarea>' cannot close the element early.
        self.write(tornado.escape.xhtml_escape(Source))
        self.write("</textarea></center>")
# Application settings; templates are looked up in the 'template' directory
# next to this file.
# NOTE(review): debug is switched on here; confirm that is intended for a
# publicly reachable deployment.
settings = dict(
    template_path=os.path.join(os.path.dirname(__file__), 'template'),
    debug=True,
    gzip=True,
)

# WSGI application object mapping each URL path to its handler class.
app = tornado.wsgi.WSGIApplication(
    [
        ('/', MainHandler),
        ('/img', Img),
        ('/imghtml', ImgHtml),
        ('/all', All),
        ('/allhtml', AllHtml),
    ],
    **settings
)
[/code]
本文链接地址: Python练手:贴吧爬虫
写这东西之前,要有很好的前端知识吧
不是必要的
不过可以做得非常漂亮