Python实现网络爬虫、蜘蛛.pdf

资源描述

1、python 中如何提取网页正文啊谢谢import urllib.request url=“http:/ response=urllib.request.urlopen(url) page=response.read() python 提取网页中的文本1. import os,sys,datetime 2. import httplib,urllib, re 3. from sgmllib import SGMLParser 4.5. import types 6.7. class Html2txt(SGMLParser): 8. def reset(self): 9. self.text

2、= 10. self.inbody = True 11. SGMLParser.reset(self) 12. def handle_data(self,text): 13. if self.inbody: 14. self.text += text 15. 16. def start_head(self,text): 17. self.inbody = False 18. def end_head(self): 19. self.inbody = True 20. 21. 22.if _name_ = “_main_“: 23. parser = Html2txt() 24. parser.

3、feed(urllib.urlopen(“http:/“).read() 25. parser.close() 26. print parser.text.strip() python 下载网页import httplib conn=httplib.HTTPConnection(“ “) conn.request(“GET“,“/index.html“) r1=conn.getresponse() print r1.status,r1.reason data=r1.read() print data conn.close 用 python 下载网页，超级简单！from urllib impo

4、rt urlopen webdata = urlopen(“).read() print webdata 深入 python 里面有python 下载网页内容 ,用 python 的 pycurl 模块实现1. 用 python 下载网页内容还是很不错的，之前是使用 urllib 模块实验的，但听说有 pycurl 这个模块，而且比 urllib 好，所以尝试下，废话不说，以下是代码2.3.4. #!/usr/bin/env python 5. # -*- coding: utf-8 -*- 6. import StringIO 7. import pycurl 8.9. def writef

5、ile(fstr,xfilename): f=open(xfilename,w) f.write(fstr) f.close 10.1. html = StringIO.StringIO() 2. c = pycurl.Curl() 3. myurl= http:/ 4.5. c.setopt(pycurl.URL, myurl) 6.7. #写的回调8. c.setopt(pycurl.WRITEFUNCTION, html.write) 9.10.c.setopt(pycurl.FOLLOWLOCATION, 1) 11.12.#最大重定向次数 ,可以预防重定向陷阱13.c.setopt(

6、pycurl.MAXREDIRS, 5) 14.15.#连接超时设置16.c.setopt(pycurl.CONNECTTIMEOUT, 60) 17.c.setopt(pycurl.TIMEOUT, 300) 18.19.#模拟浏览器20.c.setopt(pycurl.USERAGENT, “Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; .NET CLR 1.1.4322)“) 21.22.23.24.#访问 ,阻塞到访问结束25.c.perform() 26.27.#打印出 200(HTTP 状态码，可以不需要 ) 28.

7、print c.getinfo(pycurl.HTTP_CODE) 29.30.#输出网页的内容31.print html.getvalue() 32.#保存成 down.txt 文件33.writefile(html.getvalue(),“down.txt“) python 的 pycurl 模块的安装可以到 http:/ 这里去找 . 不同系统使用不同版本，自己看看总结下， Python 下载网页的几种方法1 fd = urllib2.urlopen(url_link) data = fd.read() 这是最简洁的一种，当然也是 Get 的方法2 通过 GET的方法def GetHtm

8、lSource(url): try: htmSource = req = urllib2.Request(url) fd = urllib2.urlopen(req,“) while 1: data = fd.read(1024) if not len(data): break htmSource += data fd.close() del fd del req htmSource = htmSource.decode(cp936) htmSource = formatStr(htmSource) return htmSource except socket.error, err: str_

9、err = “%s“ % err return “ 3 通过 GET的方法def GetHtmlSource_Get(htmurl): htmSource = “ try: urlx = httplib.urlsplit(htmurl) conn = httplib.HTTPConnection(loc) conn.connect() conn.putrequest(“GET“, htmurl, None) conn.putheader(“Content-Length“, 0) conn.putheader(“Connection“, “close“) conn.endheaders() re

10、s = conn.getresponse() htmSource = res.read() except Exception(), err: trackback.print_exec() conn.close() return htmSource 通过 POST的方法def GetHtmlSource_Post(getString): htmSource = “ try: url = httplib.urlsplit(“http:/:8080“) conn = httplib.HTTPConnection(loc) conn.connect() conn.putrequest(“POST“,

11、“/sipo/zljs/hyjs-jieguo.jsp“) conn.putheader(“Content-Length“, len(getString) conn.putheader(“Content-Type“, “application/x-www-form-urlencoded“) conn.putheader(“Connection“, “ Keep-Alive“) conn.endheaders() conn.send(getString) f = conn.getresponse() if not f: raise socket.error, “timed out“ htmSou

12、rce = f.read() f.close() conn.close() return htmSource except Exception(), err: trackback.print_exec() conn.close() return htmSource 本文来自 CSDN 博客，转载请标明出处：http:/ 2010/04/29/5538065.aspxDjango+python+BeautifulSoup 组合的垂直搜索爬虫使用 python+BeautifulSoup 完成爬虫抓取特定数据的工作，并使用 Django 搭建一个管理平台，用来协调抓取工作

13、。因为自己很喜欢 Django admin 后台，所以这次用这个后台对抓取到的链接进行管理，使我的爬虫可以应对各种后期的需求，比如分时段抓取、定期地对已经抓取的地址重新抓取。数据库用的是 python 自带的 sqlite3 ，所以很方便。这几天正好在做一个电影推荐系统，需要些电影数据。本文的例子是对豆瓣电影抓取特定的数据。第一步：建立 Django 模型。模仿 nutch 的爬虫思路，这里做了简化。每次抓取任务开始时，先从数据库里找到未保存（is_save = False）的链接，放到抓取链表里。你也可以根据自己的需求去过滤链接。python 代码：view plaincopy to clipboa

14、rdprint? 01.class Crawl_URL(models.Model): 02. url = models.URLField( 抓取地址 ,max_length=100, unique=True) 03. weight = models.SmallIntegerField( 抓取深度 ,default = 0)# 抓取深度起始 1 04. is_save = models.BooleanField(是否已保存 ,default= False)# 05. date = models.DateTimeField( 保存时间 ,auto_now_add=True,blank=True,n

15、ull=True) 06. def _unicode_(self): 07. return self.url class Crawl_URL(models.Model): url = models.URLField(抓取地址 ,max_length=100, unique=True) weight = models.SmallIntegerField( 抓取深度 ,default = 0)# 抓取深度起始 1 is_save = models.BooleanField(是否已保存 ,default= False)# date = models.DateTimeField( 保存时间 ,auto

16、_now_add=True,blank=True,null=True) def _unicode_(self): return self.url 然后生成相应的表。还需要一个 admin 管理后台view plaincopy to clipboardprint? 01.class Crawl_URLAdmin(admin.ModelAdmin): 02. list_display = (url,weight,is_save,date,) 03. ordering = (-id,) 04. list_filter = (is_save,weight,date,) 05. fields = (ur

17、l,weight,is_save,) 06.admin.site.register(Crawl_URL, Crawl_URLAdmin) class Crawl_URLAdmin(admin.ModelAdmin): list_display = (url,weight,is_save,date,) ordering = (-id,) list_filter = (is_save,weight,date,) fields = (url,weight,is_save,) admin.site.register(Crawl_URL, Crawl_URLAdmin) 第二步，编写爬虫代码爬虫是单线程

18、，并且每次抓取后都有相应的暂停，因为豆瓣网会禁止一定强度抓取的爬虫。爬虫根据深度来控制，每次都是先生成链接，然后抓取，并解析出更多的链接，最后将抓取过的链接标记为 is_save=True，并把新链接存入数据库中。每次一个深度抓取完后都需要花比较长的时间把链接导入数据库，因为需要判断链接是否已存入数据库。这个只对满足正则表达式 http:/ 的地址进行数据解析，并且直接忽略掉不是电影模块的链接。第一次抓取需要在后台加个链接，比如 http:/ ，这是个排行榜的页面，电影比较受欢迎。python 代码：#这段代码不能格式化发# coding=UTF-8 import urllib2 from Beautif

19、ulSoup import * from urlparse import urljoin from pysqlite2 import dbapi2 as sqlite from movie.models import * from django.contrib.auth.models import User from time import sleep image_path = C:/Users/soul/djcodetest/picture/ user = User.objects.get(id=1) def crawl(depth=10): for i in range(1,depth):

20、 print 开始抓取 for %d.%i pages = Crawl_URL.objects.filter(is_save=False) newurls= for crawl_page in pages: page = crawl_page.url try: c=urllib2.urlopen(page) except: continue try: #解析元数据和 url soup=BeautifulSoup(c.read() #解析电影页面if re.search(rhttp:/ read_html(soup) #解析出有效的链接，放入 newurls links=soup(a) for

21、link in links: if href in dict(link.attrs): url=urljoin(page,linkhref) if url.find(“)!=-1: continue if len(url) 60: continue url=url.split(#)0 # removie location portion if re.search(rhttp:/, url): newurlsurl= crawl_page.weight + 1 # 连接有效。存入字典中try: print add url : except: pass except Exception.args:

22、 try: print “Could not parse : %s“ % args except: pass #newurls 存入数据库 is_save=False weight=i crawl_page.is_save = True crawl_page.save() #休眠 2.5 秒sleep(2.5) save_url(newurls) #保存 url ，放到数据库里def save_url(newurls): for (url,weight) in newurls.items(): url = Crawl_URL(url=url,weight=weight) try: url.sa

23、ve() except: try: print url 重复 : except: pass return True 第三步，用 BeautifulSoup 解析页面抽取出电影标题，图片，剧情介绍，主演，标签，地区。关于 BeautifulSoup 的使用可以看这里 BeautifulSoup 技术文档view plaincopy to clipboardprint?01.# 抓取数据 02.def read_html(soup): 03. #解析出标题 04. html_title = soup.html.head.title.string 05. title = html_title:le

24、n(html_title)-5 06. #解析出电影介绍 07. try: 08. intro = soup.find(span,attrs=class:all hidden).text 09. except: 10. try: 11. node = soup.find(div,attrs=class:blank20).previousSibling 12. intro = node.contents0+node.contents2 13. except: 14. try: 15. contents = soup.find(div,attrs=class:blank20).previousSi

25、bling.previousSibling.text 16. intro = contents:len(contents)-22 17. except: 18. intro = u 暂无 19. 20. # 取得图片 21. html_image = soup(a,href=pile( 22. data = urllib2.urlopen(html_image).read() 23. image = 201003/+html_imagehtml_image.rfind(/)+1: 24. f = file(image_path+image,wb) 25. f.write(data) 2

26、6. f.close() 27. 28. 29. #解析出地区 30. try: 31. soup_obmo = soup.find(div,attrs=class:obmo).findAll(span) 32. html_area = soup_obmo0.nextSibling.split(/) 33. area = html_area0.lstrip() 34. except: 35. area = 36. 37. #time = soup_obmo1.nextSibling.split( )1 38. #time = time.strptime(html_time,%Y-%m-%d)

27、39. 40. #生成电影对象 41. new_movie = Movie(title=title,intro=intro,area=area,version= 暂无,upload_user=user,image=image) 42. new_movie.save() 43. try: 44. actors = soup.find(div,attrs=id:info).findAll(span)5.nextSibling.nextSibling.string.split( )0 45. actors_list = Actor.objects.filter(name = actors) 46. if len(actors_list) =

展开阅读全文