
python 关键词 抓取网页_使用Scrapy抓取网站,只抓取包含关键字的页面_张昕宇梁红

未知 2782


我想我要么 i) 删除 `__init__` 函数中对 LinkExtractor 的调用,要么 ii) 只在 `__init__` 内调用 LinkExtractor,但规则要基于我访问该页面时发现的内容,而不是 URL 的某个属性。i) 我做不到,因为 CrawlSpider 类需要一条规则;ii) 我也做不到,因为 LinkExtractor 不像旧的 SgmlLinkExtractor(它似乎已经被弃用)那样提供 `process_links` 选项。我刚接触 Scrapy,所以想知道我唯一的选择是不是自己编写一个 LinkExtractor?

from scrapy.crawler import Crawler

import os
import sys

from scrapy import log, signals, Spider, Item, Field
from scrapy.contrib.linkextractors import LinkExtractor
from scrapy.contrib.loader import ItemLoader
from scrapy.contrib.loader.processor import Join, MapCompose, TakeFirst
from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.settings import Settings
from twisted.internet import reactor

# define an item class

class GenItem(Item):
    """Scraped item that carries the URL of a single extracted link."""

    # URL of the link, stored under the 'url' key.
    url = Field()

# define a spider

class GenSpider(CrawlSpider):
    """Crawl one domain and record the pages whose body contains a keyword.

    Configuration is read from the command line and MUST be given in this
    exact order:

    * ``sys.argv[1]`` -- start URL
    * ``sys.argv[2]`` -- allowed domain
    * ``sys.argv[3]`` -- output folder

    For every crawled page whose body contains ``keyword``, the page body is
    appended to ``pages.txt`` and the page URL plus the URLs of its
    in-domain links are appended to ``hotlinks.txt`` (both inside the output
    folder). One ``GenItem`` is emitted per extracted link.

    NOTE(review): the crawl rule uses ``follow=True``, so links are followed
    from every in-domain page, not only from pages that contain the keyword.
    The items yielded by ``parse_links`` are not requests -- confirm this is
    the intended crawl scope.
    """

    name = "genspider3"

    # Text a page body must contain for the page to be recorded.
    keyword = 'keyword'

    def __init__(self):
        """Read start URL, allowed domain and output folder from ``sys.argv``.

        Raises:
            IndexError: if fewer than three command-line arguments are given.
        """
        self.start_urls = [sys.argv[1]]
        self.allowed_domains = [sys.argv[2]]
        self.folder = sys.argv[3]

        # os.path.join makes the folder argument work with or without a
        # trailing path separator (plain '+' glued the names together).
        self.writefile1 = os.path.join(self.folder, 'hotlinks.txt')
        self.writefile2 = os.path.join(self.folder, 'pages.txt')

        # Build the extractor once; it is shared by the crawl rule and by
        # parse_links instead of being re-created for every response.
        self._link_extractor = LinkExtractor(allow_domains=(sys.argv[2],))
        self.rules = [
            Rule(self._link_extractor, follow=True, callback='parse_links'),
        ]

        # Called after self.rules is assigned, matching the original order
        # (presumably the base initialiser reads the rules -- confirm).
        super(GenSpider, self).__init__()

    def parse_start_url(self, response):
        """Process the start page with the same logic as any other page.

        Returns:
            The iterable of ``GenItem`` objects produced by ``parse_links``.
        """
        return self.parse_links(response)

    def parse_links(self, response):
        """Record a page that contains the keyword and emit its links.

        If ``self.keyword`` is not found in the response body, nothing is
        written and nothing is yielded.

        Args:
            response: the downloaded page passed in by the crawler.

        Yields:
            GenItem: one item per in-domain link found on the page.
        """
        # NOTE(review): this substring test assumes response.body and
        # self.keyword are the same string type (true for the Python 2-era
        # Scrapy API imported by this file) -- confirm before porting.
        page_html = response.body
        if self.keyword not in page_html:
            return

        with open(self.writefile2, 'a') as pages_file:
            pages_file.write(page_html + '\n')

        links = self._link_extractor.extract_links(response)

        # Open the link log once per page rather than once per link.
        with open(self.writefile1, 'a') as links_file:
            links_file.write(response.url + '\n')
            for link in links:
                links_file.write(link.url + '\n')

        # Yield every link; a 'return' inside the loop would stop after the
        # first one.
        for link in links:
            yield GenItem(url=link.url)

# callback fired when the spider is closed

def callback(spider, reason):
    """Handle the ``spider_closed`` signal: log crawl stats, stop the reactor.

    Args:
        spider: the spider that has just been closed.
        reason: the close reason supplied with the signal.
    """
    stats = spider.crawler.stats.get_stats()
    # Log the collected stats so the lookup is not silently discarded.
    log.msg("Spider closed (%s): %s" % (reason, stats))

    # Stop the reactor; without this the blocking reactor.run() call in the
    # driver script never returns.
    reactor.stop()


# Driver script: builds the settings, crawler and spider, wires up the
# shutdown callback, then runs the crawl until the spider closes.

# Instantiate settings and provide a custom configuration.
settings = Settings()
# Optional: cap the crawl depth with settings.set('DEPTH_LIMIT', 2).
settings.set('DOWNLOAD_DELAY', 0.25)

# Instantiate a crawler, passing in the settings.
crawler = Crawler(settings)

# Instantiate the spider (reads its configuration from sys.argv).
spider = GenSpider()

# Stop the reactor once the spider has closed.
crawler.signals.connect(callback, signal=signals.spider_closed)

# Configure and start the crawler.
crawler.configure()
crawler.crawl(spider)
crawler.start()

# Start logging.
log.start()

# Start the reactor (blocks execution until callback() stops it).
reactor.run()

标签: #python抓网页关键词