python - Web spider not returning all results -

- February 15, 2011

if here not 2 different spiders automatically add results mysql database. i've added if , elif statement , work miss out results, there 52 rows in bath table, there 41. bristol used have 154 141. cannot think why results not same.

pipelines.py

import sys import mysqldb import mysqldb.cursors import hashlib scrapy.exceptions import dropitem scrapy.http import request  class testpipeline(object):  def __init__(self):     self.conn = mysqldb.connect(         user='user',         passwd='pwd',         db='db',         host='host',         charset='utf8',         use_unicode=true         )     self.cursor = self.conn.cursor()  def process_item(self, item, spider):     try:         if 'bristolqualification' in item:             self.cursor.execute("""insert bristol(bristolcountry, bristolqualification) values ('{0}', '{1}')""".format(item['bristolcountry'], "".join([s.encode('utf8') s in item['bristolqualification']])))         elif 'bathqualification' in item:             self.cursor.execute("""insert bath(bathcountry, bathqualification) values ('{0}', '{1}')""".format(item['bathcountry'], "".join([s.encode('utf8') s in item['bathqualification']])))         self.conn.commit()         return item      except mysqldb.error e:         print "error %d: %s" % (e.args[0], e.args[1])

items.py

from scrapy.item import item, field  class qualificationitem(item): bristolqualification = field() bristolcountry = field() bathqualification = field() bathcountry = field()

bristol.py

from scrapy.spider import basespider project.items import qualificationitem scrapy.selector import htmlxpathselector scrapy.http.request import request urlparse import urljoin  user_agent = 'mozilla/5.0 (x11; linux x86_64; rv:27.0) gecko/20100101 firefox/27.0'  class recursivespider(basespider): name = 'bristol' allowed_domains = ['bristol.ac.uk/'] start_urls = ['http://www.bristol.ac.uk/international/countries/']  def parse(self, response):     hxs = htmlxpathselector(response)      xpath = '//*[@id="all-countries"]/li/ul/li/a/@href'     a_of_the_link = '//*[@id="all-countries"]/li/ul/li/a/text()'     text, link in zip(hxs.select(a_of_the_link).extract(), hxs.select(xpath).extract()):         yield request(urljoin(response.url, link),         meta={'a_of_the_link': text},         headers={'user-agent': user_agent},         callback=self.parse_linkpage,         dont_filter=true)  def parse_linkpage(self, response):     hxs = htmlxpathselector(response)     item = qualificationitem()     xpath = """             //h2[normalize-space(.)="entry requirements undergraduate courses"]              /following-sibling::p[not(preceding-sibling::h2[normalize-space(.)!="entry requirements undergraduate courses"])]             """     item['bristolqualification'] = hxs.select(xpath).extract()[1:]     item['bristolcountry'] = response.meta['a_of_the_link']     return item

if here user did try fix problem unsuccessful haven't heard him since.

'these errors caused unescaped single quotes in bristolqualification item field (and presumably bath spider suffers same problem) causing havoc (such d'etudes in snippet below):'

this thought problem was.

can see problem at?

Search This Blog

HR

python - Web spider not returning all results -

pipelines.py

items.py

bristol.py

Comments

Post a Comment

Popular posts from this blog

c++ - How to add Crypto++ library to Qt project -

jQuery Mobile app not scrolling in Firefox -

how to receive file in java(servlet/jsp) -