python - Web spider not returning all results -


if here not 2 different spiders automatically add results mysql database. i've added if , elif statement , work miss out results, there 52 rows in bath table, there 41. bristol used have 154 141. cannot think why results not same.

pipelines.py

import sys import mysqldb import mysqldb.cursors import hashlib scrapy.exceptions import dropitem scrapy.http import request  class testpipeline(object):  def __init__(self):     self.conn = mysqldb.connect(         user='user',         passwd='pwd',         db='db',         host='host',         charset='utf8',         use_unicode=true         )     self.cursor = self.conn.cursor()  def process_item(self, item, spider):     try:         if 'bristolqualification' in item:             self.cursor.execute("""insert bristol(bristolcountry, bristolqualification) values ('{0}', '{1}')""".format(item['bristolcountry'], "".join([s.encode('utf8') s in item['bristolqualification']])))         elif 'bathqualification' in item:             self.cursor.execute("""insert bath(bathcountry, bathqualification) values ('{0}', '{1}')""".format(item['bathcountry'], "".join([s.encode('utf8') s in item['bathqualification']])))         self.conn.commit()         return item      except mysqldb.error e:         print "error %d: %s" % (e.args[0], e.args[1]) 

items.py

from scrapy.item import item, field  class qualificationitem(item): bristolqualification = field() bristolcountry = field() bathqualification = field() bathcountry = field() 

bristol.py

from scrapy.spider import basespider project.items import qualificationitem scrapy.selector import htmlxpathselector scrapy.http.request import request urlparse import urljoin  user_agent = 'mozilla/5.0 (x11; linux x86_64; rv:27.0) gecko/20100101 firefox/27.0'  class recursivespider(basespider): name = 'bristol' allowed_domains = ['bristol.ac.uk/'] start_urls = ['http://www.bristol.ac.uk/international/countries/']  def parse(self, response):     hxs = htmlxpathselector(response)      xpath = '//*[@id="all-countries"]/li/ul/li/a/@href'     a_of_the_link = '//*[@id="all-countries"]/li/ul/li/a/text()'     text, link in zip(hxs.select(a_of_the_link).extract(), hxs.select(xpath).extract()):         yield request(urljoin(response.url, link),         meta={'a_of_the_link': text},         headers={'user-agent': user_agent},         callback=self.parse_linkpage,         dont_filter=true)  def parse_linkpage(self, response):     hxs = htmlxpathselector(response)     item = qualificationitem()     xpath = """             //h2[normalize-space(.)="entry requirements undergraduate courses"]              /following-sibling::p[not(preceding-sibling::h2[normalize-space(.)!="entry requirements undergraduate courses"])]             """     item['bristolqualification'] = hxs.select(xpath).extract()[1:]     item['bristolcountry'] = response.meta['a_of_the_link']     return item 

if here user did try fix problem unsuccessful haven't heard him since.

'these errors caused unescaped single quotes in bristolqualification item field (and presumably bath spider suffers same problem) causing havoc (such d'etudes in snippet below):'

this thought problem was.

can see problem at?


Comments

Popular posts from this blog

c++ - How to add Crypto++ library to Qt project -

jQuery Mobile app not scrolling in Firefox -

how to receive file in java(servlet/jsp) -