python - Web spider not returning all results -
if here not 2 different spiders automatically add results mysql database. i've added if , elif statement , work miss out results, there 52 rows in bath table, there 41. bristol used have 154 141. cannot think why results not same.
pipelines.py
import sys import mysqldb import mysqldb.cursors import hashlib scrapy.exceptions import dropitem scrapy.http import request class testpipeline(object): def __init__(self): self.conn = mysqldb.connect( user='user', passwd='pwd', db='db', host='host', charset='utf8', use_unicode=true ) self.cursor = self.conn.cursor() def process_item(self, item, spider): try: if 'bristolqualification' in item: self.cursor.execute("""insert bristol(bristolcountry, bristolqualification) values ('{0}', '{1}')""".format(item['bristolcountry'], "".join([s.encode('utf8') s in item['bristolqualification']]))) elif 'bathqualification' in item: self.cursor.execute("""insert bath(bathcountry, bathqualification) values ('{0}', '{1}')""".format(item['bathcountry'], "".join([s.encode('utf8') s in item['bathqualification']]))) self.conn.commit() return item except mysqldb.error e: print "error %d: %s" % (e.args[0], e.args[1])
items.py
from scrapy.item import item, field class qualificationitem(item): bristolqualification = field() bristolcountry = field() bathqualification = field() bathcountry = field()
bristol.py
from scrapy.spider import basespider project.items import qualificationitem scrapy.selector import htmlxpathselector scrapy.http.request import request urlparse import urljoin user_agent = 'mozilla/5.0 (x11; linux x86_64; rv:27.0) gecko/20100101 firefox/27.0' class recursivespider(basespider): name = 'bristol' allowed_domains = ['bristol.ac.uk/'] start_urls = ['http://www.bristol.ac.uk/international/countries/'] def parse(self, response): hxs = htmlxpathselector(response) xpath = '//*[@id="all-countries"]/li/ul/li/a/@href' a_of_the_link = '//*[@id="all-countries"]/li/ul/li/a/text()' text, link in zip(hxs.select(a_of_the_link).extract(), hxs.select(xpath).extract()): yield request(urljoin(response.url, link), meta={'a_of_the_link': text}, headers={'user-agent': user_agent}, callback=self.parse_linkpage, dont_filter=true) def parse_linkpage(self, response): hxs = htmlxpathselector(response) item = qualificationitem() xpath = """ //h2[normalize-space(.)="entry requirements undergraduate courses"] /following-sibling::p[not(preceding-sibling::h2[normalize-space(.)!="entry requirements undergraduate courses"])] """ item['bristolqualification'] = hxs.select(xpath).extract()[1:] item['bristolcountry'] = response.meta['a_of_the_link'] return item
if here user did try fix problem unsuccessful haven't heard him since.
'these errors caused unescaped single quotes in bristolqualification item field (and presumably bath spider suffers same problem) causing havoc (such d'etudes in snippet below):'
this thought problem was.
can see problem at?
Comments
Post a Comment