from scrapy.http import Request
from scrapy.selector import HtmlXPathSelector
from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor

from walmart_sample.items import WalmartSampleItem


class MySpider(CrawlSpider):
    name = "my_spider"
    domain = ['Apparel']
    keyword = 'Bags'
    departments = {
        "All Departments": "0", "Apparel": "5438", "Auto": "91083",
        "Baby": "5427", "Beauty": "1085666", "Books": "3920",
        "Electronics": "3944", "Gifts": "1094765", "Grocery": "976759",
        "Health": "976760", "Home": "4044", "Home Improvement": "1072864",
        "Jewelry": "3891", "Movies": "4096", "Music": "4104",
        "Party": "2637", "Patio": "5428", "Pets": "5440",
        "Pharmacy": "5431", "Photo Center": "5426", "Sports": "4125",
        "Toys": "4171", "Video Games": "2636",
    }
    allowed_domains = ['walmart.com']
    denied_domains = ['reviews.walmart.com', 'facebook.com', 'twitter.com']

    # NOTE: this shortened search URL needs %s placeholders for the keyword
    # and the department id for the string formatting below to work.
    # restrict_xpaths and deny_domains are link-extractor arguments, not Rule arguments.
    rules = (
        Rule(
            SgmlLinkExtractor(
                allow=("http://ift.tt/1he86Qp" % (keyword, departments.get(domain[0]))),
                restrict_xpaths=('//li[@class="btn-nextResults"]',),
                deny_domains=denied_domains,
            ),
            callback='parse_links',  # avoid 'parse': CrawlSpider uses that method internally
            follow=True,
        ),
    )

    def start_requests(self):
        for dept in self.domain:
            if dept in self.departments:
                # Same shortened-URL issue as above: %s placeholders are required here.
                url = 'http://ift.tt/1hpCIbG' % (self.keyword, self.departments.get(dept))
                yield Request(url)

    def parse_links(self, response):
        hxs = HtmlXPathSelector(response)
        links = hxs.select('//a[@class="prodLink ListItemLink"]/@href')
        # extract() returns a list (never None), so test for emptiness instead.
        last = hxs.select('//a[@class="SPPagNoLink jump next"]').extract()
        if not last:
            for link in links:
                href = link.extract()
                yield Request('http://www.walmart.com/' + href, callback=self.parse_data)
        else:
            print "<<<<<Last Page>>>>>>"

    def parse_data(self, response):
        hxs = HtmlXPathSelector(response)
        walmart = WalmartSampleItem()
        walmart['Title'] = hxs.select('//h1[@class="productTitle"]/text()').extract()
        walmart['Price'] = (hxs.select('//span[@class="bigPriceText1"]/text()').extract()
                            + hxs.select('//span[@class="smallPriceText1"]/text()').extract())
        walmart['Availability'] = hxs.select('//span[@id="STORE_AVAIL"]/text()').extract()
        walmart['Description'] = hxs.select('//span[@class="ql-details-short-desc"]/p/text()').extract()
        #walmart['Avg_Rating'] =
        #walmart['Detailed_Rating'] =
        return [walmart]
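Assuming this spider lives in a standard Scrapy project whose walmart_sample/items.py defines a WalmartSampleItem with Title, Price, Availability and Description fields, it can be run from the project directory with Scrapy's built-in crawl command, exporting the scraped items to a file:

    scrapy crawl my_spider -o items.json

The -o option writes every item returned by parse_data to items.json via the feed exporter, which is usually the quickest way to check whether the rule and the two callbacks are actually firing.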