Vote count:
0
I am parsing the following code using a regex (not ideal I know, but that is a story for another day):
data:{
url: 'stage-team-stat'
},
defaultParams: {
stageId : 9155,
field: 2,
teamId: 26
}
};
This is being parsed from the web page embedded in the following Scrapy code:
from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
from scrapy.selector import Selector
from scrapy.item import Item
from scrapy.spider import BaseSpider
from scrapy import log
from scrapy.cmdline import execute
from scrapy.utils.markup import remove_tags
import time
import re
import json
import requests
class ExampleSpider(CrawlSpider):
name = "goal2"
allowed_domains = ["whoscored.com"]
start_urls = ["http://ift.tt/1qMIZYh"]
download_delay = 5
rules = [Rule(SgmlLinkExtractor(allow=('http://ift.tt/1qMIZYh'),deny=('/News', '/Graphics', '/Articles', '/Live', '/Matches', '/Explanations', '/Glossary', 'ContactUs', 'TermsOfUse', 'Jobs', 'AboutUs', 'RSS'),), follow=False, callback='parse_item')]
def parse_item(self, response):
sel = Selector(response)
titles = sel.xpath("normalize-space(//title)")
print '-' * 170
myheader = titles.extract()[0]
print '********** Page Title:', myheader.encode('utf-8'), '**********'
print '-' * 170
stagematch = re.compile("data:\s*{\s*url:\s*'stage-team-stat'\s*},\s*defaultParams:\s*{\s*(.*?),.*},",re.S)
stagematch2 = re.search(stagematch, response.body)
if stagematch2 is not None:
stagematch3 = stagematch2.group(1)
stageid = int(stagematch3.split(':', 1)[1])
stageid = str(stageid)
teamid = int(stagematch3.split(':', 3)[1])
teamid = str(teamid)
print stageid
print teamid
In this example I would expect stageid
to be '9155' and teamid
to be '32', however they are both coming back as '9155'.
Can anyone see what I am doing wrong?
Thanks
asked 11 secs ago
Obtaining correct data using Regex/List
Aucun commentaire:
Enregistrer un commentaire