def parse_item(self, response):
    """Scrape one major detail page and chain to its employment page.

    Populates a ``GaokaopaiZhuanyeItem`` through an ``ItemLoader`` with the
    page url, name, code, degree, period, courses, related majors,
    category entries and the raw ``.majorCon`` detail markup.

    :param response: the response for the major detail page.
    :returns: a ``Request`` for the ``zhuanye-jiuye-<code>.html`` page that
        carries the loaded item in ``meta['item']`` and is handled by
        ``self.parse_jiuye``.

    NOTE(review): every run of ``?`` inside the string literals below looks
    like non-ASCII text that was lost in an encoding round-trip (the three
    "majorBase" label literals are identical in this copy, and ``?(.+)`` is
    not a valid regular expression as written). They are reproduced
    unchanged here -- TODO restore the original labels from an intact copy
    of this file.
    """
    loader = ItemLoader(GaokaopaiZhuanyeItem(), response)
    loader.add_value('url', response.url)
    loader.add_css('name', u'.majorTitle>h1::text')

    # Labelled <h3> rows inside the "majorBase" box; the regex keeps the part
    # after the label separator.
    loader.add_xpath(
        'code',
        u'//div[@class="majorBase"]/h3[starts-with(., "?????")]/text()',
        re=u'?(.+)',
    )
    loader.add_xpath(
        'degree',
        u'//div[@class="majorBase"]/h3[starts-with(., "?????")]/text()',
        re=u'?(.+)',
    )
    loader.add_xpath(
        'period',
        u'//div[@class="majorBase"]/h3[starts-with(., "?????")]/text()',
        re=u'?(.+)',
    )
    loader.add_xpath(
        'courses',
        u'//div[@class="course"]/h3[.="?????"]/following-sibling::p/text()',
    )

    # Related majors: one dict per <a> that follows the matching heading.
    related = []
    for link in response.xpath(u'//div[@class="course"]/h3[.="?????"]/following-sibling::a'):
        href = link.css('::attr(href)')
        related.append({
            'url': href.extract_first(),
            # The code is the last dash-separated token before ".html";
            # the dot is matched literally.
            'code': href.re_first(u'-([^-]+)[.]html'),
            'name': link.css('::text').extract_first(),
        })
    loader.add_value('related', related)

    # Category entries: for each heading, the currently selected entry of the
    # first <ul> following it. One dict is always appended per heading so the
    # list keeps a fixed length and order.
    category = []
    for heading in [u"????", u"????", u"????"]:
        query = u'//h3[.="{}"]/following-sibling::ul[1]/li[@class="current"]/a'.format(heading)
        anchor = response.xpath(query)
        href = anchor.css('::attr(href)')
        code = href.re_first(u'/zhuanye([-0-9]*)[.]html')
        category.append({
            'url': href.extract_first(),
            # re_first() yields None when the heading or its "current" entry
            # is absent; keep None instead of failing on None.strip().
            'code': code.strip('-') if code is not None else None,
            'name': anchor.css('::text').extract_first(),
        })
    loader.add_value('category', category)

    loader.add_css('detail', u'.majorCon')
    item = loader.load_item()

    # NOTE(review): item['code'][0] is read unguarded; presumably a page from
    # which no code was extracted fails here -- confirm that dropping such
    # pages is the intended behaviour.
    return Request(
        url='http://www.gaokaopai.com/zhuanye-jiuye-{}.html'.format(item['code'][0]),
        meta={'item': item},
        callback=self.parse_jiuye
    )