scrapy itemloader example
def parse_question(self, response):
    if "QuestionHeader-title" in response.text:
        # New-style question page.
        match_obj = re.match(r"(.*zhihu.com/question/(\d+))(/|$).*", response.url)
        if match_obj:
            question_id = int(match_obj.group(2))
            item_loader = ItemLoader(item=ZhihuQuestionItem(), response=response)
            item_loader.add_css("title", "h1.QuestionHeader-title::text")
            item_loader.add_css("content", ".QuestionHeader-detail")
            item_loader.add_value("url", response.url)
            item_loader.add_value("zhihu_id", question_id)
            item_loader.add_css("answer_num", ".List-headerText span::text")
            item_loader.add_css("comments_num", ".QuestionHeader-Comment button::text")
            item_loader.add_css("watch_user_num", ".NumberBoard-value::text")
            item_loader.add_css("topics", ".QuestionHeader-topics .Popover div::text")
            question_item = item_loader.load_item()
    else:
        # Old-style question page.
        match_obj = re.match(r"(.*zhihu.com/question/(\d+))(/|$).*", response.url)
        if match_obj:
            question_id = int(match_obj.group(2))
            item_loader = ItemLoader(item=ZhihuQuestionItem(), response=response)
            item_loader.add_xpath("title",
                                  "//*[@id='zh-question-title']/h2/a/text()|//*[@id='zh-question-title']/h2/span/text()")
            item_loader.add_css("content", "#zh-question-detail")
            item_loader.add_value("url", response.url)
            item_loader.add_value("zhihu_id", question_id)
            item_loader.add_css("answer_num", "#zh-question-answer-num::text")
            item_loader.add_css("comments_num", "#zh-question-meta-wrap a[name='addcomment']::text")
            item_loader.add_xpath("watch_user_num",
                                  "//*[@id='zh-question-side-header-wrap']/text()|//*[@class='zh-question-followers-sidebar']/div/a/strong/text()")
            item_loader.add_css("topics", ".zm-tag-editor-labels a::text")
            question_item = item_loader.load_item()
    yield scrapy.Request(self.start_answer_url.format(question_id, 20, 0), headers=self.headers,
                         callback=self.parse_answer)
    yield question_item
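The Zhihu loaders in these examples assume a ZhihuQuestionItem definition that is not shown. A minimal sketch of what such an item could look like is below; the field names mirror the add_css/add_value calls above, but the processors are assumptions, not the original project's code:

import scrapy
from scrapy.loader.processors import TakeFirst, Join  # itemloaders.processors in newer Scrapy

class ZhihuQuestionItem(scrapy.Item):
    # Hypothetical field set inferred from the loader calls above.
    zhihu_id = scrapy.Field(output_processor=TakeFirst())
    title = scrapy.Field(output_processor=TakeFirst())
    content = scrapy.Field(output_processor=TakeFirst())
    url = scrapy.Field(output_processor=TakeFirst())
    answer_num = scrapy.Field(output_processor=TakeFirst())
    comments_num = scrapy.Field(output_processor=TakeFirst())
    watch_user_num = scrapy.Field(output_processor=TakeFirst())
    topics = scrapy.Field(output_processor=Join(','))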
scrapy itemloader example
def parse_news_metro(self, response):
    loader = ItemLoader(item=News(), response=response)
    loader.add_value('url', response.url)

    date_selector = response.css('.artikel > div.block-tanggal::text')
    if not date_selector:
        return self.parse_news_pilkada(loader, response)
    try:
        date_time_str = date_selector.extract()[0].split(',')[1].strip()[:-4]
        date_time_str = ' '.join([_(x) for x in date_time_str.split(' ')])
        published_at_wib = datetime.strptime(date_time_str, '%d %B %Y | %H:%M')
    except Exception:
        return loader.load_item()
    published_at = wib_to_utc(published_at_wib)
    if self.media['last_scraped_at'] >= published_at:
        is_no_update = True
        self.logger.info('Media have no update')
        raise CloseSpider('finished')
    loader.add_value('published_at', published_at)

    title_selector = response.css('.artikel > h1::text')
    if not title_selector:
        return loader.load_item()
    loader.add_value('title', title_selector.extract()[0])

    raw_content_selector = response.xpath('//div[@class="artikel"]//p[not(iframe)]')
    if not raw_content_selector:
        return loader.load_item()
    raw_content = ''
    for rsl in raw_content_selector:
        raw_content = raw_content + rsl.extract().strip()

    next_page_selector = response.css('.pagination-nb').xpath('//a[text()="next"]/@href')
    if next_page_selector:
        return Request(next_page_selector.extract()[0],
                       callback=lambda x, loader=loader, raw_content=raw_content: self.parse_next_page_metro(x, loader, raw_content))
    loader.add_value('raw_content', raw_content)

    author_name = ''
    for author_name_selector in reversed(raw_content_selector):
        author_name_selector = author_name_selector.css('strong::text')
        for tmp in reversed(author_name_selector.extract()):
            tmp = tmp.strip()
            if tmp and all((x.isalpha() and x.isupper()) or x.isspace() or x == '.' or x == '|' for x in tmp):
                author_name = tmp
                break
        if author_name:
            break
    author_name = ','.join(author_name.split(' | '))
    loader.add_value('author_name', author_name)
    return loader.load_item()
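Several of the news spiders here call a wib_to_utc() helper that is defined elsewhere in their project. WIB (Waktu Indonesia Barat) is UTC+7, so a plausible minimal version, assuming naive datetimes, would be:

from datetime import timedelta

def wib_to_utc(wib_datetime):
    # WIB is UTC+7; subtracting the offset yields the UTC wall-clock time.
    return wib_datetime - timedelta(hours=7)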
scrapy itemloader example
def parse_item(self, response):
    loader = ItemLoader(EolZhiyeItem(), response)
    loader.add_value('url', response.url)
    loader.add_value('code', response.url, re=r'/(\w+)\.shtml')
    loader.add_css('name', 'h1#pagetitle::text')
    loader.add_xpath('category', u'//div[@id="precontent"]/p[contains(., "??")]/a/text()')
    loader.add_xpath('category2', u'//div[@id="precontent"]/p[contains(., "??")]/a/text()')
    loader.add_xpath('detail', u'//div[@id="precontent"]/following-sibling::node()[not(self::table)]', Join('\n'))
    yield loader.load_item()
scrapy itemloader example
def parse_book(self, response):
    book_loader = ItemLoader(item=BookItem(), response=response)
    # Strip markup from every scraped value and keep only the first match.
    book_loader.default_input_processor = MapCompose(remove_tags)
    book_loader.default_output_processor = TakeFirst()
    book_loader.add_xpath("title", "//div[@class='col-sm-6 product_main']/h1")
    book_loader.add_xpath("price", "//p[@class='price_color']")
    book_loader.add_xpath("upc", "//table[@class='table table-striped']/tr[1]/td")
    book_loader.add_xpath("product_type", "//table[@class='table table-striped']/tr[2]/td")
    book_loader.add_xpath("tax", "//table[@class='table table-striped']/tr[5]/td")
    book_loader.add_xpath("stock", "//table[@class='table table-striped']/tr[6]/td")
    book_loader.add_xpath("reviews", "//table[@class='table table-striped']/tr[7]/td")
    book_loader.add_xpath("rating", "//p[@class='instock availability']/following-sibling::p/@class")
    yield book_loader.load_item()
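The two default_* processors above run for every field: MapCompose(remove_tags) cleans each extracted string on the way in, and TakeFirst() collapses the result list to a single value on the way out. The BookItem the loader populates is not shown; a plain sketch with the same field names would be:

import scrapy

class BookItem(scrapy.Item):
    # Hypothetical item; fields mirror the add_xpath calls above.
    title = scrapy.Field()
    price = scrapy.Field()
    upc = scrapy.Field()
    product_type = scrapy.Field()
    tax = scrapy.Field()
    stock = scrapy.Field()
    reviews = scrapy.Field()
    rating = scrapy.Field()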
scrapy itemloader example
def parse_item(self, response):
    loader = ItemLoader(GaokaopaiZhiyeItem(), response)
    loader.add_value('url', response.url)
    loader.add_value('code', response.url, re=ur'-([^-]+)\.html')
    loader.add_css('name', u'.modTitle>h1::text')

    def parse_category():
        for e in response.css(u'.catType>a'):
            yield {
                'url': e.css('::attr(href)').extract_first(),
                'code': e.css('::attr(href)').re_first(ur'-([^-]+)\.html'),
                'name': e.css('::text').extract_first(),
            }

    loader.add_value('category', list(parse_category()))
    loader.add_css('detail', u'.zhiyeShow')
    item = loader.load_item()
    return FormRequest(
        url='http://www.gaokaopai.com/ajax-career-getRelateMajor.html',
        formdata={'code': item['code'][0]},
        meta={'item': item},
        dont_filter=True,
        callback=self.parse_majors
    )
scrapy itemloader example
def parse_song_list(self, response):
    selector = Selector(response)
    song_name_list = selector.xpath('//body//ul[@class="f-hide"]/li/a/text()').extract()
    song_id_list = selector.xpath('//body//ul[@class="f-hide"]/li/a/@href').extract()
    title = selector.xpath('//title/text()').extract()
    for index, id_ in enumerate(song_id_list):
        l = ItemLoader(item=PlayListItem())
        l.add_value('song_name', song_name_list[index])
        l.add_value('title', title)
        yield scrapy.FormRequest(url=self.BASE_URL + id_, meta={'song_id': id_[9:], 'loader': l}, method='GET',
                                 headers=self.headers, callback=self.parse_single_song)
scrapy itemloader example
def parse_item(self, response):
    l = ItemLoader(item=PageItem(), response=response)
    l.add_value('title', response.request.cookies['title'])
    l.add_value('name', self.name)
    l.add_value('url', response.url)
    l.add_xpath('image_urls', '//p[@id="contents"]/a/img/@src')
    return l.load_item()
scrapy itemloader example
def parse_item(self, response):
    l = ItemLoader(item=PageItem(), response=response)
    l.add_value('title', response.request.cookies['title'])
    l.add_value('name', self.name)
    l.add_value('url', response.url)
    l.add_xpath('image_urls', '//td[@valign="top"]/img/@src')
    return l.load_item()
scrapy itemloader example
def parse(self, response):
    l = ItemLoader(item=PlantItem(), response=response)
    l.add_xpath('name', "//div[@id='bodycontent']/div[@class='post']/div[@class='pagebanner']/h2/text()")
    l.add_xpath('species', "//div[@id='bodycontent']/div[@class='post']/div[@class='pagebanner']/div[@class='clear resultSpecies']/text()")
    l.add_xpath('key', "//div[@id='bodycontent']/div[@class='post']/div[@class='contents']/div[@id='tabbedinfo']/div[@class='tabscontain']/div[@class='tabs']/div[@class='post-meta']/div[@class='post-meta-key']/text()")
    l.add_xpath('value', "//div[@id='bodycontent']/div[@class='post']/div[@class='contents']/div[@id='tabbedinfo']/div[@class='tabscontain']/div[@class='tabs']/div[@class='post-meta']/div[@class='post-meta-value']/child::node()")
    return l.load_item()
scrapy itemloader example
def parse_item(self, response):
    il = ItemLoader(item=ImageItem(), response=response)
    il.add_css('image_urls', 'img::attr(src)')
    return il.load_item()
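image_urls is the field name Scrapy's built-in ImagesPipeline looks for, so a loader like this usually goes hand in hand with enabling that pipeline in settings. A sketch with illustrative values:

# settings.py (illustrative values, not from the original project)
ITEM_PIPELINES = {'scrapy.pipelines.images.ImagesPipeline': 1}
IMAGES_STORE = '/path/to/image/store'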
scrapy itemloader example
def parse_question(self, response):
    if "QuestionHeader-title" in response.text:
        match_obj = re.match(r"(.*zhihu.com/question/(\d+))(/|$).*", response.url)
        if match_obj:
            question_id = int(match_obj.group(2))
            item_loader = ItemLoader(item=ZhihuQuestionItem(), response=response)
            item_loader.add_css("title", "h1.QuestionHeader-title::text")
            item_loader.add_css("content", ".QuestionHeader-detail")
            item_loader.add_value("url", response.url)
            item_loader.add_value("zhihu_id", question_id)
            item_loader.add_css("answer_num", ".List-headerText span::text")
            item_loader.add_css("comments_num", ".QuestionHeader-actions button::text")
            item_loader.add_css("watch_user_num", ".NumberBoard-value::text")
            item_loader.add_css("topics", ".QuestionHeader-topics .Popover div::text")
            question_item = item_loader.load_item()
    else:
        match_obj = re.match(r"(.*zhihu.com/question/(\d+))(/|$).*", response.url)
        if match_obj:
            question_id = int(match_obj.group(2))
            item_loader = ItemLoader(item=ZhihuQuestionItem(), response=response)
            item_loader.add_xpath("title",
                                  "//*[@id='zh-question-title']/h2/a/text()|//*[@id='zh-question-title']/h2/span/text()")
            item_loader.add_css("content", "#zh-question-detail")
            item_loader.add_value("url", response.url)
            item_loader.add_value("zhihu_id", question_id)
            item_loader.add_css("answer_num", "#zh-question-answer-num::text")
            item_loader.add_css("comments_num", "#zh-question-meta-wrap a[name='addcomment']::text")
            item_loader.add_xpath("watch_user_num",
                                  "//*[@id='zh-question-side-header-wrap']/text()|//*[@class='zh-question-followers-sidebar']/div/a/strong/text()")
            item_loader.add_css("topics", ".zm-tag-editor-labels a::text")
            question_item = item_loader.load_item()
    yield scrapy.Request(self.start_answer_url.format(question_id, 20, 0), headers=self.headers,
                         callback=self.parse_answer)
    yield question_item
scrapy itemloader example
def parse_news(self, response):
    self.logger.info('parse_news: %s' % response)
    parsed_news = json.loads(str(response.body))[0]
    loader = ItemLoader(item=News(), response=response)
    loader.add_value('url', parsed_news['url'])

    if not parsed_news['title']:
        return loader.load_item()
    loader.add_value('title', parsed_news['title'])

    html_response = HtmlResponse(url=parsed_news['url'],
                                 body=parsed_news['content'].encode('utf-8', 'ignore'))
    xpath_query = '''
        //body/node()
        [not(descendant-or-self::comment()|
             descendant-or-self::style|
             descendant-or-self::script|
             descendant-or-self::div|
             descendant-or-self::span|
             descendant-or-self::image|
             descendant-or-self::img|
             descendant-or-self::iframe
        )]
    '''
    raw_content_selectors = html_response.xpath(xpath_query)
    if not raw_content_selectors:
        return loader.load_item()
    raw_content = raw_content_selectors.extract()
    raw_content = ' '.join([w.strip() for w in raw_content])
    raw_content = raw_content.strip()
    loader.add_value('raw_content', raw_content)

    if not parsed_news['published']:
        return loader.load_item()
    date_time_str = ' '.join([_(w) for w in parsed_news['published'].split(',')[1].strip()[:-4].split(' ')])
    try:
        published_at_wib = datetime.strptime(date_time_str, '%d %b %Y - %H:%M')
    except ValueError:
        return loader.load_item()
    published_at = wib_to_utc(published_at_wib)
    loader.add_value('published_at', published_at)

    if not parsed_news['author']:
        loader.add_value('author_name', '')
    else:
        loader.add_value('author_name', parsed_news['author'])
    return loader.load_item()
scrapy itemloader example
def parse_news(self, response):
    self.logger.info('parse_news: %s' % response)
    loader = ItemLoader(item=News(), response=response)
    loader.add_value('url', response.url)

    title_selectors = response.css('h1.detailtitle::text')
    if not title_selectors:
        return loader.load_item()
    title = title_selectors.extract_first().strip()
    loader.add_value('title', title)

    date_time = response.css('body > div > div.container > div.page-header > div::text').extract_first().strip()
    date_time = date_time.split(',')[-1].strip()
    date_time = ' '.join([_(w) for w in date_time.split(' ')])
    try:
        published_at_wib = datetime.strptime(date_time, '%d %B %Y %H:%M')
    except ValueError:
        return loader.load_item()
    published_at = wib_to_utc(published_at_wib)
    loader.add_value('published_at', published_at)

    multipage_selectors = response.css('.newsPagingWrap > a')
    if multipage_selectors:
        return self.parse_indices(multipage_selectors, loader)

    author_name_selectors = response.css('.newsContent > p > strong::text')
    if not author_name_selectors:
        loader.add_value('author_name', '')
    else:
        author_name = author_name_selectors.extract()[-1].strip()
        loader.add_value('author_name', author_name)

    raw_content_selectors = response.css('.newsContent > p')
    if not raw_content_selectors:
        return loader.load_item()
    raw_content = ' '.join(raw_content_selectors.extract())
    raw_content = raw_content.strip()
    loader.add_value('raw_content', raw_content)
    return loader.load_item()
scrapy itemloader example
def parse_page(self, response):
    l = ItemLoader(item=PageItem(), response=response)
    l.add_value('title', response.request.cookies['title'])
    l.add_value('name', self.config["id"])
    l.add_value('url', response.url)
    if self.config.has_key("imageUrlReplacement"):
        l.add_value('replace', self.config["imageUrlReplacement"])
    if self.config.has_key("xpathImagesPath"):
        l.add_xpath('image_urls', self.config["xpathImagesPath"])
    if self.config.has_key("xpathFilesPath"):
        l.add_xpath('file_urls', self.config["xpathFilesPath"])
    yield l.load_item()

    if self.config.has_key("xpathNextImageUrl"):
        nextUrls = response.xpath(self.config["xpathNextImageUrl"])
        if len(nextUrls) > 0:
            nextPage = nextUrls.extract()[0]
            if not nextPage.startswith("http"):
                if nextPage.startswith("/"):
                    # Site-absolute path: prepend scheme and host.
                    nextPage = response.url[0:response.url.index("/", 10) + 1] + nextPage
                else:
                    # Relative path: resolve against the current directory.
                    nextPage = response.url[0:response.url.rfind("/") + 1] + nextPage
            request = scrapy.Request(nextPage, callback=self.parse_page,
                                     cookies={'title': response.request.cookies['title']})
            yield request
scrapy itemloader example
def parse_news(self, response):
    self.logger.info('parse_news: %s' % response)
    loader = ItemLoader(item=News(), response=response)
    json_response = json.loads(response.body)
    try:
        url = json_response['NewsML']['NewsItem']['NewsComponent']['NewsComponent']['NewsComponent']['NewsLines']['MoreLink']
    except KeyError:
        return loader.load_item()
    loader.add_value('url', url)
    try:
        title = json_response['NewsML']['NewsItem']['NewsComponent']['NewsComponent']['NewsComponent']['NewsLines']['HeadLine']
    except KeyError:
        return loader.load_item()
    if not title:
        return loader.load_item()
    loader.add_value('title', title)
    try:
        raw_content = json_response['NewsML']['NewsItem']['NewsComponent']['NewsComponent']['NewsComponent']['ContentItem']['DataContent']['nitf']['body']['body.content']['p']
    except KeyError:
        return loader.load_item()
    if not raw_content:
        return loader.load_item()
    loader.add_value('raw_content', raw_content)
    try:
        author_name = json_response['NewsML']['NewsItem']['NewsComponent']['NewsComponent']['Author']
    except KeyError:
        return loader.load_item()
    if not author_name:
        loader.add_value('author_name', '')
    else:
        loader.add_value('author_name', author_name)
    try:
        date_time_str = json_response['NewsML']['NewsItem']['NewsManagement']['FirstCreated']
    except KeyError:
        return loader.load_item()
    if not date_time_str:
        return loader.load_item()
    # FirstCreated is 'YYYYMMDDThhmmss'-shaped; left-pad the time part to six digits.
    date_time_str = date_time_str.split('T')
    date_time_str[1] = '0' * (6 - len(date_time_str[1])) + date_time_str[1]
    try:
        published_at_wib = datetime.strptime(' '.join(date_time_str), '%Y%m%d %H%M%S')
    except Exception:
        return loader.load_item()
    published_at = wib_to_utc(published_at_wib)
    loader.add_value('published_at', published_at)
    return loader.load_item()
scrapy itemloader example
def parse_item(self, response):
    loader = ItemLoader(ChsiDaxueItem(), response)
    loader.add_value('id', response.url, re=ur'schId-(\w+)\.dhtml')
    loader.add_value('url', response.url)
    loader.add_css('logo', u'.r_c_sch_logo>img::attr(src)',
                   MapCompose(lambda url: urljoin('http://gaokao.chsi.com.cn/', url)))
    loader.add_css('name', u'.topImg::text')
    loader.add_css('badges', u'.r_c_sch_attr .r_c_sch_icon::attr(title)')
    data_clean = MapCompose(lambda x: re.sub(r'\s+', ' ', x), unicode.strip)
    loader.add_xpath('type', u'//span[@class="f_bold" and .="?????"]/following-sibling::text()', data_clean)
    loader.add_xpath('membership', u'//span[@class="f_bold" and .="?????"]/following-sibling::text()', data_clean)
    loader.add_xpath('province', u'//span[@class="f_bold" and span]/following-sibling::text()', data_clean)
    loader.add_xpath('address', u'//span[@class="f_bold" and .="?????"]/following-sibling::text()', data_clean)
    loader.add_xpath('phone', u'//span[@class="f_bold" and .="?????"]/following-sibling::text()', data_clean)
    loader.add_xpath('website', u'//span[@class="f_bold" and .="?????"]/following-sibling::a/@href', data_clean)
    loader.add_xpath('backdoor', u'//span[@class="f_bold" and .="?????"]/following-sibling::text()', data_clean)

    def parse_votes():
        xpath = u'//td[@class="tdMydT" and .="{}"]/following-sibling::td/div[@class="rank"]/@rank'
        get_vote = lambda what: float(response.xpath(xpath.format(what)).extract_first() or 0)
        return {
            'overall': get_vote(u'?????'),
            'environment': get_vote(u'???????'),
            'life': get_vote(u'?????'),
        }

    loader.add_value('votes', parse_votes())

    def parse_trending():
        css = u'{}>table tr:not(:first-child)'
        def get_trending(what):
            majors = []
            for e in response.css(css.format(what)):
                majors.append({
                    'id': e.css(u'.tdZytjTDiv>a::attr(href)').re_first(r'specId=(\w+)'),
                    'name': e.css(u'.tdZytjTDiv::attr(title)').extract_first(),
                    'vote': float(e.css(u'.avg_rank::text').extract_first()),
                    'count': int(e.css(u'.c_f00::text, .red::text').extract_first()),
                })
            return majors
        return {
            'count': get_trending(u'#topNoofPTable'),
            'index': get_trending(u'#topIndexTable'),
            'like': get_trending(u'.r_r_box_zymyd'),
        }

    loader.add_value('trending', parse_trending())
    item = loader.load_item()
    for link in LinkExtractor(restrict_xpaths=u'//ul[@id="topNav"]//a[.="????"]').extract_links(response):
        yield Request(link.url, meta={'item': item}, callback=self.parse_jianjie)
scrapy itemloader example
def parse_item(self, response):
    """Parse a response into a DocumentItem."""
    doc_loader = ItemLoader(item=DocumentItem(), response=response)
    doc_loader.add_value('url', response.url)
    doc_loader.add_xpath('meta', '//meta[@name="description"]/@content')
    doc_loader.add_value('domain', urlparse(response.url).hostname)
    doc_loader.add_xpath('title', '//title/text()')

    hxs = HtmlXPathSelector(response)
    links = []
    a_links = hxs.xpath('//a')
    for link in a_links:
        link_obj = {}
        link_str = " ".join(link.xpath('@href').extract())
        link_obj['link'] = link_str.replace("\n", " ")
        link_name_str = " ".join(link.xpath('text()').extract())
        link_name_str = link_name_str.replace("\n", " ")
        link_name_str = link_name_str.lstrip()
        link_name_str = link_name_str.rstrip()
        link_obj['link_name'] = link_name_str
        links.append(link_obj)
    doc_loader.add_value('links', links)

    title_list = hxs.xpath('//title/text()').extract()
    title = ' '.join(title_list)
    body_text = self.html2string(response)
    text = title + " " + body_text
    doc_loader.add_value('content', text)
    doc_loader.add_value('raw_text', text)
    doc_loader.add_value('raw_title', title)
    doc_loader.add_value('raw_url', response.url)
    h1_list = hxs.xpath("//h1/text()").extract()
    doc_loader.add_value('h1', " ".join(h1_list))
    doc_loader.add_value('content_type', response.headers['Content-type'])
    doc_loader.add_value('updated_on', datetime.datetime.now().strftime("%Y-%m-%dT%H:%M:%S"))
    item = doc_loader.load_item()
    return item
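html2string() is a spider method not shown in this snippet. A plausible minimal version using w3lib (an assumption; the original may differ) is:

from w3lib.html import remove_tags

def html2string(self, response):
    # Strip all markup and collapse runs of whitespace to single spaces.
    return ' '.join(remove_tags(response.text).split())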
scrapy itemloader example
def parse_item(self, response):
    """
    Extract fields from the individual email page and load them into the
    item.

    @url http://lkml.iu.edu/hypermail/linux/kernel/0111.3/0036.html
    @returns items 1 1
    @scrapes senderName senderEmail timeSent timeReceived subject body
    @scrapes replyto url
    """
    load = ItemLoader(item=Email(), selector=response)
    load.add_value('url', response.url)
    pattern_replyto = '//ul[1]/li[contains((b|strong), "In reply to:")]'
    pattern_replyto += '/a/@href'
    link = response.xpath(pattern_replyto).extract()
    link = [''] if not link else link
    load.add_value('replyto', link[0])

    specific_fields = {
        'senderName': None,
        'senderEmail': None,
        'timeSent': None,
        'timeReceived': None,
        'subject': None
    }
    # Pages generated by MHonArc carry a marker comment; older pages use
    # different body delimiters.
    new_system = response.xpath('/comment()[1][contains(., "MHonArc")]')
    if len(new_system) >= 1:
        specific_fields = self.parse_new_system(response, specific_fields)
        body_before_comment = '<!--X-Body-of-Message-->'
        body_after_comment = '<!--X-Body-of-Message-End-->'
    else:
        specific_fields = self.parse_old_system(response, specific_fields)
        body_before_comment = '<!-- body="start" -->'
        body_after_comment = '<!-- body="end" -->'
    for key, val in specific_fields.items():
        load.add_value(key, val)

    if self.get_body:
        pattern_body = body_before_comment + '\n?(.*)' + body_after_comment
        page_body = response.body.decode('utf-8', 'ignore')
        body = re.search(pattern_body, page_body, flags=re.S)
        load.add_value('body', body.group(1))
    return load.load_item()
scrapy itemloader example
def parse_news(self, response):
    self.logger.info('parse_news: %s' % response)
    loader = ItemLoader(item=News(), response=response)
    loader.add_value('url', response.url)

    title_selectors = response.css('h1[itemprop="headline"]::text')
    if not title_selectors:
        return loader.load_item()
    title = title_selectors.extract()[0]
    loader.add_value('title', title)

    author_name_selectors = response.css('a[rel="author"] > span::text')
    if not author_name_selectors:
        loader.add_value('author_name', '')
    else:
        author_name = author_name_selectors.extract()[0]
        loader.add_value('author_name', author_name)

    raw_content_selectors = response.css('.content')
    if not raw_content_selectors:
        return loader.load_item()
    raw_content = raw_content_selectors.extract()
    raw_content = ' '.join([w.strip() for w in raw_content])
    raw_content = raw_content.strip()
    loader.add_value('raw_content', raw_content)

    date_time_str_selectors = response.css('article > div.time::text')
    if not date_time_str_selectors:
        return loader.load_item()
    date_time_str = date_time_str_selectors.extract()[0]
    date_time_str = date_time_str.split(',')[1].strip()[:-4]
    date_time_str = ' '.join([_(w) for w in date_time_str.split(' ')])
    try:
        published_at_wib = datetime.strptime(date_time_str, '%d %B %Y - %H:%M')
    except ValueError:
        return loader.load_item()
    published_at = wib_to_utc(published_at_wib)
    loader.add_value('published_at', published_at)
    return loader.load_item()
scrapy itemloader example
def join_tags(value):
    return ','.join([i for i in value if i])
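join_tags() is shaped like an ItemLoader output processor: it drops empty values and joins the rest into one comma-separated string. It would typically be wired to a field like this (a sketch, not the original item):

import scrapy

class ArticleItem(scrapy.Item):
    # join_tags collapses all collected tag strings into a single value.
    tags = scrapy.Field(output_processor=join_tags)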
scrapy itemloader example
def parse(self, response):
    for country in response.css(".col-md-4, .country"):
        item = ItemLoader(item=CountryItem(), selector=country)
        item.add_css("country", ".country-name")
        item.add_css("capital", ".country-capital::text")
        item.add_css("population", ".country-population::text")
        item.add_css("area", ".country-area::text")
        yield item.load_item()
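A CountryItem for this loader might attach simple cleanup processors, since ::text selectors often return padded strings. A sketch under that assumption:

import scrapy
from scrapy.loader.processors import MapCompose, TakeFirst

class CountryItem(scrapy.Item):
    # Hypothetical item: strip whitespace on input, keep the first value.
    country = scrapy.Field(input_processor=MapCompose(str.strip), output_processor=TakeFirst())
    capital = scrapy.Field(input_processor=MapCompose(str.strip), output_processor=TakeFirst())
    population = scrapy.Field(input_processor=MapCompose(str.strip), output_processor=TakeFirst())
    area = scrapy.Field(input_processor=MapCompose(str.strip), output_processor=TakeFirst())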
scrapy itemloader example
def parse_book(self, response):
    book_loader = ItemLoader(item=BookItem(), response=response)
    book_loader.default_input_processor = MapCompose(remove_tags)
    book_loader.add_value("image_urls", response.urljoin(response.css(".item.active > img::attr(src)").extract_first()))
    book_loader.add_css("title", ".col-sm-6.product_main > h1", TakeFirst())
    book_loader.add_css("price", ".price_color", TakeFirst())
    book_loader.add_css("upc", ".table.table-striped > tr:nth-child(1) > td", TakeFirst())
    book_loader.add_css("product_type", ".table.table-striped > tr:nth-child(2) > td", TakeFirst())
    book_loader.add_css("tax", ".table.table-striped > tr:nth-child(5) > td", TakeFirst())
    book_loader.add_css("stock", ".table.table-striped > tr:nth-child(6) > td", TakeFirst())
    book_loader.add_css("reviews", ".table.table-striped > tr:nth-child(7) > td", TakeFirst())
    book_loader.add_css("rating", ".star-rating::attr(class)", TakeFirst())
    return book_loader.load_item()
scrapy itemloader example
def _extract_item(self, response):
    l = ItemLoader(response=response, item=MyspiderItem(), type='html')
    l.add_xpath('movie_name', '//h1/span[@property="v:itemreviewed"]/text()')
    l.add_xpath('movie_year', '//span[@property="v:initialReleaseDate"]/text()')
    l.add_xpath('movie_type', '//span[@property="v:genre"]/text()')
    l.add_xpath('movie_rate', '//strong[@class="ll rating_num"]/text()')
    l.add_value('url', response.url)
    return dict(l.load_item())
scrapy itemloader example
def parse(self, response):
    l = ItemLoader(item=Area(), response=response)
    l.add_value('id', parse_qs(response.xpath('//div[@class="clearfix subnav level-1"]//li//a[2]/@href').extract()[0])['area_id'][0])
    l.add_xpath('name', '//div[@class="clearfix subnav level-1"]//li//a[2]/text()')
    l.add_value('updated', datetime.utcnow().isoformat())
    return l.load_item()
scrapy itemloader example
def parse_item(self, response):
    url = response.url
    item_idx = self.all_urls[url]
    self.logger.info("Trying page %s %s" % (item_idx, url))
    resp_dct = json.loads(response.body)
    l = ItemLoader(item=HeatMapItem(), response=response)
    current_hour = time.strftime("%Y%m%d%H", time.localtime())
    l.add_value('cur_hour', current_hour)
    l.add_value('serial', item_idx)
    l.add_value('data', resp_dct.pop('data'))
    l.add_value('timestamp', resp_dct.pop('nt'))
    l.add_value('others', resp_dct)
    l.add_value('url', url)
    l.add_value('is_parsed', 0)
    self.finished.add(item_idx)
    self.logger.info(u"Crawled %s, %s successfully. :)" % (item_idx, url))
    self.claim_completeness()
    yield l.load_item()
scrapy itemloader example
def parse(self, response):
    try:
        l = ItemLoader(item=MovieItem(), response=response)
        l.add_value('name',
                    response.css('div#content h1 [property="v:itemreviewed"]::text').extract_first().strip())
        year = response.css('div#content h1 span.year::text').extract_first()
        if year.startswith('('):
            year = year[1:-1]
        l.add_value('year', year)

        # The first two bare text chunks of div#info are region and language.
        newStrL = []
        for val in response.css('div#info::text').extract():
            newStr = val.strip().strip('/')
            if newStr != '':
                newStrL.append(newStr)
                if len(newStrL) == 2:
                    break
        if len(newStrL) == 2:
            l.add_value('region', newStrL[0].split('/'))
            l.add_value('language', newStrL[1].split('/'))

        l.add_value('duration', response.css('div#info [property="v:runtime"]::attr(content)').extract_first())
        l.add_value('types', response.css('div#info [property="v:genre"]::text').extract())
        l.add_value('directors', response.css('div#info [rel="v:directedBy"]::text').extract())
        l.add_value('actors', response.css('div#info [rel="v:starring"]::text').extract())
        l.add_value('runtime', response.css('div#info [property="v:initialReleaseDate"]::text').extract())
        l.add_value('detailurl', response.url)
        l.add_value('IMDburl', response.css('div#info [rel="nofollow"]::attr(href)').extract())
        l.add_value('stars', response.css('strong[property="v:average"]::text').extract_first())
        return l.load_item()
    except Exception:
        pass
scrapy itemloader example
def parse_question(self, response):
    question_pattern = re.compile(r'(.*zhihu.com/question/(\d+))(/|$).*')
    match_object = re.match(question_pattern, response.url)
    question_id = match_object.group(2)
    item_loader = ItemLoader(item=ZhihuQuestionItem(), response=response)
    item_loader.add_value('zhihu_id', question_id)
    item_loader.add_css('title', 'h1.QuestionHeader-title::text')
    item_loader.add_css('topics', '.TopicLink .Popover div::text')
    item_loader.add_value('url', response.url)
    item_loader.add_css('content', '.QuestionHeader-detail div div span::text')
    item_loader.add_css('answer_num', '.List-headerText span::text')
    item_loader.add_css('comments_num', '.QuestionHeader-Comment button::text')
    item_loader.add_css('watch_user_num', '.NumberBoard-value::text')
    item = item_loader.load_item()
    yield item
    yield scrapy.Request(self.start_answer_url.format(question_id=question_id, offset=0, limit=20),
                         headers=self.headers, callback=self.parse_answer)
scrapy itemloader example
def parse(self, response):
    for outer in response.css('#comapreTable tr:not(:first-child)'):
        if outer.css('td[align="center"]'):
            ccode = outer.css('td[align="center"]>a::attr(id)').extract_first()
            cname = outer.css('td[align="center"]>a::text').extract_first()
        for inner in outer.xpath('td[div[@align="left"]/a]'):
            loader = ItemLoader(item=EolZhuanyeItem(), selector=inner)
            loader.add_value('ccode', ccode)
            loader.add_value('cname', cname)
            loader.add_css('url', 'a::attr(href)', lambda urls: urljoin(self.start_urls[0], urls[0]))
            loader.add_xpath('code', 'following-sibling::td[1]/text()', MapCompose(unicode.strip))
            loader.add_css('name', 'a::text', MapCompose(unicode.strip))
            item = loader.load_item()
            yield Request(url=item['url'][0], meta={'item': item}, callback=self.parse_item)
scrapy itemloader example
def parse_first_page(self, response):
    count = int(response.xpath('//ul[@class="image"]/text()')[0].re(r'.*?(\d+).*?')[0])
    title = response.request.cookies['title']
    albumURL = response.url.replace(".shtml", '')
    # Album pages follow the pattern name.shtml, name_2.shtml, name_3.shtml, ...
    for x in xrange(1, count + 1):
        suffix = ".shtml"
        if x > 1:
            suffix = "_" + str(x) + ".shtml"
        request = scrapy.Request(albumURL + suffix, callback=self.parse_item, cookies={'title': title})
        yield request
    l = ItemLoader(item=PageItem(), response=response)
    l.add_value('title', title)
    l.add_value('name', self.name)
    l.add_value('url', response.url)
    l.add_xpath('image_urls', '//td[@valign="top"]/img/@src')
    yield l.load_item()
scrapy itemloader example
def parse_first_page(self, response):
    count = int(response.xpath('//div[@id="aplist"]/ul/li[1]/a/text()')[0].re(r'.*?(\d+).*?')[0])
    title = response.request.cookies['title']
    albumURL = response.url.replace(".html", '')
    # Album pages follow the pattern name.html, name_2.html, name_3.html, ...
    for x in xrange(1, count + 1):
        suffix = ".html"
        if x > 1:
            suffix = "_" + str(x) + ".html"
        request = scrapy.Request(albumURL + suffix, callback=self.parse_item, cookies={'title': title})
        yield request
    l = ItemLoader(item=PageItem(), response=response)
    l.add_value('title', title)
    l.add_value('name', self.name)
    l.add_value('url', response.url)
    l.add_xpath('image_urls', '//p[@id="contents"]/a/img/@src')
    yield l.load_item()
scrapy itemloader example
def parse_question(self, response):
    question_id = response.meta.get("zhihu_id", "")
    item_loader = ItemLoader(item=ZhihuQuestionItem(), response=response)
    item_loader.add_css("title", "h1.QuestionHeader-title::text")
    item_loader.add_css("content", ".QuestionHeader-detail")
    item_loader.add_value("url", response.url)
    item_loader.add_value("zhihu_id", question_id)
    item_loader.add_css("answer_num", ".List-headerText span::text")
    item_loader.add_css("comments_num", ".QuestionHeader-actions button::text")
    item_loader.add_css("watch_user_num", ".NumberBoard-value::text")
    item_loader.add_css("topics", ".QuestionHeader-topics .Popover div::text")
    question_item = item_loader.load_item()
    yield scrapy.Request(self.start_answer_url.format(question_id, 20, 0), headers=self.headers,
                         callback=self.parse_answer)
    yield question_item
scrapy itemloader example
def parse(self, response):
    for quote in response.css(".quote"):
        loader = ItemLoader(item=QuoteItem(), selector=quote)
        loader.add_css("text", ".text")
        loader.add_css("by", ".author")
        loader.add_css("tags", ".tag")
        yield loader.load_item()
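Note that plain selectors like ".text" return the full element markup, so an item for this loader would usually strip tags in an input processor. A hypothetical QuoteItem under that assumption:

import scrapy
from scrapy.loader.processors import MapCompose, TakeFirst
from w3lib.html import remove_tags

class QuoteItem(scrapy.Item):
    # remove_tags strips the surrounding markup the CSS selectors return.
    text = scrapy.Field(input_processor=MapCompose(remove_tags), output_processor=TakeFirst())
    by = scrapy.Field(input_processor=MapCompose(remove_tags), output_processor=TakeFirst())
    tags = scrapy.Field(input_processor=MapCompose(remove_tags))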
scrapy itemloader example
def parse(self, response):
    jsonresponse = json.loads(response.body_as_unicode())
    for i in range(0, len(jsonresponse['data']['list'])):
        l = ItemLoader(item=LianjiaErshouItem(), response=response)
        house_code = jsonresponse['data']['list'][i]['house_code']
        price_total = jsonresponse['data']['list'][i]['price_total']
        ctime = jsonresponse['data']['list'][i]['ctime']
        title = jsonresponse['data']['list'][i]['title']
        frame_hall_num = jsonresponse['data']['list'][i]['frame_hall_num']
        tags = jsonresponse['data']['list'][i]['tags']
        house_area = jsonresponse['data']['list'][i]['house_area']
        community_id = jsonresponse['data']['list'][i]['community_id']
        community_name = jsonresponse['data']['list'][i]['community_name']
        is_two_five = jsonresponse['data']['list'][i]['is_two_five']
        frame_bedroom_num = jsonresponse['data']['list'][i]['frame_bedroom_num']
        l.add_value('house_code', house_code)
        l.add_value('price_total', price_total)
        l.add_value('ctime', ctime)
        l.add_value('title', title)
        l.add_value('frame_hall_num', frame_hall_num)
        l.add_value('tags', tags)
        l.add_value('house_area', house_area)
        l.add_value('community_id', community_id)
        l.add_value('community_name', community_name)
        l.add_value('is_two_five', is_two_five)
        l.add_value('frame_bedroom_num', frame_bedroom_num)
        print(l)
        yield l.load_item()
scrapy itemloader example
def parse_item(self, response):
    loader = ItemLoader(GaokaopaiZhuanyeItem(), response)
    loader.add_value('url', response.url)
    loader.add_css('name', u'.majorTitle>h1::text')
    loader.add_xpath('code', u'//div[@class="majorBase"]/h3[starts-with(., "?????")]/text()', re=ur'?(.+)')
    loader.add_xpath('degree', u'//div[@class="majorBase"]/h3[starts-with(., "?????")]/text()', re=ur'?(.+)')
    loader.add_xpath('period', u'//div[@class="majorBase"]/h3[starts-with(., "?????")]/text()', re=ur'?(.+)')
    loader.add_xpath('courses', u'//div[@class="course"]/h3[.="?????"]/following-sibling::p/text()')

    def parse_related():
        for e in response.xpath(u'//div[@class="course"]/h3[.="?????"]/following-sibling::a'):
            yield {
                'url': e.css('::attr(href)').extract_first(),
                'code': e.css('::attr(href)').re_first(ur'-([^-]+)\.html'),
                'name': e.css('::text').extract_first(),
            }

    loader.add_value('related', list(parse_related()))

    def parse_category():
        category = []
        for i in [u"????", u"????", u"????"]:
            x = u'//h3[.="{}"]/following-sibling::ul[1]/li[@class="current"]/a'.format(i)
            e = response.xpath(x)
            category.append({
                'url': e.css('::attr(href)').extract_first(),
                'code': e.css('::attr(href)').re_first(ur'/zhuanye([-0-9]*)\.html').strip('-'),
                'name': e.css('::text').extract_first(),
            })
        return category

    loader.add_value('category', parse_category())
    loader.add_css('detail', u'.majorCon')
    item = loader.load_item()
    return Request(
        url='http://www.gaokaopai.com/zhuanye-jiuye-{}.html'.format(item['code'][0]),
        meta={'item': item},
        callback=self.parse_jiuye
    )