def parse_item(self, response):
    """
    Extract fields from the individual email page and load them into the
    item.

    The page URL and the "In reply to:" link are read directly from the
    response. The sender, time and subject fields are delegated to
    ``parse_new_system`` or ``parse_old_system``, depending on whether the
    first comment of the page mentions "MHonArc". When ``self.get_body`` is
    truthy, the text found between the body-delimiting HTML comments is
    loaded as ``body``; if those delimiters cannot be found, a warning is
    logged and ``body`` is left unset instead of raising.

    The lines below starting with an at-sign look like Scrapy spider
    contracts and are kept verbatim.

    @url http://lkml.iu.edu/hypermail/linux/kernel/0111.3/0036.html
    @returns items 1 1
    @scrapes senderName senderEmail timeSent timeReceived subject body
    @scrapes replyto url
    """
    # Function-scope import so the block does not rely on a file-level one.
    import logging

    load = ItemLoader(item=Email(), selector=response)

    # Take care of easy fields first
    load.add_value('url', response.url)

    pattern_replyto = '//ul[1]/li[contains((b|strong), "In reply to:")]'
    pattern_replyto += '/a/@href'
    link = response.xpath(pattern_replyto).extract()
    # Emails that are not replies have no such link: store an empty string.
    load.add_value('replyto', link[0] if link else '')

    # Sometime in 2003, the archive changes and the email pages
    # require specific procedure to extract the following fields:
    specific_fields = {
        'senderName': None,
        'senderEmail': None,
        'timeSent': None,
        'timeReceived': None,
        'subject': None,
    }

    # Detect new archive system with HTML comment
    new_system = response.xpath('/comment()[1][contains(., "MHonArc")]')

    if new_system:
        # If new archive system is detected...
        # NOTE(review): the helper is expected to return the filled-in
        # mapping of field name -> value; confirm against its definition.
        specific_fields = self.parse_new_system(response, specific_fields)
        body_before_comment = '<!--X-Body-of-Message-->'
        body_after_comment = '<!--X-Body-of-Message-End-->'
    else:
        # Otherwise...
        specific_fields = self.parse_old_system(response, specific_fields)
        body_before_comment = '<!-- body="start" -->'
        body_after_comment = '<!-- body="end" -->'

    # Load all the values from these specific fields
    for key, val in specific_fields.items():
        load.add_value(key, val)

    if self.get_body:
        # Final field, the body of the email: everything between the two
        # delimiter comments, skipping one optional leading newline.
        pattern_body = body_before_comment + '\n?(.*)' + body_after_comment

        # Ignore invalid bytes when necessary
        page_body = response.body.decode('utf-8', 'ignore')
        # re.S lets '.' span the line breaks inside the message body.
        body = re.search(pattern_body, page_body, flags=re.S)

        if body is None:
            # re.search returns None when the delimiters are missing;
            # keep the other fields rather than crashing on .group().
            logging.getLogger(__name__).warning(
                'No email body found in %s', response.url)
        else:
            load.add_value('body', body.group(1))

    return load.load_item()