Search
 
SCRIPT & CODE EXAMPLE
 
CODE EXAMPLE FOR PYTHON

scrapy itemloader example

def parse_item(self, response):
        """
        Extract fields from the individual email page and load them into the
        item.

        @url http://lkml.iu.edu/hypermail/linux/kernel/0111.3/0036.html
        @returns items 1 1
        @scrapes senderName senderEmail timeSent timeReceived subject body
        @scrapes replyto url
        """

        load = ItemLoader(item=Email(), selector=response)

        # Take care of easy fields first
        load.add_value('url', response.url)

        pattern_replyto = '//ul[1]/li[contains((b|strong), "In reply to:")]'
        pattern_replyto += '/a/@href'
        link = response.xpath(pattern_replyto).extract()
        link = [''] if not link else link

        load.add_value('replyto', link[0])

        # Sometime in 2003, the archive changes and the email pages
        # require specific procedure to extract the following fields:
        specific_fields = {
            'senderName': None,
            'senderEmail': None,
            'timeSent': None,
            'timeReceived': None,
            'subject': None
        }

        # Detect new archive system with HTML comment
        new_system = response.xpath('/comment()[1][contains(., "MHonArc")]')

        if len(new_system) >= 1:
            # If new archive system is detected...
            specific_fields = self.parse_new_system(response, specific_fields)
            body_before_comment = '<!--X-Body-of-Message-->'
            body_after_comment = '<!--X-Body-of-Message-End-->'
        else:
            # Otherwise...
            specific_fields = self.parse_old_system(response, specific_fields)
            body_before_comment = '<!-- body="start" -->'
            body_after_comment = '<!-- body="end" -->'

        # Load all the values from these specific fields
        for key, val in specific_fields.items():
            load.add_value(key, val)

        if self.get_body:
            # Final field, the body of the email
            pattern_body = body_before_comment + '
?(.*)' + body_after_comment

            # Ignore invalid bytes when necessary
            page_body = response.body.decode('utf-8', 'ignore')
            body = re.search(pattern_body, page_body, flags=re.S)
            load.add_value('body', body.group(1))

        return load.load_item() 
 
PREVIOUS NEXT
Tagged: #scrapy #itemloader
ADD COMMENT
Topic
Name
3+3 =