Changes

Jump to navigation Jump to search
no edit summary
Some example code for a spider is shown below; this was my spider for the oral questions portion of the Moroccan site.
import scrapy
import string
# Listing page for the oral-questions section, with every filter set to "All".
# NOTE(review): the two copies of this URL in the original snippet disagreed
# (one lacked the "?" before the query string, and each lacked a different
# hyphen in the path). This value combines the intact parts of both -- confirm
# it against the live site before running a full crawl.
_LISTING_URL = (
    "http://www.chambredesrepresentants.ma/ar/"
    "%D9%85%D8%B1%D8%A7%D9%82%D8%A8%D8%A9-%D8%A7%D9%84%D8%B9%D9%85%D9%84-"
    "%D8%A7%D9%84%D8%AD%D9%83%D9%88%D9%85%D9%8A/"
    "%D8%A7%D9%84%D8%A3%D8%B3%D9%80%D8%A6%D9%84%D8%A9-"
    "%D8%A7%D9%84%D8%B4%D9%81%D9%88%D9%8A%D8%A9"
    "?field_ministeres_tid=All"
    "&field_groupe_concerne_target_id=All"
    "&field_parlementaires_associes_target_id=All"
    "&body_value="
    "&field_transfere_ou_non_value=All"
)


def _clean_line(lines, index):
    """Return ``lines[index]`` with whitespace runs collapsed, or '' if absent."""
    if index >= len(lines):
        return ''
    return ' '.join(lines[index].split())


class MySpider(scrapy.Spider):
    """Spider that scrapes the oral-questions listing pages.

    Requests the unpaginated listing URL plus one URL per page index in
    ``range(page_range)``, and yields one dict per ``<li>`` found inside
    ``ul.listing_questions``.
    """

    name = "oral"
    # Number of paginated listing URLs to request (page=0 .. page_range-1).
    page_range = 375
    # The comprehension only reads the module-level URL: class attributes are
    # not visible inside a comprehension body on Python 3, whereas the
    # ``range(page_range)`` iterable is evaluated in class scope and is fine.
    start_urls = [_LISTING_URL] + [
        _LISTING_URL + "&page=" + str(num) for num in range(page_range)
    ]

    def parse(self, response):
        """Yield one item per question entry on a listing page.

        Each item has the keys ``date``, ``info1``, ``info2``, ``info3``,
        ``question`` and ``url``. An entry without its own
        ``h3.sorting_date`` reuses the date of the closest preceding entry
        that had one; ``date`` is ``None`` when no entry on the page has
        carried a date yet.
        """
        last_date = None
        for entry in response.css('ul.listing_questions').css('li'):
            date = entry.css('h3.sorting_date::text').extract_first()
            if date is None:
                # No header on this entry: fall back to the last date seen.
                date = last_date
            else:
                last_date = date

            question = entry.css('a::text').extract_first()

            info = ''.join(entry.css('div.questionss_group::text').extract())
            # str.split works on Python 2 and 3; string.split() is Python 2 only.
            info_lines = info.split("\n")

            # The fields are picked by fixed line position (2, 4 and 5) in the
            # text of div.questionss_group. NOTE(review): what each position
            # holds is not visible here -- check against the page markup.
            # Entries with fewer lines produce '' instead of raising.
            yield {
                'date': date,
                'info1': _clean_line(info_lines, 2),
                'info2': _clean_line(info_lines, 4),
                'info3': _clean_line(info_lines, 5),
                'question': question,
                'url': response.url,
            }

Navigation menu