Difference between revisions of "Moroccan Parliament Web Crawler"
Peterjalbert (talk | contribs) |
Peterjalbert (talk | contribs) |
||
Line 28: | Line 28: | ||
[http://www.chambredesrepresentants.ma/ar/%D8%A7%D9%84%D8%AA%D8%B4%D8%B1%D9%8A%D8%B9/%D9%85%D8%B4%D8%A7%D8%B1%D9%8A%D8%B9-%D8%A7%D9%84%D9%82%D9%88%D8%A7%D9%86%D9%8A%D9%86 Monarchy Proposed Bills] | [http://www.chambredesrepresentants.ma/ar/%D8%A7%D9%84%D8%AA%D8%B4%D8%B1%D9%8A%D8%B9/%D9%85%D8%B4%D8%A7%D8%B1%D9%8A%D8%B9-%D8%A7%D9%84%D9%82%D9%88%D8%A7%D9%86%D9%8A%D9%86 Monarchy Proposed Bills] | ||
+ | |||
+ | The data that needs to be extracted from this site includes the pdfs of all the bill pages, as well as any interior pdfs on each page. The bill pages should be named by their url, and the interior pdfs should be named by their respective bill numbers. | ||
===Moroccan House of Representatives Proposed Bills=== | ===Moroccan House of Representatives Proposed Bills=== | ||
[http://www.chambredesrepresentants.ma/ar/%D8%A7%D9%84%D8%AA%D8%B4%D8%B1%D9%8A%D8%B9/%D9%84%D8%A7%D8%A6%D8%AD%D8%A9-%D9%85%D9%82%D8%AA%D8%B1%D8%AD%D8%A7%D8%AA-%D8%A7%D9%84%D9%82%D9%88%D8%A7%D9%86%D9%8A%D9%86 House Proposed Bills] | [http://www.chambredesrepresentants.ma/ar/%D8%A7%D9%84%D8%AA%D8%B4%D8%B1%D9%8A%D8%B9/%D9%84%D8%A7%D8%A6%D8%AD%D8%A9-%D9%85%D9%82%D8%AA%D8%B1%D8%AD%D8%A7%D8%AA-%D8%A7%D9%84%D9%82%D9%88%D8%A7%D9%86%D9%8A%D9%86 House Proposed Bills] | ||
+ | |||
+ | See Monarchy proposed bills for instructions. | ||
===Moroccan Legislature Ratified Bills=== | ===Moroccan Legislature Ratified Bills=== | ||
[http://www.chambredesrepresentants.ma/ar/%D8%A7%D9%84%D8%AA%D8%B4%D8%B1%D9%8A%D8%B9/%D8%A7%D9%84%D9%86%D8%B5%D9%88%D8%B5-%D8%A7%D9%84%D8%AA%D9%8A-%D8%B5%D8%A7%D8%AF%D9%82-%D8%B9%D9%84%D9%8A%D9%87%D8%A7-%D9%85%D8%AC%D9%84%D8%B3-%D8%A7%D9%84%D9%86%D9%88%D8%A7%D8%A8?field_legislature_tid=All&field_nature_loi_tid=All&page=27 Ratified Bills] | [http://www.chambredesrepresentants.ma/ar/%D8%A7%D9%84%D8%AA%D8%B4%D8%B1%D9%8A%D8%B9/%D8%A7%D9%84%D9%86%D8%B5%D9%88%D8%B5-%D8%A7%D9%84%D8%AA%D9%8A-%D8%B5%D8%A7%D8%AF%D9%82-%D8%B9%D9%84%D9%8A%D9%87%D8%A7-%D9%85%D8%AC%D9%84%D8%B3-%D8%A7%D9%84%D9%86%D9%88%D8%A7%D8%A8?field_legislature_tid=All&field_nature_loi_tid=All&page=27 Ratified Bills] | ||
+ | |||
+ | See Monarchy proposed bills for instructions. | ||
===Moroccan Legislature Oral Questions=== | ===Moroccan Legislature Oral Questions=== | ||
Line 44: | Line 50: | ||
[http://www.chambredesrepresentants.ma/ar/%D8%A7%D9%84%D8%A3%D8%B3%D9%80%D8%A6%D9%84%D8%A9-%D8%A7%D9%84%D9%83%D8%AA%D8%A7%D8%A8%D9%8A%D8%A9 Written Questions] | [http://www.chambredesrepresentants.ma/ar/%D8%A7%D9%84%D8%A3%D8%B3%D9%80%D8%A6%D9%84%D8%A9-%D8%A7%D9%84%D9%83%D8%AA%D8%A7%D8%A8%D9%8A%D8%A9 Written Questions] | ||
+ | |||
+ | |||
+ | ===Example Code=== | ||
+ | |||
+ | #General Bill Download | ||
+ | |||
+ | from selenium import webdriver | ||
+ | from selenium.webdriver.common.action_chains import ActionChains | ||
+ | from selenium.webdriver.common.keys import Keys | ||
+ | import time | ||
+ | import urllib | ||
+ | import string | ||
+ | import re | ||
+ | |||
+ | #launch Google Chrome Browser | ||
+ | driver = webdriver.Chrome() | ||
+ | |||
+ | def switch_window(): | ||
+ | handles = driver.window_handles | ||
+ | driver.switch_to_window(handles[-1]) | ||
+ | |||
+ | |||
+ | |||
+ | #Visit desired website | ||
+ | driver.get('http://www.chambredesrepresentants.ma/ar/%D8%A7%D9%84%D8%AA%D8%B4%D8%B1%D9%8A%D8%B9/%D9%84%D8%A7%D8%A6%D8%AD%D8%A9- %D9%85%D9%82%D8%AA%D8%B1%D8%AD%D8%A7%D8%AA-%D8%A7%D9%84%D9%82%D9%88%D8%A7%D9%86%D9%8A%D9%86?body_value=&field_og_commission_target_id=All') | ||
+ | |||
+ | bills_list = driver.find_elements_by_xpath("//li/h3/a") | ||
+ | for i in range(len(bills_list)): | ||
+ | ActionChains(driver).key_down(Keys.SHIFT).perform() | ||
+ | bills_list[i].click() | ||
+ | ActionChains(driver).key_up(Keys.SHIFT).perform() | ||
+ | |||
+ | switch_window() | ||
+ | url = driver.current_url | ||
+ | unicode_url = urllib.unquote(str(url)).decode('utf8') | ||
+ | |||
+ | url_parts = string.split(unicode_url, "/") | ||
+ | i = len(url_parts) | ||
+ | |||
+ | |||
+ | #Build arabic tag backwards, accounting for backwards spelling | ||
+ | tag = "" | ||
+ | while i > 4: | ||
+ | tag += url_parts[i - 1] | ||
+ | i -= 1 | ||
+ | #Navigate to pdf of website | ||
+ | change_button = driver.find_elements_by_xpath("//a [@class='pdf' and @rel='nofollow']")[0] | ||
+ | ActionChains(driver).key_down(Keys.SHIFT).perform() | ||
+ | change_button.click() | ||
+ | ActionChains(driver).key_up(Keys.SHIFT).perform() | ||
+ | |||
+ | switch_window() | ||
+ | |||
+ | #Gets current window's URL | ||
+ | url = driver.current_url | ||
+ | |||
+ | #Saves file at URL to current directory | ||
+ | urllib.urlretrieve(url, tag) | ||
+ | |||
+ | |||
+ | driver.close() | ||
+ | |||
+ | switch_window() | ||
+ | |||
+ | |||
+ | pdfs_on_page = driver.find_elements_by_xpath("//div/div/div/article/div/ul/li/a") | ||
+ | |||
+ | #finds interior pdfs on the page | ||
+ | if pdfs_on_page: | ||
+ | for j in range(len(pdfs_on_page)): | ||
+ | element = pdfs_on_page[j] | ||
+ | |||
+ | #click on pdf | ||
+ | ActionChains(driver).key_down(Keys.SHIFT).perform() | ||
+ | element.click() | ||
+ | ActionChains(driver).key_up(Keys.SHIFT).perform() | ||
+ | |||
+ | switch_window() | ||
+ | |||
+ | url = driver.current_url | ||
+ | pdf_tag = string.split(str(url), "/")[-1] | ||
+ | |||
+ | #leaves link if it is not a pdf | ||
+ | if re.findall(".pdf", pdf_tag): | ||
+ | |||
+ | #saves interior pdf | ||
+ | urllib.urlretrieve(url, pdf_tag) | ||
+ | |||
+ | |||
+ | driver.close() | ||
+ | |||
+ | switch_window() | ||
+ | |||
+ | driver.close() | ||
+ | |||
+ | switch_window() | ||
+ | |||
+ | |||
+ | print "download complete" | ||
+ | |||
+ | #close browser | ||
+ | driver.quit() | ||
===Further Inquiries=== | ===Further Inquiries=== |
Revision as of 13:42, 13 October 2016
Moroccan Parliament Web Crawler | |
---|---|
Project Information | |
Project Title | |
Start Date | |
Deadline | |
Primary Billing | |
Notes | |
Has project status | |
Copyright © 2016 edegan.com. All Rights Reserved. |
Contents
Overview
This web driver is designed to save information from the Moroccan Legislature website as a pdf before the information is removed from the website due to lack of space.
On the right hand side of the website, the final bullet is a link to archived bills.
On the right hand side of the website, one bullet above the last header, is a link to the list of links to all the proposed bills. When clicked, the user is directed to the most recent batch of links to proposed bills. The navigation button on the bottom left, labelled in Arabic, means "go to last page", and the second from the left means "previous page". When "go to last page" is clicked, the batch of links to proposed bills that are about to be removed from the website is listed, with the oldest link at the bottom of the list.
When any proposed bill link is clicked, it opens up a page with information about the bill, and about discussions in parliament. There are up to two hyperlinks on this page, the first of which is a pdf of the original bill. If there is a second link further below, it contains a pdf of the Committee Report for the bill (if it was sent to a Committee).
The two PDFs, as well as the webpage of the proposed bill, may be subject to record keeping.
Driving Opportunities
Five pages within the Moroccan Legislature site have data that needs to be recorded.
Moroccan Monarchy Proposed Bills
The data that needs to be extracted from this site includes the pdfs of all the bill pages, as well as any interior pdfs on each page. The bill pages should be named by their url, and the interior pdfs should be named by their respective bill numbers.
Moroccan House of Representatives Proposed Bills
See Monarchy proposed bills for instructions.
Moroccan Legislature Ratified Bills
See Monarchy proposed bills for instructions.
Moroccan Legislature Oral Questions
Moroccan Legislature Written Questions
Example Code
#General Bill Download
from selenium import webdriver from selenium.webdriver.common.action_chains import ActionChains from selenium.webdriver.common.keys import Keys import time import urllib import string import re
# Launch a Google Chrome browser session. `driver` is the module-level
# WebDriver instance that the rest of the script reads.
driver = webdriver.Chrome()
def switch_window():
    """Move WebDriver focus to the last handle in ``driver.window_handles``.

    The script calls this after opening a window (to enter it) and after
    closing one (to fall back to the window that is now last in the list).

    NOTE(review): this assumes the last handle is the most recently opened
    window; WebDriver does not formally guarantee handle ordering.
    """
    handles = driver.window_handles
    # `switch_to.window` replaces the deprecated `switch_to_window`, which
    # was removed in Selenium 4.
    driver.switch_to.window(handles[-1])
# Listing page of the House of Representatives proposed bills.
# The original literal had a stray space after "%D8%A9-", which made the URL
# invalid; it is removed here.
BILLS_LIST_URL = 'http://www.chambredesrepresentants.ma/ar/%D8%A7%D9%84%D8%AA%D8%B4%D8%B1%D9%8A%D8%B9/%D9%84%D8%A7%D8%A6%D8%AD%D8%A9-%D9%85%D9%82%D8%AA%D8%B1%D8%AD%D8%A7%D8%AA-%D8%A7%D9%84%D9%82%D9%88%D8%A7%D9%86%D9%8A%D9%86?body_value=&field_og_commission_target_id=All'

# XPath selectors used on the listing page and on each bill page.
BILL_LINKS_XPATH = "//li/h3/a"
PAGE_PDF_BUTTON_XPATH = "//a [@class='pdf' and @rel='nofollow']"
INTERIOR_PDF_LINKS_XPATH = "//div/div/div/article/div/ul/li/a"


def _shift_click(element):
    """Click *element* while SHIFT is held down.

    The script relies on the shift-click opening the link in a new browser
    window, which it then enters via switch_window().
    """
    ActionChains(driver).key_down(Keys.SHIFT).perform()
    element.click()
    ActionChains(driver).key_up(Keys.SHIFT).perform()


# NOTE: urllib.unquote / urllib.urlretrieve are the Python 2 locations of
# these functions; this script targets Python 2.
try:
    # Visit the bill listing page.
    driver.get(BILLS_LIST_URL)

    for bill_link in driver.find_elements("xpath", BILL_LINKS_XPATH):
        # Open the bill page in its own window and move into it.
        _shift_click(bill_link)
        switch_window()

        # Name the saved page after its decoded URL: every path component
        # from index 4 onwards, concatenated last-to-first (the original
        # script built the Arabic tag backwards on purpose).
        page_url = urllib.unquote(str(driver.current_url)).decode('utf8')
        url_parts = page_url.split("/")
        tag = "".join(reversed(url_parts[4:]))

        # Save the PDF rendering of the bill page, if the page offers one.
        # A missing button is skipped instead of raising IndexError.
        pdf_buttons = driver.find_elements("xpath", PAGE_PDF_BUTTON_XPATH)
        if pdf_buttons:
            _shift_click(pdf_buttons[0])
            switch_window()
            # Saves the file at the PDF window's URL to the current directory.
            urllib.urlretrieve(driver.current_url, tag)
            driver.close()
            switch_window()  # back on the bill page

        # Download the interior PDFs linked from the bill page.
        for pdf_link in driver.find_elements("xpath", INTERIOR_PDF_LINKS_XPATH):
            _shift_click(pdf_link)
            switch_window()

            url = driver.current_url
            pdf_tag = str(url).split("/")[-1]

            # Only save links that point at a PDF. The dot is escaped so
            # that names merely containing "pdf" no longer match, and the
            # match is case-insensitive so ".PDF" files are kept.
            if re.search(r"\.pdf", pdf_tag, re.IGNORECASE):
                urllib.urlretrieve(url, pdf_tag)

            driver.close()
            switch_window()  # back on the bill page

        # Done with this bill: close its window and return to the listing.
        driver.close()
        switch_window()

    print("download complete")
finally:
    # Close the browser even if the crawl stopped on an error.
    driver.quit()
Further Inquiries
Further inquiries have been requested for the Kuwaiti, Tunisian, and Algerian Parliaments.
Kuwait Parliament
Data to download (Tables and PDF files): Monarchy - Proposed Bills (مشروع بقانون); For 13th and 14th terms. Parliament - Proposed Bills (اقتراح بقانون); for 14th term: Proposals (اقتراح بــرغــبـــة): For 13th term and 14th terms Meeting Agendas/Minutes (جدول اعمال الجلسه); for 14th term
Tunisian Parliament
Data to Download(save web pages as PDFs and documents as PDFs):
Bills proposed to parliament(مشروع قانون معروض على المجلس): click here Bills proposed to legislative committees (مشروع قانون معروض على اللجان): click here General Assembly Deliberations (الجلسات العامة): click here
Algerian Parliament
Data to download: Bills that have been voted on (القوانين المصوت عليها الفترة التشريعية السابعة : 2012 - 2017): click here Parliamentary initiatives/questions (المبادرات البرلمانية): click here