diff --git a/crawler.py b/crawler.py
index 793af48..2992760 100644
--- a/crawler.py
+++ b/crawler.py
@@ -1,13 +1,36 @@
+import os
 import AdvancedHTMLParser
+import urllib.request
+import xlrd
+import csv
+
 # crawls https://www.bundestag.de/bundestag/plenum/abstimmung/2016 for votes in xls format
 # should then evaluate and store the results in a database
 
 parser = AdvancedHTMLParser.AdvancedHTMLParser();
-parser.parseFile("bundestag.html")
+url = 'https://www.bundestag.de/bundestag/plenum/abstimmung/2016'
+base_url = 'https://www.bundestag.de'
+content = urllib.request.urlopen(url).read().decode('utf-8')
+
+parser.parseStr(content)
 
 links = parser.getElementsByClassName("linkGeneric")
 
 for link in links:
     href = link.getAttribute("href")
     if href.endswith(".xls"):
-        print href
+        filenameIndex = href.rfind('/') + 1
+        xlsFilename = href[filenameIndex:]
+        filename = xlsFilename[:-4]
+        csvFilename = filename + '.csv'
+        urllib.request.urlretrieve(base_url + href, xlsFilename)
+        workbook = xlrd.open_workbook(xlsFilename)
+        worksheet = workbook.sheet_by_index(0)
+        csvFile = open(csvFilename, 'w', newline='')
+        writer = csv.writer(csvFile, quoting=csv.QUOTE_ALL)
+
+        for rownum in range(worksheet.nrows):
+            writer.writerow(worksheet.row_values(rownum))
+
+        csvFile.close()
+        os.remove(xlsFilename)
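
The second comment flags a follow-up step the diff does not implement yet: evaluating the votes and storing them in a database. A minimal sketch of that step, assuming the CSV files written above sit in the working directory; the votes.db filename and the single-table source/row_json layout are placeholders of my own, not anything the crawler defines (stdlib sqlite3, csv, glob, and json only):

import csv
import glob
import json
import sqlite3

# Hypothetical loading step: gather every CSV the crawler produced into one
# SQLite table. The real Bundestag sheets have named columns; storing each
# row as a JSON array defers the schema decision until the evaluation step
# is designed.
conn = sqlite3.connect('votes.db')
conn.execute('CREATE TABLE IF NOT EXISTS votes (source TEXT, row_json TEXT)')

for csvPath in glob.glob('*.csv'):
    with open(csvPath, newline='') as f:
        for row in csv.reader(f):
            conn.execute(
                'INSERT INTO votes (source, row_json) VALUES (?, ?)',
                (csvPath, json.dumps(row)),
            )

conn.commit()
conn.close()

Loading raw rows first and normalising later keeps the crawler and the evaluation decoupled: once the sheet layout is pinned down, the JSON rows can be migrated into typed columns without re-downloading anything.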