#!/usr/bin/python3
import os
import AdvancedHTMLParser
import urllib.request
import xlrd
import csv
# crawls https://www.bundestag.de/bundestag/plenum/abstimmung/2016 for votes in xls format
# should then evaluate and store the results in a database
# Scrape the Bundestag plenary-vote page for roll-call results published as
# .xls workbooks, convert the first sheet of each workbook to a CSV named
# after the vote title, and delete the downloaded .xls afterwards.
parser = AdvancedHTMLParser.AdvancedHTMLParser()
url = 'https://www.bundestag.de/bundestag/plenum/abstimmung/2016'
base_url = 'https://www.bundestag.de'
content = urllib.request.urlopen(url).read().decode('utf-8')
parser.parseStr(content)
links = parser.getElementsByClassName("linkGeneric")
# NOTE(review): this assumes the n-th <strong> title inside
# "standardLinkliste" corresponds to the n-th .xls link in document
# order — verify against the page markup before relying on it.
titles = parser.getElementsByClassName("standardLinkliste").getElementsByTagName("strong")
idx = 0
for link in links:
    href = link.getAttribute("href")
    if not href.endswith(".xls"):
        # Only the Excel download links carry vote data; skip everything else.
        continue
    title = titles[idx].innerHTML
    idx += 1
    # Basename of the download URL, used as the temporary local filename.
    xlsFilename = href[href.rfind('/') + 1:]
    # Strip '/' because it is illegal in filenames; other characters that are
    # invalid on some filesystems (':', '?', ...) are NOT handled here.
    csvFilename = title.replace('/', '') + '.csv'
    urllib.request.urlretrieve(base_url + href, xlsFilename)
    workbook = xlrd.open_workbook(xlsFilename)
    worksheet = workbook.sheet_by_index(0)
    # 'with' guarantees the CSV handle is closed even if a row write raises.
    with open(csvFilename, 'w', newline='', encoding='utf-8') as csvFile:
        writer = csv.writer(csvFile, quoting=csv.QUOTE_ALL)
        for rownum in range(worksheet.nrows):
            writer.writerow(worksheet.row_values(rownum))
    # The .xls was only an intermediate download; keep just the CSV.
    os.remove(xlsFilename)