diff --git a/crawler.py b/crawler.py index 5022173..e8d50d1 100755 --- a/crawler.py +++ b/crawler.py @@ -11,13 +11,13 @@ parser = AdvancedHTMLParser.AdvancedHTMLParser(); -url = 'https://www.bundestag.de/bundestag/plenum/abstimmung/2016' base_url = 'https://www.bundestag.de' +url = 'https://www.bundestag.de/ajax/filterlist/de/parlament/plenum/abstimmung/liste/-/462112' content = urllib.request.urlopen(url).read().decode('utf-8') parser.parseStr(content) -links = parser.getElementsByClassName("linkGeneric") -titles = parser.getElementsByClassName("standardLinkliste").getElementsByTagName("strong") +links = parser.getElementsByClassName("bt-link-dokument") +titles = parser.getElementsByClassName("bt-documents-description").getElementsByTagName("p").getElementsByTagName("strong") idx = 0 for link in links: href = link.getAttribute("href")