"""Print the href of every ``linkGeneric`` element that links to an .xls file.

The script parses the local file ``bundestag.html``, collects all elements
carrying the CSS class ``linkGeneric`` and writes each matching ``href``
value to stdout, one per line.
"""
import AdvancedHTMLParser

parser = AdvancedHTMLParser.AdvancedHTMLParser()

parser.parseFile("bundestag.html")
links = parser.getElementsByClassName("linkGeneric")
for link in links:
    href = link.getAttribute("href")
    # An element without an href attribute yields None; skip it instead of
    # crashing on None.endswith(...).
    if href and href.endswith(".xls"):
        # Single-argument call form prints identically on Python 2 and 3,
        # whereas the bare `print href` statement is a SyntaxError on 3.
        print(href)