# Patch for crawler.py (git blob 4362f81 -> 5ff9ae2, file mode 100644).
# Single hunk: old lines 16-24 (9 lines) become new lines 16-28 (13 lines).
#
# Lines added by this patch (marked "+"):
#   - `titles`: the "strong" tag elements looked up beneath the elements of
#     class "standardLinkliste", collected once before the loop.
#   - `idx = 0`: a counter initialised before iterating over `links`.
#   - inside the `href.endswith(".xls")` branch: `title = titles[idx].innerHTML`
#     followed by `idx += 1`.
# Net effect: `idx` advances only for links whose href ends in ".xls", so the
# k-th such link is paired with titles[k].
#
# NOTE(review): this pairing assumes one "strong" element per .xls link, in the
#   same order -- confirm against the crawled page. What happens when `titles`
#   is shorter than the number of .xls links depends on the parser's collection
#   type, which is not visible here.
# NOTE(review): `title` is not used inside the visible hunk; presumably it is
#   consumed further down the loop body -- verify.
# NOTE(review): the diff below appears to have had its line breaks collapsed
#   onto one line; the original indentation and blank context lines cannot be
#   recovered from this view, so the text is left untouched.
diff --git a/crawler.py b/crawler.py index 4362f81..5ff9ae2 100644 --- a/crawler.py +++ b/crawler.py @@ -16,9 +16,13 @@ parser.parseStr(content) links = parser.getElementsByClassName("linkGeneric") +titles = parser.getElementsByClassName("standardLinkliste").getElementsByTagName("strong") +idx = 0 for link in links: href = link.getAttribute("href") if href.endswith(".xls"): + title = titles[idx].innerHTML + idx += 1 filenameIndex = href.rfind('/') + 1 xlsFilename = href[filenameIndex:] filename = xlsFilename[:-4]