"""Sum the sizes of all files reachable from an HTTP directory listing.

Usage: script [BASE_URL [SKIP_COUNT]]

The listing at BASE_URL is walked recursively; every link ending in "/" is
treated as a sub-directory, every other link as a file whose size is taken
from the ``Content-Length`` header of a HEAD request.
"""
import sys

import requests
from hurry.filesize import size, si
from bs4 import BeautifulSoup as Soup

# Running total, in bytes, of every Content-Length seen so far.
total = 0


def add_to_total(uri, BASE=""):
    """Add the size of ``BASE + uri`` to the module-level ``total``.

    The size is read from the ``content-length`` header of a HEAD request.
    Responses without that header are reported and skipped.

    Args:
        uri: Path of the resource, appended to ``BASE``.
        BASE: URL prefix prepended to ``uri``.
    """
    global total
    headers = requests.head(BASE + uri).headers
    if "content-length" not in headers:
        print("{}: No content-length, skipping".format(uri))
        return
    adding = int(headers["content-length"])
    print("{}: Adding {}".format(uri, size(adding, system=si)))
    total += adding


def do_iterate(dir="", BASE="", COL=""):
    """Recursively walk the listing at ``BASE + dir`` and total file sizes.

    Args:
        dir: Path of the directory relative to ``BASE``.
        BASE: URL prefix prepended to every path.
        COL: Number of leading ``<a>`` elements to skip on each page
            (presumably header / parent-directory links of the index page --
            confirm against the target server). May be an int or a numeric
            string; the empty string means "skip nothing".
    """
    # Slice indices must be integers; COL may arrive as a string (CLI
    # argument) or as the "" default.
    skip = int(COL or 0)
    print("Entering {}{}".format(BASE, dir))
    # Name the parser explicitly so the result does not depend on which
    # optional parsers happen to be installed.
    page = Soup(requests.get(BASE + dir).text, "html.parser")
    for a in page.findAll('a')[skip:]:
        href = a.get("href")
        if not href:
            # Anchor without a usable target (e.g. <a name="...">).
            continue
        if href.endswith("/"):
            do_iterate(dir + href, BASE, skip)
        else:
            add_to_total(dir + href, BASE)


def main():
    """Parse command-line arguments, walk the listing and print the total.

    Returns:
        0 on normal completion (used as the process exit status).
    """
    BASE = sys.argv[1] if len(sys.argv) > 1 else "http://file.cite.wa.edu.au/"
    COL = int(sys.argv[2]) if len(sys.argv) > 2 else 3
    try:
        do_iterate("", BASE, COL)
    finally:
        # Report whatever was accumulated even if the walk was interrupted
        # or failed part-way through.
        print("Total size of {}: {} ({})".format(
            BASE, total, size(total, system=si)))
    return 0


if __name__ == "__main__":
    sys.exit(main())