# Last active 1440821683
#
# Revision 518e2443933f8634d42e116362eb7314fec4af8d
#
# apache_index_total.py Raw
import sys

import requests
from bs4 import BeautifulSoup as Soup
from hurry.filesize import size, si
# Running sum, in bytes, of every Content-Length counted so far. It is
# incremented by add_to_total() and reported by main().
total = 0
def add_to_total(uri, BASE=""):
    """Add the size reported for ``BASE + uri`` to the global ``total``.

    Sends an HTTP HEAD request and reads the ``Content-Length`` response
    header, so the resource body itself is never downloaded.

    Args:
        uri: Path of the resource; appended verbatim to ``BASE``.
        BASE: URL prefix that ``uri`` is relative to.

    The resource is skipped, with a message on stdout, when the server
    answers with an error status, sends no ``Content-Length``, or sends a
    value that is not an integer.
    """
    global total
    # requests.head() does not follow redirects unless asked to; without
    # this a redirected file would contribute the redirect stub's size.
    response = requests.head(BASE + uri, allow_redirects=True)
    if not response.ok:
        # Error pages carry a Content-Length too; counting it would
        # inflate the total with bytes that are not part of any file.
        print("{}: HTTP {}, skipping".format(uri, response.status_code))
        return
    length = response.headers.get("content-length")
    if length is None:
        print("{}: No content-length, skipping".format(uri))
        return
    try:
        adding = int(length)
    except ValueError:
        # One malformed header should not abort the whole crawl.
        print("{}: Invalid content-length {!r}, skipping".format(uri, length))
        return
    print("{}: Adding {}".format(uri, size(adding, system=si)))
    total += adding
def do_iterate(dir="", BASE="", COL=""):
    """Recursively walk a directory index page and total the linked files.

    Fetches ``BASE + dir``, inspects the ``<a>`` elements on the page and,
    for each one, either recurses (href ends in ``/``) or passes the link
    to add_to_total().

    Args:
        dir: Directory path relative to ``BASE``; "" requests ``BASE``
            itself. (The name shadows the builtin but is part of the
            existing signature, so it is kept.)
        BASE: URL prefix prepended to every request.
        COL: Number of leading ``<a>`` elements to skip on each page --
            presumably the listing's header links; confirm against the
            target server. The legacy default "" is treated as 0.
    """
    print("Entering {}{}".format(BASE, dir))
    # The declared default "" is not a valid slice bound (TypeError), so
    # map any falsy value to "skip nothing".
    skip = COL or 0
    # Name the parser explicitly so the result does not depend on which
    # optional parser libraries happen to be installed.
    page = Soup(requests.get(BASE + dir).text, "html.parser")
    for anchor in page.find_all("a")[skip:]:
        href = anchor.get("href")
        if not href:
            # Anchors without a target (or with an empty one) give us
            # nothing to request; indexing them used to raise.
            continue
        if href.endswith("/"):
            do_iterate(dir + href, BASE, COL)
        else:
            add_to_total(dir + href, BASE)
def main(argv=None):
    """Crawl a directory index and print the total size of its files.

    Args:
        argv: Argument list laid out like ``sys.argv`` (program name
            first); defaults to ``sys.argv``. ``argv[1]`` is the base URL
            and ``argv[2]`` is the number of leading links skipped on each
            index page. Both are optional.

    Returns:
        0 once the crawl has finished.

    Raises:
        ValueError: if ``argv[2]`` is present but not an integer.
    """
    if argv is None:
        argv = sys.argv
    base_url = argv[1] if len(argv) > 1 else "http://file.cite.wa.edu.au/"
    skip_links = int(argv[2]) if len(argv) > 2 else 3
    try:
        do_iterate("", base_url, skip_links)
    finally:
        # Report whatever has been counted even when the crawl is
        # interrupted (Ctrl-C) or fails part-way through.
        print("Total size of {}: {} ({})".format(
            base_url, total, size(total, system=si)))
    # Kept outside the ``finally`` on purpose: a ``return`` inside it would
    # discard any in-flight exception and always exit with status 0.
    return 0
# Run the crawl only when executed as a script; main()'s return value
# becomes the process exit status.
if __name__ == "__main__":
    sys.exit(main())