lawtechie-story-links.py
· 2.6 KiB · Python
Raw
#!/usr/bin/env python3
from requests import get
from json import load,dump,dumps
from collections import defaultdict
from markdown import markdown
from lxml import etree
from time import sleep
# Output mode flag: True emits HTML <dt>/<dd> markup, False emits plain text.
# Rebound by main() via `global html`.
html = False
# Reddit search endpoint; get_results() pages through it using the "after" cursor.
url = "https://www.reddit.com/search.json"
# Base search parameters. get_results() adds an "after" key while paginating.
# NOTE(review): reddit may return fewer items per page than limit=1000 — the
# "after" loop in get_results() picks up the remainder.
query = {"q": "subreddit:talesfromtechsupport author:lawtechie", "sort": "new", "limit": "1000"}
def get_results():
    """Return the list of search-result "children" dicts for the stories.

    Tries the local cache file first; if it is missing or unparseable,
    pages through reddit's search API (following the "after" cursor),
    writes the cache, and returns the combined results.
    """
    try:
        with open("lawtechie-search-20200609.json") as f:
            return load(f)
    except (OSError, ValueError):
        # Cache missing or corrupt -> fetch fresh. ValueError covers
        # json.JSONDecodeError. (Was a bare `except:`, which also swallowed
        # KeyboardInterrupt/SystemExit.)
        pass
    print("getting results")
    # Hoisted out of the two call sites; identical for every request.
    headers = {"User-Agent": "/u/lawtechie story directory by /u/suudo"}
    # Work on a copy so paginating does not mutate the module-level query dict.
    params = dict(query)
    data = get(url, params=params, headers=headers).json()["data"]
    results = data["children"]
    print("added {} results".format(data["dist"]))
    while data["after"] is not None:
        params["after"] = data["after"]
        print("getting after: {}".format(data["after"]))
        data = get(url, params=params, headers=headers).json()["data"]
        results += data["children"]
        print("added {} results".format(data["dist"]))
        sleep(1)  # be polite to reddit's rate limiting
    with open("lawtechie-search-20200609.json", "w") as f:
        dump(results, f)
    return results
def l(i, alt=None):
    """Render story id *i* as a link.

    In HTML mode, returns an anchor tag whose text is *alt* (or the id
    itself when no alt is given); otherwise returns the bare redd.it URL.
    """
    global html
    target = "http://redd.it/" + i
    if not html:
        return target
    return "<a href='{}'>{}</a>".format(target, alt or i)
def parse_url(url):
    """Extract a reddit post id from a story link.

    Handles talesfromtechsupport comment permalinks and redd.it short
    links. Returns None for anything else — including a None/empty url
    (anchors in raw HTML may carry no href, which previously raised
    TypeError here).
    """
    if not url:
        return None
    if "reddit.com/r/talesfromtechsupport" in url:
        # .../comments/<id>/<slug>/... -> <id>
        return url.split("/comments/",1)[1].split("/",1)[0]
    elif "redd.it/" in url:
        # NOTE(review): assumes 6-char base36 ids (true for this 2020 data
        # set); a 7-char id would be truncated here.
        return url.split("redd.it/",1)[1][:6]
    return None
def main(html_param=False):
    """Print a directory of which lawtechie stories link to which.

    For every story, renders its selftext markdown, extracts the anchors,
    and groups linking stories under each linked-to story. Output is an
    HTML definition list when html_param is True, indented plain text
    otherwise. Returns 0 (process exit status).
    """
    global html
    html = html_param
    results = get_results()
    # id -> story data, for looking up titles of link targets.
    stories = {story["data"]["id"]: story["data"] for story in results}
    story_links = defaultdict(list)
    for story in results:
        story = story["data"]
        # Wrap in <body> so the rendered fragment parses as one element.
        mkdn = "<body>" + markdown(story["selftext"]) + "</body>"
        doc = etree.fromstring(mkdn)
        for link in doc.xpath("//a"):
            href = link.get("href")
            if href is None:
                # Raw-HTML anchors can lack an href; parse_url would
                # previously crash with TypeError on None.
                continue
            dest_id = parse_url(href)
            if not dest_id:
                continue
            dest_title = stories.get(dest_id, {}).get("title", "UNKNOWN")
            story_links[l(dest_id, alt=dest_title)].append(l(story["id"], alt=story["title"]))
    for s, links in story_links.items():
        if html:
            print("<dt>\n {}\n</dt><dd><ul>\n{}\n</ul></dd>".format(s, "\n".join(" <li>{}</li>".format(link) for link in links)), end="")
        else:
            print("{}\n {}".format(s, "\n ".join(links)))
    if html:
        print()
    return 0
if __name__ == "__main__":
    # Script entry point: --html anywhere on the command line selects HTML output.
    import sys
    sys.exit(main(html_param="--html" in sys.argv))
| 1 | #!/usr/bin/env python3 |
| 2 | from requests import get |
| 3 | from json import load,dump,dumps |
| 4 | from collections import defaultdict |
| 5 | from markdown import markdown |
| 6 | from lxml import etree |
| 7 | from time import sleep |
| 8 | |
| 9 | html = False |
| 10 | url = "https://www.reddit.com/search.json" |
| 11 | query = {"q": "subreddit:talesfromtechsupport author:lawtechie", "sort": "new", "limit": "1000"} |
| 12 | |
| 13 | def get_results(): |
| 14 | try: |
| 15 | with open("lawtechie-search-20200609.json") as f: |
| 16 | return load(f) |
| 17 | except: |
| 18 | print("getting results") |
| 19 | data = get(url, params=query, headers={"User-Agent": "/u/lawtechie story directory by /u/suudo"}).json()["data"] |
| 20 | results = data["children"] |
| 21 | print("added {} results".format(data["dist"])) |
| 22 | while data["after"] is not None: |
| 23 | query["after"] = data["after"] |
| 24 | print("getting after: {}".format(data["after"])) |
| 25 | data = get(url, params=query, headers={"User-Agent": "/u/lawtechie story directory by /u/suudo"}).json()["data"] |
| 26 | results += data["children"] |
| 27 | print("added {} results".format(data["dist"])) |
| 28 | sleep(1) |
| 29 | with open("lawtechie-search-20200609.json", "w") as f: |
| 30 | dump(results,f) |
| 31 | return results |
| 32 | |
| 33 | def l(i, alt=None): |
| 34 | global html |
| 35 | if html: |
| 36 | return "<a href='http://redd.it/{}'>{}</a>".format(i, alt or i) |
| 37 | else: |
| 38 | return "http://redd.it/" + i |
| 39 | |
| 40 | def parse_url(url): |
| 41 | if "reddit.com/r/talesfromtechsupport" in url: |
| 42 | return url.split("/comments/",1)[1].split("/",1)[0] |
| 43 | elif "redd.it/" in url: |
| 44 | return url.split("redd.it/",1)[1][:6] |
| 45 | |
| 46 | |
| 47 | def main(html_param=False): |
| 48 | global html |
| 49 | html = html_param |
| 50 | results = get_results() |
| 51 | stories = {story["data"]["id"]: story["data"] for story in results} |
| 52 | story_links = defaultdict(list) |
| 53 | for story in results: |
| 54 | story = story["data"] |
| 55 | mkdn = "<body>" + markdown(story["selftext"]) + "</body>" |
| 56 | doc = etree.fromstring(mkdn) |
| 57 | for link in doc.xpath("//a"): |
| 58 | dest_id = parse_url(link.get("href")) |
| 59 | if not dest_id: |
| 60 | continue |
| 61 | dest_title = stories.get(dest_id, {}).get("title", "UNKNOWN") |
| 62 | story_links[l(dest_id, alt=dest_title)].append(l(story["id"], alt=story["title"])) |
| 63 | for s,links in story_links.items(): |
| 64 | if html: |
| 65 | print("<dt>\n {}\n</dt><dd><ul>\n{}\n</ul></dd>".format(s, "\n".join(" <li>{}</li>".format(link) for link in links)), end="") |
| 66 | else: |
| 67 | print("{}\n {}".format(s, "\n ".join(links))) |
| 68 | if html: |
| 69 | print() |
| 70 | return 0 |
| 71 | |
| 72 | if __name__ == "__main__": |
| 73 | from sys import exit, argv |
| 74 | exit(main(html_param="--html" in argv)) |