Alyssa Smith revised this gist. Go to revision
1 file changed, 22 insertions, 8 deletions
lawtechie-story-links.py
| @@ -1,10 +1,12 @@ | |||
| 1 | 1 | #!/usr/bin/env python3 | |
| 2 | 2 | from requests import get | |
| 3 | - | from json import load,dump | |
| 3 | + | from json import load,dump,dumps | |
| 4 | 4 | from collections import defaultdict | |
| 5 | 5 | from markdown import markdown | |
| 6 | 6 | from lxml import etree | |
| 7 | + | from time import sleep | |
| 7 | 8 | ||
| 9 | + | html = False | |
| 8 | 10 | url = "https://www.reddit.com/search.json" | |
| 9 | 11 | query = {"q": "subreddit:talesfromtechsupport author:lawtechie", "sort": "new", "limit": "1000"} | |
| 10 | 12 | ||
| @@ -23,12 +25,17 @@ def get_results(): | |||
| 23 | 25 | data = get(url, params=query, headers={"User-Agent": "/u/lawtechie story directory by /u/suudo"}).json()["data"] | |
| 24 | 26 | results += data["children"] | |
| 25 | 27 | print("added {} results".format(data["dist"])) | |
| 28 | + | sleep(1) | |
| 26 | 29 | with open("lawtechie-search-20200609.json", "w") as f: | |
| 27 | 30 | dump(results,f) | |
| 28 | 31 | return results | |
| 29 | 32 | ||
| 30 | - | def l(i): | |
| 31 | - | return "http://redd.it/" + i | |
def l(i, alt=None):
    """Format a link for post id *i*.

    Plain short URL by default; when the module-level ``html`` flag is
    truthy, an anchor tag whose text is *alt* (falling back to the id).
    """
    if not html:
        return "http://redd.it/" + i
    # alt may be None/empty — fall back to showing the raw id.
    return "<a href='http://redd.it/{}'>{}</a>".format(i, alt or i)
| 32 | 39 | ||
| 33 | 40 | def parse_url(url): | |
| 34 | 41 | if "reddit.com/r/talesfromtechsupport" in url: | |
| @@ -37,7 +44,9 @@ def parse_url(url): | |||
| 37 | 44 | return url.split("redd.it/",1)[1][:6] | |
| 38 | 45 | ||
| 39 | 46 | ||
| 40 | - | def main(): | |
| 47 | + | def main(html_param=False): | |
| 48 | + | global html | |
| 49 | + | html = html_param | |
| 41 | 50 | results = get_results() | |
| 42 | 51 | stories = {story["data"]["id"]: story["data"] for story in results} | |
| 43 | 52 | story_links = defaultdict(list) | |
| @@ -50,11 +59,16 @@ def main(): | |||
| 50 | 59 | if not dest_id: | |
| 51 | 60 | continue | |
| 52 | 61 | dest_title = stories.get(dest_id, {}).get("title", "UNKNOWN") | |
| 53 | - | story_links["{} {}".format(l(dest_id), dest_title)].append("{} {}".format(l(story["id"]), story["title"])) | |
| 62 | + | story_links[l(dest_id, alt=dest_title)].append(l(story["id"], alt=story["title"])) | |
| 54 | 63 | for s,links in story_links.items(): | |
| 55 | - | print("{}\n {}".format(s, "\n ".join(links))) | |
| 64 | + | if html: | |
| 65 | + | print("<dt>\n {}\n</dt><dd><ul>\n{}\n</ul></dd>".format(s, "\n".join(" <li>{}</li>".format(link) for link in links)), end="") | |
| 66 | + | else: | |
| 67 | + | print("{}\n {}".format(s, "\n ".join(links))) | |
| 68 | + | if html: | |
| 69 | + | print() | |
| 56 | 70 | return 0 | |
| 57 | 71 | ||
| 58 | 72 | if __name__ == "__main__": | |
| 59 | - | from sys import exit | |
| 60 | - | exit(main()) | |
| 73 | + | from sys import exit, argv | |
| 74 | + | exit(main(html_param="--html" in argv)) | |
Alyssa Smith revised this gist. Go to revision
1 file changed, 60 insertions
lawtechie-story-links.py (file created)
| @@ -0,0 +1,60 @@ | |||
| 1 | + | #!/usr/bin/env python3 | |
| 2 | + | from requests import get | |
| 3 | + | from json import load,dump | |
| 4 | + | from collections import defaultdict | |
| 5 | + | from markdown import markdown | |
| 6 | + | from lxml import etree | |
| 7 | + | ||
| 8 | + | url = "https://www.reddit.com/search.json" | |
| 9 | + | query = {"q": "subreddit:talesfromtechsupport author:lawtechie", "sort": "new", "limit": "1000"} | |
| 10 | + | ||
def get_results():
    """Return the cached Reddit search results, fetching all pages on a cache miss.

    Results are memoized in a local JSON file so repeated runs don't
    re-query the API. Follows the listing's "after" cursor until the last
    page, mutating the module-level ``query`` dict as it paginates.

    Returns:
        list: the accumulated "children" entries from the search listing.
    """
    cache_file = "lawtechie-search-20200609.json"
    headers = {"User-Agent": "/u/lawtechie story directory by /u/suudo"}

    def fetch_page():
        # One page of search results from the module-level url/query.
        return get(url, params=query, headers=headers).json()["data"]

    try:
        with open(cache_file) as f:
            return load(f)
    # Narrowed from a bare `except:` so Ctrl-C and genuine bugs propagate;
    # only a missing (OSError) or corrupt (ValueError via JSONDecodeError)
    # cache triggers a refetch.
    except (OSError, ValueError):
        print("getting results")
        data = fetch_page()
        results = data["children"]
        print("added {} results".format(data["dist"]))
        while data["after"] is not None:
            query["after"] = data["after"]
            print("getting after: {}".format(data["after"]))
            data = fetch_page()
            results += data["children"]
            print("added {} results".format(data["dist"]))
        with open(cache_file, "w") as f:
            dump(results, f)
        return results
| 29 | + | ||
def l(i):
    """Return the canonical short permalink for reddit post id *i*."""
    return "http://redd.it/{}".format(i)
| 32 | + | ||
def parse_url(url):
    """Pull a post id out of a TFTS permalink or redd.it short link.

    Any other URL falls through and returns None implicitly.
    """
    if "reddit.com/r/talesfromtechsupport" in url:
        after_comments = url.split("/comments/", 1)[1]
        post_id, _, _ = after_comments.partition("/")
        return post_id
    if "redd.it/" in url:
        # Short-link ids are assumed to be 6 characters.
        tail = url.split("redd.it/", 1)[1]
        return tail[:6]
| 38 | + | ||
| 39 | + | ||
def main():
    """Print each linked-to story followed by the stories that link to it.

    Renders every post's markdown body, extracts anchor tags, resolves them
    to post ids, and groups linking posts under their destination.
    Returns 0 as a process exit code.
    """
    results = get_results()
    # Index post data by id so link targets can be resolved to titles.
    stories = {entry["data"]["id"]: entry["data"] for entry in results}
    story_links = defaultdict(list)
    for entry in results:
        post = entry["data"]
        rendered = "<body>" + markdown(post["selftext"]) + "</body>"
        tree = etree.fromstring(rendered)
        for anchor in tree.xpath("//a"):
            dest_id = parse_url(anchor.get("href"))
            if not dest_id:
                continue
            # Title is UNKNOWN for links pointing outside the fetched set.
            dest_title = stories.get(dest_id, {}).get("title", "UNKNOWN")
            key = "{} {}".format(l(dest_id), dest_title)
            story_links[key].append("{} {}".format(l(post["id"]), post["title"]))
    for target, sources in story_links.items():
        print("{}\n {}".format(target, "\n ".join(sources)))
    return 0
| 57 | + | ||
| 58 | + | if __name__ == "__main__": | |
| 59 | + | from sys import exit | |
| 60 | + | exit(main()) | |