Last active June 8, 2020

A script that parses lawtechie's post history in /r/talesfromtechsupport and shows, for each story that is linked to from another story, which stories link to it. Useful for collecting the parts of multi-part stories together. Standalone stories that nothing links to are not shown. More parsing work remains to be done.
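In plain-text mode, the output groups stories under each story they link to: the linked-to story's redd.it URL and title on one line, then each linking story indented below it. A sketch of the shape, with made-up IDs and titles:

http://redd.it/aaaaaa Some story, part 1
 http://redd.it/bbbbbb Some story, part 2
 http://redd.it/cccccc Some story, part 3

Passing --html (added in the later revision below) instead emits the same grouping as an HTML definition-list fragment: a <dt> for the linked-to story and a <ul> of linking stories inside a <dd>.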

Alyssa Smith revised this gist on June 8, 2020 at 18:03 UTC.

1 file changed, 22 insertions, 8 deletions

lawtechie-story-links.py

@@ -1,10 +1,12 @@
 #!/usr/bin/env python3
 from requests import get
-from json import load,dump
+from json import load,dump,dumps
 from collections import defaultdict
 from markdown import markdown
 from lxml import etree
+from time import sleep

+html = False
 url = "https://www.reddit.com/search.json"
 query = {"q": "subreddit:talesfromtechsupport author:lawtechie", "sort": "new", "limit": "1000"}

@@ -23,12 +25,17 @@ def get_results():
             data = get(url, params=query, headers={"User-Agent": "/u/lawtechie story directory by /u/suudo"}).json()["data"]
             results += data["children"]
             print("added {} results".format(data["dist"]))
+            sleep(1)
         with open("lawtechie-search-20200609.json", "w") as f:
             dump(results,f)
         return results

-def l(i):
-    return "http://redd.it/" + i
+def l(i, alt=None):
+    global html
+    if html:
+        return "<a href='http://redd.it/{}'>{}</a>".format(i, alt or i)
+    else:
+        return "http://redd.it/" + i

 def parse_url(url):
     if "reddit.com/r/talesfromtechsupport" in url:
@@ -37,7 +44,9 @@ def parse_url(url):
         return url.split("redd.it/",1)[1][:6]


-def main():
+def main(html_param=False):
+    global html
+    html = html_param
     results = get_results()
     stories = {story["data"]["id"]: story["data"] for story in results}
     story_links = defaultdict(list)
@@ -50,11 +59,16 @@
             if not dest_id:
                 continue
             dest_title = stories.get(dest_id, {}).get("title", "UNKNOWN")
-            story_links["{} {}".format(l(dest_id), dest_title)].append("{} {}".format(l(story["id"]), story["title"]))
+            story_links[l(dest_id, alt=dest_title)].append(l(story["id"], alt=story["title"]))
     for s,links in story_links.items():
-        print("{}\n {}".format(s, "\n ".join(links)))
+        if html:
+            print("<dt>\n {}\n</dt><dd><ul>\n{}\n</ul></dd>".format(s, "\n".join(" <li>{}</li>".format(link) for link in links)), end="")
+        else:
+            print("{}\n {}".format(s, "\n ".join(links)))
+    if html:
+        print()
     return 0

 if __name__ == "__main__":
-    from sys import exit
-    exit(main())
+    from sys import exit, argv
+    exit(main(html_param="--html" in argv))
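The main change in this revision is the optional HTML output: l() now wraps the post ID in an anchor tag when the module-level html flag is set, with the link text defaulting to the story title. A minimal standalone sketch of the new helper's behavior (the global flag is replaced by a parameter here, and the ID is made up for illustration):

def l(i, alt=None, html=False):
    # simplified stand-in for the gist's l(), which reads a module-level flag
    if html:
        return "<a href='http://redd.it/{}'>{}</a>".format(i, alt or i)
    return "http://redd.it/" + i

print(l("abc123"))                           # http://redd.it/abc123
print(l("abc123", alt="Part 1", html=True))  # <a href='http://redd.it/abc123'>Part 1</a>

The revision also adds a one-second sleep() between search pages, presumably to stay within Reddit's rate limits.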

Alyssa Smith revised this gist on June 8, 2020 at 17:49 UTC.

1 file changed, 60 insertions

lawtechie-story-links.py (file created)

@@ -0,0 +1,60 @@
+#!/usr/bin/env python3
+from requests import get
+from json import load,dump
+from collections import defaultdict
+from markdown import markdown
+from lxml import etree
+
+url = "https://www.reddit.com/search.json"
+query = {"q": "subreddit:talesfromtechsupport author:lawtechie", "sort": "new", "limit": "1000"}
+
+def get_results():
+    try:
+        with open("lawtechie-search-20200609.json") as f:
+            return load(f)
+    except:
+        print("getting results")
+        data = get(url, params=query, headers={"User-Agent": "/u/lawtechie story directory by /u/suudo"}).json()["data"]
+        results = data["children"]
+        print("added {} results".format(data["dist"]))
+        while data["after"] is not None:
+            query["after"] = data["after"]
+            print("getting after: {}".format(data["after"]))
+            data = get(url, params=query, headers={"User-Agent": "/u/lawtechie story directory by /u/suudo"}).json()["data"]
+            results += data["children"]
+            print("added {} results".format(data["dist"]))
+        with open("lawtechie-search-20200609.json", "w") as f:
+            dump(results,f)
+        return results
+
+def l(i):
+    return "http://redd.it/" + i
+
+def parse_url(url):
+    if "reddit.com/r/talesfromtechsupport" in url:
+        return url.split("/comments/",1)[1].split("/",1)[0]
+    elif "redd.it/" in url:
+        return url.split("redd.it/",1)[1][:6]
+
+
+def main():
+    results = get_results()
+    stories = {story["data"]["id"]: story["data"] for story in results}
+    story_links = defaultdict(list)
+    for story in results:
+        story = story["data"]
+        mkdn = "<body>" + markdown(story["selftext"]) + "</body>"
+        doc = etree.fromstring(mkdn)
+        for link in doc.xpath("//a"):
+            dest_id = parse_url(link.get("href"))
+            if not dest_id:
+                continue
+            dest_title = stories.get(dest_id, {}).get("title", "UNKNOWN")
+            story_links["{} {}".format(l(dest_id), dest_title)].append("{} {}".format(l(story["id"]), story["title"]))
+    for s,links in story_links.items():
+        print("{}\n {}".format(s, "\n ".join(links)))
+    return 0
+
+if __name__ == "__main__":
+    from sys import exit
+    exit(main())
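For reference, parse_url() pulls the post ID out of either a full permalink or a redd.it short link (truncated to six characters in the short-link case), returning None for anything else. A quick self-contained check, with made-up IDs:

def parse_url(url):
    # copied from the gist: extract the post ID from either URL shape
    if "reddit.com/r/talesfromtechsupport" in url:
        return url.split("/comments/",1)[1].split("/",1)[0]
    elif "redd.it/" in url:
        return url.split("redd.it/",1)[1][:6]

assert parse_url("https://www.reddit.com/r/talesfromtechsupport/comments/abc123/some_story/") == "abc123"
assert parse_url("http://redd.it/abc123") == "abc123"
assert parse_url("https://example.com/") is None

Note also that get_results() caches the paginated search results in lawtechie-search-20200609.json; the bare except: means any failure to read that cache (missing or corrupt file) triggers a fresh crawl, so deleting the file forces a re-fetch.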