Last active June 8, 2020

A script that parses lawtechie's post history in /r/talesfromtechsupport and shows, for each story that is linked to from another story, which stories link to it. Useful for collecting the parts of multi-part stories together. Standalone stories that nothing links to are not shown. More parsing work remains to be done.
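In plain-text mode, the output groups stories under each story they link to: the linked-to story's redd.it URL and title on one line, then each linking story indented below it. A sketch of the shape, with made-up IDs and titles:

http://redd.it/aaaaaa Some story, part 1
 http://redd.it/bbbbbb Some story, part 2
 http://redd.it/cccccc Some story, part 3

Passing --html (added in the later revision below) instead emits the same grouping as an HTML definition-list fragment: a <dt> for the linked-to story and a <ul> of linking stories inside a <dd>.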

Alyssa Smith revised this gist on June 8, 2020 at 18:03 UTC.

1 file changed, 22 insertions, 8 deletions

lawtechie-story-links.py

@@ -1,10 +1,12 @@
 #!/usr/bin/env python3
 from requests import get
-from json import load,dump
+from json import load,dump,dumps
 from collections import defaultdict
 from markdown import markdown
 from lxml import etree
+from time import sleep

+html = False
 url = "https://www.reddit.com/search.json"
 query = {"q": "subreddit:talesfromtechsupport author:lawtechie", "sort": "new", "limit": "1000"}

@@ -23,12 +25,17 @@ def get_results():
             data = get(url, params=query, headers={"User-Agent": "/u/lawtechie story directory by /u/suudo"}).json()["data"]
             results += data["children"]
             print("added {} results".format(data["dist"]))
+            sleep(1)
         with open("lawtechie-search-20200609.json", "w") as f:
             dump(results,f)
         return results

-def l(i):
-    return "http://redd.it/" + i
+def l(i, alt=None):
+    global html
+    if html:
+        return "<a href='http://redd.it/{}'>{}</a>".format(i, alt or i)
+    else:
+        return "http://redd.it/" + i

 def parse_url(url):
     if "reddit.com/r/talesfromtechsupport" in url:
@@ -37,7 +44,9 @@ def parse_url(url):
         return url.split("redd.it/",1)[1][:6]


-def main():
+def main(html_param=False):
+    global html
+    html = html_param
     results = get_results()
     stories = {story["data"]["id"]: story["data"] for story in results}
     story_links = defaultdict(list)
@@ -50,11 +59,16 @@
             if not dest_id:
                 continue
             dest_title = stories.get(dest_id, {}).get("title", "UNKNOWN")
-            story_links["{} {}".format(l(dest_id), dest_title)].append("{} {}".format(l(story["id"]), story["title"]))
+            story_links[l(dest_id, alt=dest_title)].append(l(story["id"], alt=story["title"]))
     for s,links in story_links.items():
-        print("{}\n {}".format(s, "\n ".join(links)))
+        if html:
+            print("<dt>\n {}\n</dt><dd><ul>\n{}\n</ul></dd>".format(s, "\n".join(" <li>{}</li>".format(link) for link in links)), end="")
+        else:
+            print("{}\n {}".format(s, "\n ".join(links)))
+    if html:
+        print()
     return 0

 if __name__ == "__main__":
-    from sys import exit
-    exit(main())
+    from sys import exit, argv
+    exit(main(html_param="--html" in argv))
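The main change in this revision is the optional HTML output: l() now wraps the post ID in an anchor tag when the module-level html flag is set, with the link text defaulting to the story title. A minimal standalone sketch of the new helper's behavior (the global flag is replaced by a parameter here, and the ID is made up for illustration):

def l(i, alt=None, html=False):
    # simplified stand-in for the gist's l(), which reads a module-level flag
    if html:
        return "<a href='http://redd.it/{}'>{}</a>".format(i, alt or i)
    return "http://redd.it/" + i

print(l("abc123"))                           # http://redd.it/abc123
print(l("abc123", alt="Part 1", html=True))  # <a href='http://redd.it/abc123'>Part 1</a>

The revision also adds a one-second sleep() between search pages, presumably to stay within Reddit's rate limits.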

Alyssa Smith revised this gist on June 8, 2020 at 17:49 UTC.

1 file changed, 60 insertions

lawtechie-story-links.py (file created)

@@ -0,0 +1,60 @@
+#!/usr/bin/env python3
+from requests import get
+from json import load,dump
+from collections import defaultdict
+from markdown import markdown
+from lxml import etree
+
+url = "https://www.reddit.com/search.json"
+query = {"q": "subreddit:talesfromtechsupport author:lawtechie", "sort": "new", "limit": "1000"}
+
+def get_results():
+    try:
+        with open("lawtechie-search-20200609.json") as f:
+            return load(f)
+    except:
+        print("getting results")
+        data = get(url, params=query, headers={"User-Agent": "/u/lawtechie story directory by /u/suudo"}).json()["data"]
+        results = data["children"]
+        print("added {} results".format(data["dist"]))
+        while data["after"] is not None:
+            query["after"] = data["after"]
+            print("getting after: {}".format(data["after"]))
+            data = get(url, params=query, headers={"User-Agent": "/u/lawtechie story directory by /u/suudo"}).json()["data"]
+            results += data["children"]
+            print("added {} results".format(data["dist"]))
+        with open("lawtechie-search-20200609.json", "w") as f:
+            dump(results,f)
+        return results
+
+def l(i):
+    return "http://redd.it/" + i
+
+def parse_url(url):
+    if "reddit.com/r/talesfromtechsupport" in url:
+        return url.split("/comments/",1)[1].split("/",1)[0]
+    elif "redd.it/" in url:
+        return url.split("redd.it/",1)[1][:6]
+
+
+def main():
+    results = get_results()
+    stories = {story["data"]["id"]: story["data"] for story in results}
+    story_links = defaultdict(list)
+    for story in results:
+        story = story["data"]
+        mkdn = "<body>" + markdown(story["selftext"]) + "</body>"
+        doc = etree.fromstring(mkdn)
+        for link in doc.xpath("//a"):
+            dest_id = parse_url(link.get("href"))
+            if not dest_id:
+                continue
+            dest_title = stories.get(dest_id, {}).get("title", "UNKNOWN")
+            story_links["{} {}".format(l(dest_id), dest_title)].append("{} {}".format(l(story["id"]), story["title"]))
+    for s,links in story_links.items():
+        print("{}\n {}".format(s, "\n ".join(links)))
+    return 0
+
+if __name__ == "__main__":
+    from sys import exit
+    exit(main())
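For reference, parse_url() pulls the post ID out of either a full permalink or a redd.it short link (truncated to six characters in the short-link case), returning None for anything else. A quick self-contained check, with made-up IDs:

def parse_url(url):
    # copied from the gist: extract the post ID from either URL shape
    if "reddit.com/r/talesfromtechsupport" in url:
        return url.split("/comments/",1)[1].split("/",1)[0]
    elif "redd.it/" in url:
        return url.split("redd.it/",1)[1][:6]

assert parse_url("https://www.reddit.com/r/talesfromtechsupport/comments/abc123/some_story/") == "abc123"
assert parse_url("http://redd.it/abc123") == "abc123"
assert parse_url("https://example.com/") is None

Note also that get_results() caches the paginated search results in lawtechie-search-20200609.json; the bare except: means any failure to read that cache (missing or corrupt file) triggers a fresh crawl, so deleting the file forces a re-fetch.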