#!/usr/bin/env python3
from requests import get
from json import load, dump
from collections import defaultdict
from markdown import markdown
from lxml import etree
from time import sleep

html = False
url = "https://www.reddit.com/search.json"
query = {"q": "subreddit:talesfromtechsupport author:lawtechie", "sort": "new", "limit": "1000"}
headers = {"User-Agent": "/u/lawtechie story directory by /u/suudo"}


def get_results():
    """Return all search results, from the local cache file if present."""
    try:
        with open("lawtechie-search-20200609.json") as f:
            return load(f)
    except (FileNotFoundError, ValueError):  # no cache, or cache is unreadable
        print("getting results")
        data = get(url, params=query, headers=headers).json()["data"]
        results = data["children"]
        print("added {} results".format(data["dist"]))
        # Page through the listing until Reddit stops returning an "after" token.
        while data["after"] is not None:
            query["after"] = data["after"]
            print("getting after: {}".format(data["after"]))
            data = get(url, params=query, headers=headers).json()["data"]
            results += data["children"]
            print("added {} results".format(data["dist"]))
            sleep(1)  # be polite to Reddit's rate limit
        with open("lawtechie-search-20200609.json", "w") as f:
            dump(results, f)
        return results


def l(i, alt=None):
    """Format story id `i` as an HTML anchor or as a bare short-link."""
    if html:
        return '<a href="http://redd.it/{}">{}</a>'.format(i, alt or i)
    return "http://redd.it/" + i


def parse_url(href):
    """Extract a story id from a TFTS permalink or a redd.it short-link."""
    if not href:
        return None
    if "reddit.com/r/talesfromtechsupport" in href:
        return href.split("/comments/", 1)[1].split("/", 1)[0]
    if "redd.it/" in href:
        return href.split("redd.it/", 1)[1][:6]


def main(html_param=False):
    global html
    html = html_param
    results = get_results()
    stories = {story["data"]["id"]: story["data"] for story in results}
    story_links = defaultdict(list)
    for story in results:
        story = story["data"]
        # Wrap the rendered markdown in a root element so lxml can parse the fragment.
        mkdn = "<div>" + markdown(story["selftext"]) + "</div>"
        doc = etree.fromstring(mkdn)
        for link in doc.xpath("//a"):
            dest_id = parse_url(link.get("href"))
            if not dest_id:
                continue
            dest_title = stories.get(dest_id, {}).get("title", "UNKNOWN")
            story_links[l(dest_id, alt=dest_title)].append(l(story["id"], alt=story["title"]))
    # The original source is truncated past this point; the output loop below
    # reconstructs the apparent intent (each linked-to story, followed by the
    # stories that link to it) rather than the author's exact code.
    for s, links in story_links.items():
        if html:
            print("<h3>{}</h3>".format(s))
            print("<ul>{}</ul>".format("".join("<li>{}</li>".format(link) for link in links)))
        else:
            print(s)
            for link in links:
                print("  linked from: " + link)