lawtechie-story-links.py
· 2.2 KiB · Python
Raw
#!/usr/bin/env python3
from requests import get
from json import load,dump
from collections import defaultdict
from markdown import markdown
from lxml import etree
# Reddit search endpoint, JSON flavour.
url = "https://www.reddit.com/search.json"

# Base search parameters: posts by /u/lawtechie in r/talesfromtechsupport,
# newest first. All values are sent as strings.
query = dict(
    q="subreddit:talesfromtechsupport author:lawtechie",
    sort="new",
    limit="1000",
)
def get_results():
    """Return the list of search results, using an on-disk cache when possible.

    The cache file ``lawtechie-search-20200609.json`` is tried first. If it
    cannot be opened or does not contain valid JSON, the search endpoint in
    ``url`` is queried with the parameters in ``query``, following the
    ``after`` cursor of each response until it is ``None``. The accumulated
    ``children`` entries are then written to the cache file and returned.

    Returns:
        The list loaded from the cache, or the concatenated ``children``
        lists of every fetched page.
    """
    cache_path = "lawtechie-search-20200609.json"
    try:
        with open(cache_path) as f:
            return load(f)
    except (OSError, ValueError):
        # Missing/unreadable cache (OSError) or corrupt JSON (JSONDecodeError
        # is a ValueError): fall through and refetch. Anything else, such as
        # KeyboardInterrupt, is deliberately allowed to propagate.
        pass

    print("getting results")
    headers = {"User-Agent": "/u/lawtechie story directory by /u/suudo"}
    # Page on a copy so the cursor never leaks into the module-level query.
    params = dict(query)
    results = []
    while True:
        data = get(url, params=params, headers=headers).json()["data"]
        results += data["children"]
        print("added {} results".format(data["dist"]))
        after = data["after"]
        if after is None:
            break
        params["after"] = after
        print("getting after: {}".format(after))

    with open(cache_path, "w") as f:
        dump(results, f)
    return results
def l(i):
    """Return the redd.it short link for the post ID ``i`` (a string)."""
    short_link_prefix = "http://redd.it/"
    return short_link_prefix + i
def parse_url(url):
    """Extract the Reddit post ID from a story link.

    Two link shapes are recognised: URLs containing
    ``reddit.com/r/talesfromtechsupport`` (the ID follows ``/comments/``)
    and ``redd.it/<id>`` short links.

    Args:
        url: The link target. ``None`` and the empty string are accepted.

    Returns:
        The post ID, or ``None`` when ``url`` is not a recognised story link
        or no ID follows the marker.
    """
    if not url:
        return None
    if "reddit.com/r/talesfromtechsupport" in url:
        marker = "/comments/"
    elif "redd.it/" in url:
        marker = "redd.it/"
    else:
        return None

    # A subreddit link without "/comments/" has no ID; report "no match"
    # instead of raising IndexError.
    _, found, tail = url.partition(marker)
    if not found:
        return None

    # The ID is the leading alphanumeric run. Stopping at the first other
    # character (rather than at a fixed length) copes with IDs of any length
    # and with trailing "/", "?query" or punctuation.
    end = 0
    while end < len(tail) and tail[end].isalnum():
        end += 1
    return tail[:end] or None
def main():
    """Print, for every linked story, the stories that link to it.

    Each result's ``selftext`` is rendered from Markdown to HTML and scanned
    for ``<a>`` elements. Links that ``parse_url`` resolves to a post ID are
    grouped by destination; the destination title is looked up among the
    fetched results and falls back to ``"UNKNOWN"`` when the target is not
    one of them.

    Returns:
        ``0``, intended for use as the process exit status.
    """
    results = get_results()
    stories = {entry["data"]["id"]: entry["data"] for entry in results}
    story_links = defaultdict(list)
    # markdown() produces HTML, which is not guaranteed to be well-formed
    # XML (raw HTML in a post, unknown entities). Use lxml's lenient HTML
    # parser so a single post cannot abort the whole run.
    html_parser = etree.HTMLParser()
    for entry in results:
        story = entry["data"]
        mkdn = "<body>" + markdown(story["selftext"]) + "</body>"
        doc = etree.fromstring(mkdn, parser=html_parser)
        source_label = "{} {}".format(l(story["id"]), story["title"])
        for link in doc.xpath("//a"):
            href = link.get("href")
            if not href:
                # Anchor without a target: nothing to resolve.
                continue
            dest_id = parse_url(href)
            if not dest_id:
                continue
            dest_title = stories.get(dest_id, {}).get("title", "UNKNOWN")
            story_links["{} {}".format(l(dest_id), dest_title)].append(source_label)
    for s, links in story_links.items():
        print("{}\n {}".format(s, "\n ".join(links)))
    return 0
if __name__ == "__main__":
    # Use main()'s return value as the process exit status.
    raise SystemExit(main())
| 1 | #!/usr/bin/env python3 |
| 2 | from requests import get |
| 3 | from json import load,dump |
| 4 | from collections import defaultdict |
| 5 | from markdown import markdown |
| 6 | from lxml import etree |
| 7 | |
| 8 | url = "https://www.reddit.com/search.json" |
| 9 | query = {"q": "subreddit:talesfromtechsupport author:lawtechie", "sort": "new", "limit": "1000"} |
| 10 | |
| 11 | def get_results(): |
| 12 | try: |
| 13 | with open("lawtechie-search-20200609.json") as f: |
| 14 | return load(f) |
| 15 | except: |
| 16 | print("getting results") |
| 17 | data = get(url, params=query, headers={"User-Agent": "/u/lawtechie story directory by /u/suudo"}).json()["data"] |
| 18 | results = data["children"] |
| 19 | print("added {} results".format(data["dist"])) |
| 20 | while data["after"] is not None: |
| 21 | query["after"] = data["after"] |
| 22 | print("getting after: {}".format(data["after"])) |
| 23 | data = get(url, params=query, headers={"User-Agent": "/u/lawtechie story directory by /u/suudo"}).json()["data"] |
| 24 | results += data["children"] |
| 25 | print("added {} results".format(data["dist"])) |
| 26 | with open("lawtechie-search-20200609.json", "w") as f: |
| 27 | dump(results,f) |
| 28 | return results |
| 29 | |
| 30 | def l(i): |
| 31 | return "http://redd.it/" + i |
| 32 | |
| 33 | def parse_url(url): |
| 34 | if "reddit.com/r/talesfromtechsupport" in url: |
| 35 | return url.split("/comments/",1)[1].split("/",1)[0] |
| 36 | elif "redd.it/" in url: |
| 37 | return url.split("redd.it/",1)[1][:6] |
| 38 | |
| 39 | |
| 40 | def main(): |
| 41 | results = get_results() |
| 42 | stories = {story["data"]["id"]: story["data"] for story in results} |
| 43 | story_links = defaultdict(list) |
| 44 | for story in results: |
| 45 | story = story["data"] |
| 46 | mkdn = "<body>" + markdown(story["selftext"]) + "</body>" |
| 47 | doc = etree.fromstring(mkdn) |
| 48 | for link in doc.xpath("//a"): |
| 49 | dest_id = parse_url(link.get("href")) |
| 50 | if not dest_id: |
| 51 | continue |
| 52 | dest_title = stories.get(dest_id, {}).get("title", "UNKNOWN") |
| 53 | story_links["{} {}".format(l(dest_id), dest_title)].append("{} {}".format(l(story["id"]), story["title"])) |
| 54 | for s,links in story_links.items(): |
| 55 | print("{}\n {}".format(s, "\n ".join(links))) |
| 56 | return 0 |
| 57 | |
| 58 | if __name__ == "__main__": |
| 59 | from sys import exit |
| 60 | exit(main()) |