Steven Smith revised this gist . Go to revision
1 file changed, 104 insertions
rssfeedrewrite.py(file created)
| @@ -0,0 +1,104 @@ | |||
| 1 | + | # RSS feed rewriter | |
| 2 | + | # For adding more content from the target page that isn't included normally | |
| 3 | + | # For an example output, compare | |
| 4 | + | # http://www.escapistmagazine.com/rss/videos/list/1.xml | |
| 5 | + | # and | |
| 6 | + | # http://irc.lazle.co/xml/zeropunctuation.xml | |
| 7 | + | # I added Escapist's embed code loaded from their site in the item description. | |
| 8 | + | # A possibly better example is | |
| 9 | + | # http://www.escapistmagazine.com/rss/articles/columns/extra-punctuation.xml | |
| 10 | + | # and | |
| 11 | + | # http://irc.lazle.co/xml/extrapunctuation.xml | |
| 12 | + | # I wanted the whole article text in my feed reader :D | |
| 13 | + | # Here's an example usage: https://gist.github.com/blha303/36a1c9eef45cf75df904 | |
| 14 | + | # (from a much earlier version of the script; it is fully functional, but | |
| 15 | + | # differs from this version in places) | |
| 16 | + | ||
| 17 | + | # Future plans: | |
| 18 | + | # Need to make this more accessible for others | |
| 19 | + | # Clean up | |
| 20 | + | ||
| 21 | + | # Copyright 2013 Steven Smith (blha303). All Rights Reserved. | |
| 22 | + | # New BSD license | |
| 23 | + | # http://www.opensource.org/licenses/BSD-3-Clause | |
| 24 | + | ||
| 25 | + | import yql # I like how simple feed importing is with yql | |
| 26 | + | import sys | |
| 27 | + | from time import sleep | |
| 28 | + | from urllib import urlopen | |
| 29 | + | from bs4 import BeautifulSoup as Soup | |
| 30 | + | ||
# Environment URL handed to every YQL.execute() call made by main().
yql_env = "http://datatables.org/alltables.env"

# Module-wide YQL client; main() runs its feed query through this object.
YQL = yql.Public()
| 34 | + | ||
| 35 | + | ||
def _format_item(row, description):
    """Render one feed row as an RSS ``<item>`` element string.

    ``row`` is a mapping with the keys requested by the YQL query in main()
    (title, link, pubDate, category); ``description`` is the text to place
    in the item's CDATA description.
    """
    # Stdlib helper; imported locally so the file's import block is untouched.
    from xml.sax.saxutils import escape

    # Drop the query string so the same URL serves as both link and guid.
    url = row["link"].split("?")[0]
    # A literal "]]>" would close the CDATA section early; split it across
    # two adjacent CDATA sections instead.
    safe_description = description.replace("]]>", "]]]]><![CDATA[>")
    # Add more tags below if needed in the dest feed
    return """
<item>
<title>{title}</title>
<link>{url}</link>
<guid>{url}</guid>
<description><![CDATA[{description}]]></description>
<pubDate>{date}</pubDate>
<category>{category}</category>
</item>""".format(title=escape(row["title"]), url=escape(url),
                  description=safe_description, date=row["pubDate"],
                  category=row["category"])


def main(feedurl, title="A feed!", link="http://google.com", desc=":)",
         debug=False, aprint=False, filename=""):
    """Fetch an RSS feed through YQL and return it re-rendered as RSS 2.0.

    Args:
        feedurl: URL of the source feed, passed to the YQL ``rss`` table.
        title: Channel title of the generated feed.
        link: Channel link of the generated feed.
        desc: Channel description (emitted inside a CDATA section).
        debug: When true, print progress information while running.
        aprint: When true, print each piece of the feed as it is produced.
        filename: Path of a previously generated copy of this feed, used to
            check for feed updates. If the first item in that file has the
            same title as the first row YQL returns, the file's contents are
            returned unchanged. The empty string (default) disables the check.

    Returns:
        The feed XML as a string, or the literal string "No response?" when
        the query returns no rows.
    """
    # Copy in the top of the original XML file if you're not sure.
    base = """<?xml version="1.0" encoding="ISO-8859-1"?>
<rss version="2.0" xmlns:dc="http://purl.org/dc/elements/1.1/" xmlns:content="http://purl.org/rss/1.0/modules/content/" >
<channel>
<title>{title}</title>
<link>{link}</link>
<description><![CDATA[{description}]]></description>
<language>en-us</language>
<docs>http://blogs.law.harvard.edu/tech/rss</docs>
""".format(title=title, link=link, description=desc)

    end = """
</channel>
</rss>"""

    olddata = None
    oldsoup = None
    if filename:
        try:
            with open(filename, "r") as f:
                olddata = f.read()
        except IOError:
            # No readable previous output (e.g. first run): rebuild the feed.
            if debug:
                print("No readable previous output at " + filename)
        else:
            oldsoup = Soup(olddata)

    query = "SELECT title,link,description,pubDate,category from rss where url=@feed"  # add more fields to retrieve if needed
    result = YQL.execute(query, {"feed": feedurl}, env=yql_env)

    if not result.rows:
        return "No response?"

    if oldsoup is not None:
        olditem = oldsoup.find('item')
        oldtitle = olditem.find('title') if olditem is not None else None
        if oldtitle is not None and oldtitle.text == result.rows[0]["title"]:
            # No new articles: hand back the previous output untouched.
            if debug:
                print("No new articles")
            if aprint:
                print(olddata)
            return olddata

    if debug:
        print(result.rows)
    if aprint:
        print(base)

    items = []
    for row in result.rows:
        if debug:
            print("Description for " + row["title"])
        description = row['description']  # use beautifulsoup or something to retrieve the info you want
        items.append(_format_item(row, description))
        if debug:
            print("Finished " + row["title"])
        if aprint:
            print(items[-1])
        sleep(1)  # to avoid hammering the site in description lookups

    if aprint:
        print(end)
    return base + "".join(items) + end
| 96 | + | ||
if __name__ == "__main__":
    # Feed that is rewritten when the script is run directly.
    feed = "http://irc.lazle.co/xml/zeropunctuation.xml"
    # The optional first argument selects the output mode; no argument, or an
    # unrecognised one, prints the finished feed.
    mode = sys.argv[1] if len(sys.argv) > 1 else ""
    if mode == "debug":
        print(main(feed, debug=True))
    elif mode == "aprint":
        # main() prints the feed piece by piece itself in this mode.
        main(feed, aprint=True)
    else:
        print(main(feed))
Newer
Older