Last active 1450591738

RSS feed rewriter

Steven Smith revised this gist 1375732306. Go to revision

1 file changed, 104 insertions

rssfeedrewrite.py (file created)

@@ -0,0 +1,104 @@
1 + # RSS feed rewriter
2 + # For adding more content from the target page that isn't included normally
3 + # For an example output, compare
4 + # http://www.escapistmagazine.com/rss/videos/list/1.xml
5 + # and
6 + # http://irc.lazle.co/xml/zeropunctuation.xml
7 + # I added Escapist's embed code loaded from their site in the item description.
8 + # A possibly better example is
9 + # http://www.escapistmagazine.com/rss/articles/columns/extra-punctuation.xml
10 + # and
11 + # http://irc.lazle.co/xml/extrapunctuation.xml
12 + # I wanted the whole article text in my feed reader :D
13 + # Here's an example usage: https://gist.github.com/blha303/36a1c9eef45cf75df904
14 + # (from a much earlier version of the script; it is fully functional, but the
15 + # two versions differ)
16 +
17 + # Future plans:
18 + # Need to make this more accessible for others
19 + # Clean up
20 +
21 + # Copyright 2013 Steven Smith (blha303). All Rights Reserved.
22 + # New BSD license
23 + # http://www.opensource.org/licenses/BSD-3-Clause
24 +
25 + import yql # I like how simple feed importing is with yql
26 + import sys
27 + from time import sleep
28 + from urllib import urlopen
29 + from bs4 import BeautifulSoup as Soup
30 +
# Environment URL handed to every YQL query through the ``env=`` argument.
yql_env = "http://datatables.org/alltables.env"

# Module-wide YQL client; main() runs its feed query through this object.
YQL = yql.Public()
34 +
35 +
def main(feedurl, title="A feed!", link="http://google.com", desc=":)", debug=False, aprint=False, filename=""):
    """Rebuild the RSS feed at *feedurl* as a new RSS 2.0 document.

    The source feed is queried through the module-level ``YQL`` client and
    every returned row is re-emitted as an ``<item>``.  The ``description``
    local inside the loop is the hook where richer content (for example text
    scraped from the item's page) can be substituted.

    Args:
        feedurl: URL of the source RSS feed.
        title: Channel title. Inserted into the XML verbatim (not escaped).
        link: Channel link. Inserted verbatim.
        desc: Channel description. Wrapped in a CDATA section.
        debug: When true, print progress information to stdout.
        aprint: When true, print each piece of the output as it is produced.
        filename: Path of the previously generated output. When it is given
            and the first ``<item>`` in that file has the same title as the
            newest row of the source feed, the old text is returned as-is and
            nothing is regenerated. An empty string disables the check.

    Returns:
        The generated feed XML as a string; the previous output when it is
        still up to date; or the literal string ``"No response?"`` when the
        query returned no rows.
    """
    # Function-scope import keeps the block self-contained; the module is in
    # the standard library on both Python 2 and Python 3.
    from xml.sax.saxutils import escape

    # Copy in the top of the original XML file if you're not sure.
    base = """<?xml version="1.0" encoding="ISO-8859-1"?>
<rss version="2.0" xmlns:dc="http://purl.org/dc/elements/1.1/" xmlns:content="http://purl.org/rss/1.0/modules/content/" >
<channel>
<title>{title}</title>
<link>{link}</link>
<description><![CDATA[{description}]]></description>
<language>en-us</language>
<docs>http://blogs.law.harvard.edu/tech/rss</docs>
""".format(title=title, link=link, description=desc)

    end = """
</channel>
</rss>"""

    item_template = """
<item>
<title>{title}</title>
<link>{url}</link>
<guid>{url}</guid>
<description><![CDATA[{description}]]></description>
<pubDate>{date}</pubDate>
<category>{category}</category>
</item>"""

    # Load the previous output, if any, so an unchanged feed can be detected.
    olddata = None
    oldsoup = None
    if filename:
        try:
            with open(filename, "r") as f:
                olddata = f.read()
        except IOError:
            # Nothing has been generated yet (e.g. first run): build from scratch.
            if debug:
                print("No previous output at " + filename)
        else:
            oldsoup = Soup(olddata)

    query = "SELECT title,link,description,pubDate,category from rss where url=@feed"  # add more fields to retrieve if needed
    result = YQL.execute(query, {"feed": feedurl}, env=yql_env)

    if not result.rows:
        return "No response?"

    if oldsoup is not None:
        newest_old_item = oldsoup.find('item')
        newest_old_title = newest_old_item.find('title') if newest_old_item is not None else None
        # An <item> without a <title> cannot be compared; fall through and rebuild.
        if newest_old_title is not None and newest_old_title.text == result.rows[0]["title"]:
            # No new articles
            if debug:
                print("No new articles")
            if aprint:
                print(str(olddata))
            return str(olddata)

    if debug:
        print(result.rows)
    if aprint:
        print(base)

    items = []
    for row in result.rows:
        if debug:
            print("Description for " + row["title"])
        description = row['description']  # use beautifulsoup or something to retrieve the info you want

        # "]]>" inside the text would terminate the CDATA section early, so it
        # is split across two adjacent CDATA sections.
        # NOTE(review): the "&" -> "&amp;" replacement is kept from the original
        # even though CDATA content is not entity-decoded by XML parsers --
        # confirm whether feed readers rely on it.
        cdata_body = description.replace("&", "&amp;").replace("]]>", "]]]]><![CDATA[>")

        # Add more tags to item_template if needed in the dest feed.
        # NOTE(review): link and category are inserted unescaped, as before;
        # the shape of row["category"] is not visible here -- verify.
        items.append(item_template.format(
            title=escape(row["title"]),  # escapes &, < and >
            url=row["link"].split("?")[0],  # drop the query string
            description=cdata_body,
            date=row["pubDate"],
            category=row["category"]))

        if debug:
            # Report the item that was just processed, not the channel title.
            print("Finished " + row["title"])
        if aprint:
            print(items[-1])
        sleep(1)  # to avoid hammering the site in description lookups

    if aprint:
        print(end)
    return base + "".join(items) + end
96 +
if __name__ == "__main__":
    # Command-line entry point.
    #   (no argument)  print the rewritten feed
    #   debug          print progress information, then the rewritten feed
    #   aprint         let main() print the feed piece by piece as it is built
    # Any other argument is treated like no argument, so the script never
    # exits silently without producing output.
    feed_url = "http://irc.lazle.co/xml/zeropunctuation.xml"
    mode = sys.argv[1] if len(sys.argv) > 1 else ""

    if mode == "debug":
        print(main(feed_url, debug=True))
    elif mode == "aprint":
        # main() already prints everything in this mode; do not print it twice.
        main(feed_url, aprint=True)
    else:
        print(main(feed_url))
Newer Older