Last active 1450591738

RSS feed rewriter

rssfeedrewrite.py Raw
1# RSS feed rewriter
2# For adding more content from the target page that isn't included normally
3# For an example output, compare
4# http://www.escapistmagazine.com/rss/videos/list/1.xml
5# and
6# http://irc.lazle.co/xml/zeropunctuation.xml
7# I added Escapist's embed code loaded from their site in the item description.
8# A possibly better example is
9# http://www.escapistmagazine.com/rss/articles/columns/extra-punctuation.xml
10# and
11# http://irc.lazle.co/xml/extrapunctuation.xml
12# I wanted the whole article text in my feed reader :D
13# Here's an example usage: https://gist.github.com/blha303/36a1c9eef45cf75df904
14# (from a much earlier version of the script. fully functional, but there are
15# differences between them)
16
17# Future plans:
18# Need to make this more accessible for others
19# Clean up
20
21# Copyright 2013 Steven Smith (blha303). All Rights Reserved.
22# New BSD license
23# http://www.opensource.org/licenses/BSD-3-Clause
24
25import yql # I like how simple feed importing is with yql
26import sys
27from time import sleep
28from urllib import urlopen
29from bs4 import BeautifulSoup as Soup
30
# Environment file URL handed to YQL.execute() as `env=` by main().
31yql_env = "http://datatables.org/alltables.env"
32
# Shared yql.Public() client that main() runs its feed query through.
33YQL = yql.Public()
34
35
def main(feedurl, title="A feed!", link="http://google.com", desc=":)", debug=False, aprint=False):
    """Fetch an RSS feed through YQL and rebuild it as an RSS 2.0 document.

    Parameters:
        feedurl: URL of the source feed, passed to the YQL query as @feed.
        title, link, desc: values written into the channel header of the
            generated feed.
        debug: when true, print progress messages and the raw YQL rows.
        aprint: when true, print the document piece by piece while it is
            being built (header, each item, footer).

    Returns:
        The generated XML as a string; the string "No response?" when the
        query returned no rows; or the previous output unchanged when
        `filename` is set and the title of its first item equals the title
        of the newest row.
    """
    # Local import so this function does not depend on the file's import block.
    from xml.sax.saxutils import escape

    filename = ""  # Location output of this file is being read to. used for checking for feed updates
    # Copy in the top of the original XML file if you're not sure.
    base = """<?xml version="1.0" encoding="ISO-8859-1"?>
<rss version="2.0" xmlns:dc="http://purl.org/dc/elements/1.1/" xmlns:content="http://purl.org/rss/1.0/modules/content/" >
    <channel>
        <title>{title}</title>
        <link>{link}</link>
        <description><![CDATA[{description}]]></description>
        <language>en-us</language>
        <docs>http://blogs.law.harvard.edu/tech/rss</docs>
""".format(title=title, link=link, description=desc)

    end = """
    </channel>
</rss>"""

    # Load the previously generated feed (if configured) so an unchanged
    # source feed can be answered without rebuilding anything.
    olddata = None
    oldsoup = None
    if filename != "":
        with open(filename, "r") as f:
            olddata = f.read()
        oldsoup = Soup(olddata)

    query = "SELECT title,link,description,pubDate,category from rss where url=@feed"  # add more fields to retrieve if needed
    result = YQL.execute(query, {"feed": feedurl}, env=yql_env)

    if not result.rows:
        return "No response?"

    if oldsoup is not None:
        # Look the first old item up once; tolerate an item without <title>
        # instead of failing with AttributeError on None.
        old_item = oldsoup.find('item')
        old_title = old_item.find('title') if old_item is not None else None
        if old_title is not None and old_title.text == result.rows[0]["title"]:
            # No new articles
            if debug:
                print("No new articles")
            if aprint:
                print(str(olddata))
            return str(olddata)

    if debug:
        print(result.rows)
    if aprint:
        print(base)

    items = []
    for row in result.rows:
        if debug:
            print("Description for " + row["title"])
        description = row['description']  # use beautifulsoup or something to retrieve the info you want
        # Add more tags below if needed in the dest feed
        # The title is plain element text, so escape &, < and > (not just &).
        # NOTE(review): the description sits inside CDATA yet still has "&"
        # rewritten to "&amp;"; kept as-is because readers treat it as HTML.
        items.append("""
        <item>
            <title>{title}</title>
            <link>{url}</link>
            <guid>{url}</guid>
            <description><![CDATA[{description}]]></description>
            <pubDate>{date}</pubDate>
            <category>{category}</category>
        </item>""".format(title=escape(row["title"]),
                          url=row["link"].split("?")[0],  # drop the query string
                          description=description.replace("&", "&amp;"),
                          date=row["pubDate"],
                          category=row["category"]))
        if debug:
            # Report the item just processed, not the feed-level title argument.
            print("Finished " + row["title"])
        if aprint:
            print(items[-1])
        sleep(1)  # to avoid hammering the site in description lookups

    if aprint:
        print(end)
    return base + "".join(items) + end
96
if __name__ == "__main__":
    # Feed rewritten when the script is run directly.
    feed = "http://irc.lazle.co/xml/zeropunctuation.xml"
    # Optional first argument selects the output mode: "debug" or "aprint".
    mode = sys.argv[1] if len(sys.argv) > 1 else ""
    if mode == "debug":
        print(main(feed, debug=True))
    elif mode == "aprint":
        # main() prints the document itself piece by piece in this mode,
        # so its return value is not printed again.
        main(feed, aprint=True)
    else:
        # No argument, or an unrecognised one: just print the rewritten feed
        # rather than silently doing nothing.
        print(main(feed))
105