rssfeedrewrite.py

rssfeedrewrite.py · 4.0 KiB · Python Raw

# RSS feed rewriter # For adding more content from the target page that isn't included normally # For an example output, compare # http://www.escapistmagazine.com/rss/videos/list/1.xml # and # http://irc.lazle.co/xml/zeropunctuation.xml # I added Escapist's embed code loaded from their site in the item description. # A maybe better example is # http://www.escapistmagazine.com/rss/articles/columns/extra-punctuation.xml # and # http://irc.lazle.co/xml/extrapunctuation.xml # I wanted the whole article text in my feed reader :D # Here's an example usage: https://gist.github.com/blha303/36a1c9eef45cf75df904 # (from a much earlier version of the script. fully functional, but there are # differences between them) # Future plans: # Need to make this more accessible for others # Clean up # Copyright 2013 Steven Smith (blha303). All Rights Reserved. # New BSD license # http://www.opensource.org/licenses/BSD-3-Clause import yql # I like how simple feed importing is with yql import sys from time import sleep from urllib import urlopen from bs4 import BeautifulSoup as Soup yql_env = "http://datatables.org/alltables.env" YQL = yql.Public() def main(feedurl, title="A feed!", link="http://google.com", desc=":)", debug=False, aprint=False): filename = "" # Location output of this file is being read to. used for checking for feed updates # Copy in the top of the original XML file if you're not sure. 
base = """<?xml version="1.0" encoding="ISO-8859-1"?> <rss version="2.0" xmlns:dc="http://purl.org/dc/elements/1.1/" xmlns:content="http://purl.org/rss/1.0/modules/content/" > <channel> <title>{title}</title> <link>{link}</link> <description><![CDATA[{description}]]></description> <language>en-us</language> <docs>http://blogs.law.harvard.edu/tech/rss</docs> """.format(title=title, link=link, description=desc) end = """ </channel> </rss>""" if filename != "": with open(filename, "r") as f: olddata = f.read() oldsoup = Soup(olddata) else: oldsoup = None query = "SELECT title,link,description,pubDate,category from rss where url=@feed" # add more fields to retrieve if needed result = YQL.execute(query, {"feed": feedurl}, env=yql_env) if not result.rows: return "No response?" if oldsoup: if oldsoup.find('item'): if oldsoup.find('item').find('title').text == result.rows[0]["title"]: # No new articles if debug: print "No new articles" if aprint: print str(olddata) return str(olddata) if debug: print result.rows if aprint: print base items = [] for row in result.rows: if debug: print "Description for " + row["title"] description = row['description'] # use beautifulsoup or something to retrieve the info you want # Add more tags below if needed in the dest feed items.append(""" <item> <title>{title}</title> <link>{url}</link> <guid>{url}</guid> <description><![CDATA[{description}]]></description> <pubDate>{date}</pubDate> <category>{category}</category> </item>""".format(title=row["title"].replace("&", "&"), url=row["link"].split("?")[0], description=description.replace("&", "&"), date=row["pubDate"], category=row["category"])) if debug: print "Finished " + title if aprint: print items[-1] sleep(1) # to avoid hammering the site in description lookups if aprint: print end return base + "".join(items) + end if __name__ == "__main__": if len(sys.argv) > 1: if sys.argv[1] == "debug": print main("http://irc.lazle.co/xml/zeropunctuation.xml", debug=True) elif sys.argv[1] == 
"aprint": main("http://irc.lazle.co/xml/zeropunctuation.xml", aprint=True) else: print main("http://irc.lazle.co/xml/zeropunctuation.xml")

1	# RSS feed rewriter
2	# For adding more content from the target page that isn't included normally
3	# For an example output, compare
4	# http://www.escapistmagazine.com/rss/videos/list/1.xml
5	# and
6	# http://irc.lazle.co/xml/zeropunctuation.xml
7	# I added Escapist's embed code loaded from their site in the item description.
8	# A maybe better example is
9	# http://www.escapistmagazine.com/rss/articles/columns/extra-punctuation.xml
10	# and
11	# http://irc.lazle.co/xml/extrapunctuation.xml
12	# I wanted the whole article text in my feed reader :D
13	# Here's an example usage: https://gist.github.com/blha303/36a1c9eef45cf75df904
14	# (from a much earlier version of the script. fully functional, but there are
15	# differences between them)
16
17	# Future plans:
18	# Need to make this more accessible for others
19	# Clean up
20
21	# Copyright 2013 Steven Smith (blha303). All Rights Reserved.
22	# New BSD license
23	# http://www.opensource.org/licenses/BSD-3-Clause
24
25	import yql # I like how simple feed importing is with yql
26	import sys
27	from time import sleep
28	from urllib import urlopen
29	from bs4 import BeautifulSoup as Soup
30
31	yql_env = "http://datatables.org/alltables.env"
32
33	YQL = yql.Public()
34
35
def main(feedurl, title="A feed!", link="http://google.com", desc=":)", debug=False, aprint=False,
         filename=""):
    """Fetch the RSS feed at ``feedurl`` through YQL and rebuild it as an RSS 2.0 document.

    Parameters:
        feedurl:  URL of the source feed, passed to the YQL ``rss`` table query.
        title:    channel <title>; inserted into the header verbatim.
        link:     channel <link>; inserted into the header verbatim.
        desc:     channel <description>; wrapped in a CDATA section.
        debug:    when true, print progress information while building the feed.
        aprint:   when true, print each piece of the output as soon as it is built.
        filename: location the output of this function is being written to. Used for
                  checking for feed updates; leave empty to always rebuild.

    Returns:
        The complete feed as a string. Returns the previous contents of ``filename``
        unchanged when its first item has the same title as the newest row, and the
        literal string "No response?" when the query yields no rows.
    """
    # Local import: this block is the only user, and the module exists on Python 2 and 3.
    from xml.sax.saxutils import escape

    # Copy in the top of the original XML file if you're not sure.
    base = """<?xml version="1.0" encoding="ISO-8859-1"?>
<rss version="2.0" xmlns:dc="http://purl.org/dc/elements/1.1/" xmlns:content="http://purl.org/rss/1.0/modules/content/" >
<channel>
<title>{title}</title>
<link>{link}</link>
<description><![CDATA[{description}]]></description>
<language>en-us</language>
<docs>http://blogs.law.harvard.edu/tech/rss</docs>
""".format(title=title, link=link, description=desc)

    end = """
</channel>
</rss>"""

    item_template = """
<item>
<title>{title}</title>
<link>{url}</link>
<guid>{url}</guid>
<description><![CDATA[{description}]]></description>
<pubDate>{date}</pubDate>
<category>{category}</category>
</item>"""

    olddata = None
    oldsoup = None
    if filename != "":
        try:
            with open(filename, "r") as f:
                olddata = f.read()
        except IOError:
            # No previous output yet (e.g. first run): fall through and build the feed.
            olddata = None
        else:
            oldsoup = Soup(olddata)

    query = "SELECT title,link,description,pubDate,category from rss where url=@feed"  # add more fields to retrieve if needed
    result = YQL.execute(query, {"feed": feedurl}, env=yql_env)

    if not result.rows:
        return "No response?"

    if oldsoup is not None:
        # Compare the newest stored item with the newest fetched row; tolerate an old
        # file that has no <item> or an <item> without a <title>.
        old_item = oldsoup.find('item')
        old_title = old_item.find('title') if old_item is not None else None
        if old_title is not None and old_title.text == result.rows[0]["title"]:
            # No new articles
            if debug:
                print("No new articles")
            if aprint:
                print(str(olddata))
            return str(olddata)

    if debug:
        print(result.rows)
    if aprint:
        print(base)

    items = []
    for row in result.rows:
        if debug:
            print("Description for " + row["title"])
        description = row['description']  # use beautifulsoup or something to retrieve the info you want
        # Add more tags below if needed in the dest feed
        items.append(item_template.format(
            # Title and URL are plain element text, so &, < and > must be escaped.
            title=escape(row["title"]),
            url=escape(row["link"].split("?")[0]),
            # Inside CDATA nothing is escaped, but a literal "]]>" would end the section
            # early; split it so the terminator never appears in the payload.
            description=description.replace("]]>", "]]]]><![CDATA[>"),
            date=row["pubDate"],
            category=row["category"]))
        if debug:
            # Report the item just finished, not the channel title.
            print("Finished " + row["title"])
        if aprint:
            print(items[-1])
        sleep(1)  # to avoid hammering the site in description lookups

    if aprint:
        print(end)
    return base + "".join(items) + end
96
if __name__ == "__main__":
    # Feed that is rewritten when the script is run directly.
    default_feed = "http://irc.lazle.co/xml/zeropunctuation.xml"

    # Optional first argument selects the output mode; anything else (or nothing)
    # falls through to printing the finished feed, so no invocation is silent.
    mode = sys.argv[1] if len(sys.argv) > 1 else ""
    if mode == "debug":
        print(main(default_feed, debug=True))
    elif mode == "aprint":
        # aprint already prints every piece as it is built; printing the return
        # value as well would emit the feed twice.
        main(default_feed, aprint=True)
    else:
        print(main(default_feed))
105