rssfeedrewrite.py
· 4.0 KiB · Python
Raw
# RSS feed rewriter
# For adding more content from the target page that isn't included normally
# For an example output, compare
# http://www.escapistmagazine.com/rss/videos/list/1.xml
# and
# http://irc.lazle.co/xml/zeropunctuation.xml
# I added Escapist's embed code loaded from their site in the item description.
# A possibly better example is
# http://www.escapistmagazine.com/rss/articles/columns/extra-punctuation.xml
# and
# http://irc.lazle.co/xml/extrapunctuation.xml
# I wanted the whole article text in my feed reader :D
# Here's an example usage: https://gist.github.com/blha303/36a1c9eef45cf75df904
# (That gist is from a much earlier version of this script. It is fully
# functional, but it differs from this version.)
# Future plans:
# Need to make this more accessible for others
# Clean up
# Copyright 2013 Steven Smith (blha303). All Rights Reserved.
# New BSD license
# http://www.opensource.org/licenses/BSD-3-Clause
import yql # I like how simple feed importing is with yql
import sys
from time import sleep
from urllib import urlopen
from bs4 import BeautifulSoup as Soup
yql_env = "http://datatables.org/alltables.env"
YQL = yql.Public()
def main(feedurl, title="A feed!", link="http://google.com", desc=":)", debug=False, aprint=False, filename=""):
    """Rebuild the RSS feed at *feedurl* and return it as an RSS 2.0 string.

    The source feed is fetched through the module-level ``YQL`` client
    (using ``yql_env``). Every returned row is re-emitted as an ``<item>``
    beneath a new channel header built from *title*, *link* and *desc*.

    Args:
        feedurl: URL of the source RSS feed.
        title: Channel ``<title>``. Inserted as-is, so it must already be
            safe to embed in XML.
        link: Channel ``<link>``. Inserted as-is.
        desc: Channel description; emitted inside a CDATA section.
        debug: When true, print progress information to stdout.
        aprint: When true, print each part of the output as it is produced.
        filename: Path of a previously generated copy of this feed, used to
            detect "nothing new". The default ``""`` disables the check.

    Returns:
        The feed XML as a string. If *filename* is given and the title of
        its first ``<item>`` equals the title of the first fetched row, the
        contents of *filename* are returned unchanged. If YQL returns no
        rows, the literal string ``"No response?"`` is returned.

    Raises:
        IOError: If *filename* is non-empty and cannot be read.
    """
    # Function-scope import keeps this block self-contained (stdlib only).
    from xml.sax.saxutils import escape

    # NOTE: print(...) with a single argument behaves exactly like the
    # print statement, so the calls below work unchanged on Python 2.

    # Copy in the top of the original XML file if you're not sure.
    base = """<?xml version="1.0" encoding="ISO-8859-1"?>
<rss version="2.0" xmlns:dc="http://purl.org/dc/elements/1.1/" xmlns:content="http://purl.org/rss/1.0/modules/content/" >
<channel>
<title>{title}</title>
<link>{link}</link>
<description><![CDATA[{description}]]></description>
<language>en-us</language>
<docs>http://blogs.law.harvard.edu/tech/rss</docs>
""".format(title=title, link=link, description=desc)
    end = """
</channel>
</rss>"""

    # Add more tags here if needed in the dest feed.
    item_template = """
<item>
<title>{title}</title>
<link>{url}</link>
<guid>{url}</guid>
<description><![CDATA[{description}]]></description>
<pubDate>{date}</pubDate>
<category>{category}</category>
</item>"""

    # Load the previously generated feed, if the caller pointed us at one.
    olddata = None
    oldsoup = None
    if filename:
        with open(filename, "r") as f:
            olddata = f.read()
        oldsoup = Soup(olddata)

    query = "SELECT title,link,description,pubDate,category from rss where url=@feed"  # add more fields to retrieve if needed
    result = YQL.execute(query, {"feed": feedurl}, env=yql_env)

    if not result.rows:
        return "No response?"

    if oldsoup is not None:
        # Compare the first cached item with the first fetched row; guard
        # each lookup so a cached file without items/titles just falls
        # through to a full rebuild instead of raising AttributeError.
        old_item = oldsoup.find('item')
        old_title = old_item.find('title') if old_item is not None else None
        if old_title is not None and old_title.text == result.rows[0]["title"]:
            # No new articles
            if debug:
                print("No new articles")
            if aprint:
                print(str(olddata))
            return str(olddata)

    if debug:
        print(result.rows)
    if aprint:
        print(base)

    items = []
    for row in result.rows:
        if debug:
            print("Description for " + row["title"])
        description = row['description']  # use beautifulsoup or something to retrieve the info you want
        items.append(item_template.format(
            # Element text must have &, < and > escaped to stay well-formed.
            title=escape(row["title"]),
            # Drop the query string, then escape what is left.
            url=escape(row["link"].split("?")[0]),
            # CDATA needs no entity escaping, but a literal "]]>" would end
            # the section early; split it across two CDATA sections.
            description=description.replace("]]>", "]]]]><![CDATA[>"),
            date=row["pubDate"],
            category=row["category"],
        ))
        if debug:
            # Report the item just finished, not the channel title.
            print("Finished " + row["title"])
        if aprint:
            print(items[-1])
        sleep(1)  # to avoid hammering the site in description lookups

    if aprint:
        print(end)
    return base + "".join(items) + end
if __name__ == "__main__":
    # Feed that is rewritten when this file is run as a script.
    feed_url = "http://irc.lazle.co/xml/zeropunctuation.xml"

    # The optional first argument selects the output mode: "debug" or
    # "aprint". No argument, or an unrecognised one, performs a plain run
    # so the script never exits silently without producing output.
    mode = sys.argv[1] if len(sys.argv) > 1 else ""

    if mode == "aprint":
        # main() prints each part itself in this mode, so the returned
        # string is deliberately not printed a second time.
        main(feed_url, aprint=True)
    else:
        print(main(feed_url, debug=(mode == "debug")))
| 1 | # RSS feed rewriter |
| 2 | # For adding more content from the target page that isn't included normally |
| 3 | # For an example output, compare |
| 4 | # http://www.escapistmagazine.com/rss/videos/list/1.xml |
| 5 | # and |
| 6 | # http://irc.lazle.co/xml/zeropunctuation.xml |
| 7 | # I added Escapist's embed code loaded from their site in the item description. |
| 8 | # A maybe better example is |
| 9 | # http://www.escapistmagazine.com/rss/articles/columns/extra-punctuation.xml |
| 10 | # and |
| 11 | # http://irc.lazle.co/xml/extrapunctuation.xml |
| 12 | # I wanted the whole article text in my feed reader :D |
| 13 | # Here's an example usage: https://gist.github.com/blha303/36a1c9eef45cf75df904 |
| 14 | # (from a much earlier version of the script. fully functional, but there are |
| 15 | # differences between them) |
| 16 | |
| 17 | # Future plans: |
| 18 | # Need to make this more accessible for others |
| 19 | # Clean up |
| 20 | |
| 21 | # Copyright 2013 Steven Smith (blha303). All Rights Reserved. |
| 22 | # New BSD license |
| 23 | # http://www.opensource.org/licenses/BSD-3-Clause |
| 24 | |
| 25 | import yql # I like how simple feed importing is with yql |
| 26 | import sys |
| 27 | from time import sleep |
| 28 | from urllib import urlopen |
| 29 | from bs4 import BeautifulSoup as Soup |
| 30 | |
| 31 | yql_env = "http://datatables.org/alltables.env" |
| 32 | |
| 33 | YQL = yql.Public() |
| 34 | |
| 35 | |
def main(feedurl, title="A feed!", link="http://google.com", desc=":)", debug=False, aprint=False, filename=""):
    """Rebuild the RSS feed at *feedurl* and return it as an RSS 2.0 string.

    The source feed is fetched through the module-level ``YQL`` client
    (using ``yql_env``). Every returned row is re-emitted as an ``<item>``
    beneath a new channel header built from *title*, *link* and *desc*.

    Args:
        feedurl: URL of the source RSS feed.
        title: Channel ``<title>``. Inserted as-is, so it must already be
            safe to embed in XML.
        link: Channel ``<link>``. Inserted as-is.
        desc: Channel description; emitted inside a CDATA section.
        debug: When true, print progress information to stdout.
        aprint: When true, print each part of the output as it is produced.
        filename: Path of a previously generated copy of this feed, used to
            detect "nothing new". The default ``""`` disables the check.

    Returns:
        The feed XML as a string. If *filename* is given and the title of
        its first ``<item>`` equals the title of the first fetched row, the
        contents of *filename* are returned unchanged. If YQL returns no
        rows, the literal string ``"No response?"`` is returned.

    Raises:
        IOError: If *filename* is non-empty and cannot be read.
    """
    # Function-scope import keeps this block self-contained (stdlib only).
    from xml.sax.saxutils import escape

    # NOTE: print(...) with a single argument behaves exactly like the
    # print statement, so the calls below work unchanged on Python 2.

    # Copy in the top of the original XML file if you're not sure.
    base = """<?xml version="1.0" encoding="ISO-8859-1"?>
<rss version="2.0" xmlns:dc="http://purl.org/dc/elements/1.1/" xmlns:content="http://purl.org/rss/1.0/modules/content/" >
<channel>
<title>{title}</title>
<link>{link}</link>
<description><![CDATA[{description}]]></description>
<language>en-us</language>
<docs>http://blogs.law.harvard.edu/tech/rss</docs>
""".format(title=title, link=link, description=desc)
    end = """
</channel>
</rss>"""

    # Add more tags here if needed in the dest feed.
    item_template = """
<item>
<title>{title}</title>
<link>{url}</link>
<guid>{url}</guid>
<description><![CDATA[{description}]]></description>
<pubDate>{date}</pubDate>
<category>{category}</category>
</item>"""

    # Load the previously generated feed, if the caller pointed us at one.
    olddata = None
    oldsoup = None
    if filename:
        with open(filename, "r") as f:
            olddata = f.read()
        oldsoup = Soup(olddata)

    query = "SELECT title,link,description,pubDate,category from rss where url=@feed"  # add more fields to retrieve if needed
    result = YQL.execute(query, {"feed": feedurl}, env=yql_env)

    if not result.rows:
        return "No response?"

    if oldsoup is not None:
        # Compare the first cached item with the first fetched row; guard
        # each lookup so a cached file without items/titles just falls
        # through to a full rebuild instead of raising AttributeError.
        old_item = oldsoup.find('item')
        old_title = old_item.find('title') if old_item is not None else None
        if old_title is not None and old_title.text == result.rows[0]["title"]:
            # No new articles
            if debug:
                print("No new articles")
            if aprint:
                print(str(olddata))
            return str(olddata)

    if debug:
        print(result.rows)
    if aprint:
        print(base)

    items = []
    for row in result.rows:
        if debug:
            print("Description for " + row["title"])
        description = row['description']  # use beautifulsoup or something to retrieve the info you want
        items.append(item_template.format(
            # Element text must have &, < and > escaped to stay well-formed.
            title=escape(row["title"]),
            # Drop the query string, then escape what is left.
            url=escape(row["link"].split("?")[0]),
            # CDATA needs no entity escaping, but a literal "]]>" would end
            # the section early; split it across two CDATA sections.
            description=description.replace("]]>", "]]]]><![CDATA[>"),
            date=row["pubDate"],
            category=row["category"],
        ))
        if debug:
            # Report the item just finished, not the channel title.
            print("Finished " + row["title"])
        if aprint:
            print(items[-1])
        sleep(1)  # to avoid hammering the site in description lookups

    if aprint:
        print(end)
    return base + "".join(items) + end
| 96 | |
if __name__ == "__main__":
    # Feed that is rewritten when this file is run as a script.
    feed_url = "http://irc.lazle.co/xml/zeropunctuation.xml"

    # The optional first argument selects the output mode: "debug" or
    # "aprint". No argument, or an unrecognised one, performs a plain run
    # so the script never exits silently without producing output.
    mode = sys.argv[1] if len(sys.argv) > 1 else ""

    if mode == "aprint":
        # main() prints each part itself in this mode, so the returned
        # string is deliberately not printed a second time.
        main(feed_url, aprint=True)
    else:
        print(main(feed_url, debug=(mode == "debug")))
| 105 |