Revision of rssfeedrewrite.py

1

+

# RSS feed rewriter

2

+

# For adding more content from the target page that isn't included normally

3

+

# For an example output, compare

4

+

# http://www.escapistmagazine.com/rss/videos/list/1.xml

5

+

# and

6

+

# http://irc.lazle.co/xml/zeropunctuation.xml

7

+

# I added Escapist's embed code loaded from their site in the item description.

8

+

# A possibly better example is

9

+

# http://www.escapistmagazine.com/rss/articles/columns/extra-punctuation.xml

10

+

# and

11

+

# http://irc.lazle.co/xml/extrapunctuation.xml

12

+

# I wanted the whole article text in my feed reader :D

13

+

# Here's an example usage: https://gist.github.com/blha303/36a1c9eef45cf75df904

14

+

# (from a much earlier version of the script. fully functional, but there are

15

+

# differences between them)

16

+

17

+

# Future plans:

18

+

# Need to make this more accessible for others

19

+

# Clean up

20

+

21

+

22

+

# New BSD license

23

+

# http://www.opensource.org/licenses/BSD-3-Clause

24

+

25

+

import yql # I like how simple feed importing is with yql

26

+

import sys

27

+

from time import sleep

28

+

from urllib import urlopen

29

+

from bs4 import BeautifulSoup as Soup

30

+

31

+

yql_env = "http://datatables.org/alltables.env"

32

+

33

+

YQL = yql.Public()

34

+

35

+

36

+

def main(feedurl, title="A feed!", link="http://google.com", desc=":)", debug=False, aprint=False, filename=""):
    """Fetch ``feedurl`` through YQL and return it rebuilt as an RSS 2.0 string.

    Args:
        feedurl: URL of the source feed; bound to ``@feed`` in the YQL query.
        title: Text placed in the channel ``<title>`` (inserted verbatim).
        link: Text placed in the channel ``<link>`` (inserted verbatim).
        desc: Text placed in the channel ``<description>`` CDATA section.
        debug: When true, print progress messages and the fetched rows.
        aprint: When true, print the header, every item and the footer as
            they are produced.
        filename: Path of a previously generated feed. When non-empty and
            readable, the title of its first ``<item>`` is compared with the
            title of the first fetched row; if they are equal the old file
            contents are returned unchanged. The empty string (default)
            disables the check.

    Returns:
        The generated feed XML, or the previous file's contents when the
        first item is unchanged, or the literal string ``"No response?"``
        when the query returns no rows.
    """
    # Function-scope import keeps the block self-contained; escape() encodes
    # "&", "<" and ">" (the original replace("&", "&") was a no-op).
    from xml.sax.saxutils import escape

    # Copy in the top of the original XML file if you're not sure.
    base = """<?xml version="1.0" encoding="ISO-8859-1"?>
<rss version="2.0" xmlns:dc="http://purl.org/dc/elements/1.1/" xmlns:content="http://purl.org/rss/1.0/modules/content/" >
<channel>
<title>{title}</title>
<link>{link}</link>
<description><![CDATA[{description}]]></description>
<language>en-us</language>
<docs>http://blogs.law.harvard.edu/tech/rss</docs>
""".format(title=title, link=link, description=desc)

    end = """
</channel>
</rss>"""

    # Load the previously generated feed (if any) for the update check.
    olddata = None
    oldsoup = None
    if filename:
        try:
            with open(filename, "r") as f:
                olddata = f.read()
        except IOError:
            # No readable previous output (e.g. first run): build from scratch.
            olddata = None
        if olddata is not None:
            oldsoup = Soup(olddata)

    query = "SELECT title,link,description,pubDate,category from rss where url=@feed"  # add more fields to retrieve if needed
    result = YQL.execute(query, {"feed": feedurl}, env=yql_env)

    if not result.rows:
        return "No response?"

    if oldsoup is not None:
        olditem = oldsoup.find('item')
        # An old item without a <title> must not crash the comparison.
        oldtitle = olditem.find('title') if olditem is not None else None
        if oldtitle is not None and oldtitle.text == result.rows[0]["title"]:
            # No new articles
            if debug:
                print("No new articles")
            if aprint:
                print(str(olddata))
            return str(olddata)

    if debug:
        print(result.rows)
    if aprint:
        print(base)

    items = []
    for row in result.rows:
        if debug:
            print("Description for " + row["title"])
        description = row['description']  # use beautifulsoup or something to retrieve the info you want
        # Add more tags below if needed in the dest feed
        items.append("""
<item>
<title>{title}</title>
<link>{url}</link>
<guid>{url}</guid>
<description><![CDATA[{description}]]></description>
<pubDate>{date}</pubDate>
<category>{category}</category>
</item>""".format(title=escape(row["title"]),
                  # Drop the query string so <link> and <guid> stay stable.
                  url=row["link"].split("?")[0],
                  # "]]>" inside the text would end the CDATA section early;
                  # split it across two adjacent CDATA sections instead.
                  description=description.replace("]]>", "]]]]><![CDATA[>"),
                  date=row["pubDate"],
                  category=row["category"]))
        if debug:
            # Report the item just finished, not the channel title.
            print("Finished " + row["title"])
        if aprint:
            print(items[-1])
        sleep(1)  # to avoid hammering the site in description lookups

    if aprint:
        print(end)
    return base + "".join(items) + end

96

+

97

+

if __name__ == "__main__":
    # Command-line entry point: an optional first argument selects the mode.
    feed = "http://irc.lazle.co/xml/zeropunctuation.xml"
    mode = sys.argv[1] if len(sys.argv) > 1 else ""
    if mode == "debug":
        print(main(feed, debug=True))
    elif mode == "aprint":
        # aprint mode prints every piece as it is built, so the return
        # value is not printed a second time.
        main(feed, aprint=True)
    else:
        # No argument (or an unrecognised one): just print the feed.
        print(main(feed))

aly / rssfeedrewrite.py

Steven Smith revised this gist 1375732306. Go to revision

		@@ -0,0 +1,104 @@
1	+	# RSS feed rewriter
2	+	# For adding more content from the target page that isn't included normally
3	+	# For an example output, compare
4	+	# http://www.escapistmagazine.com/rss/videos/list/1.xml
5	+	# and
6	+	# http://irc.lazle.co/xml/zeropunctuation.xml
7	+	# I added Escapist's embed code loaded from their site in the item description.
8	+	# A possibly better example is
9	+	# http://www.escapistmagazine.com/rss/articles/columns/extra-punctuation.xml
10	+	# and
11	+	# http://irc.lazle.co/xml/extrapunctuation.xml
12	+	# I wanted the whole article text in my feed reader :D
13	+	# Here's an example usage: https://gist.github.com/blha303/36a1c9eef45cf75df904
14	+	# (from a much earlier version of the script. fully functional, but there are
15	+	# differences between them)
16	+
17	+	# Future plans:
18	+	# Need to make this more accessible for others
19	+	# Clean up
20	+
21	+	# Copyright 2013 Steven Smith (blha303). All Rights Reserved.
22	+	# New BSD license
23	+	# http://www.opensource.org/licenses/BSD-3-Clause
24	+
25	+	import yql # I like how simple feed importing is with yql
26	+	import sys
27	+	from time import sleep
28	+	from urllib import urlopen
29	+	from bs4 import BeautifulSoup as Soup
30	+
31	+	yql_env = "http://datatables.org/alltables.env"
32	+
33	+	YQL = yql.Public()
34	+
35	+
def main(feedurl, title="A feed!", link="http://google.com", desc=":)", debug=False, aprint=False, filename=""):
    """Fetch ``feedurl`` through YQL and return it rebuilt as an RSS 2.0 string.

    Args:
        feedurl: URL of the source feed; bound to ``@feed`` in the YQL query.
        title: Text placed in the channel ``<title>`` (inserted verbatim).
        link: Text placed in the channel ``<link>`` (inserted verbatim).
        desc: Text placed in the channel ``<description>`` CDATA section.
        debug: When true, print progress messages and the fetched rows.
        aprint: When true, print the header, every item and the footer as
            they are produced.
        filename: Path of a previously generated feed. When non-empty and
            readable, the title of its first ``<item>`` is compared with the
            title of the first fetched row; if they are equal the old file
            contents are returned unchanged. The empty string (default)
            disables the check.

    Returns:
        The generated feed XML, or the previous file's contents when the
        first item is unchanged, or the literal string ``"No response?"``
        when the query returns no rows.
    """
    # Function-scope import keeps the block self-contained; escape() encodes
    # "&", "<" and ">" (the original replace("&", "&") was a no-op).
    from xml.sax.saxutils import escape

    # Copy in the top of the original XML file if you're not sure.
    base = """<?xml version="1.0" encoding="ISO-8859-1"?>
<rss version="2.0" xmlns:dc="http://purl.org/dc/elements/1.1/" xmlns:content="http://purl.org/rss/1.0/modules/content/" >
<channel>
<title>{title}</title>
<link>{link}</link>
<description><![CDATA[{description}]]></description>
<language>en-us</language>
<docs>http://blogs.law.harvard.edu/tech/rss</docs>
""".format(title=title, link=link, description=desc)

    end = """
</channel>
</rss>"""

    # Load the previously generated feed (if any) for the update check.
    olddata = None
    oldsoup = None
    if filename:
        try:
            with open(filename, "r") as f:
                olddata = f.read()
        except IOError:
            # No readable previous output (e.g. first run): build from scratch.
            olddata = None
        if olddata is not None:
            oldsoup = Soup(olddata)

    query = "SELECT title,link,description,pubDate,category from rss where url=@feed"  # add more fields to retrieve if needed
    result = YQL.execute(query, {"feed": feedurl}, env=yql_env)

    if not result.rows:
        return "No response?"

    if oldsoup is not None:
        olditem = oldsoup.find('item')
        # An old item without a <title> must not crash the comparison.
        oldtitle = olditem.find('title') if olditem is not None else None
        if oldtitle is not None and oldtitle.text == result.rows[0]["title"]:
            # No new articles
            if debug:
                print("No new articles")
            if aprint:
                print(str(olddata))
            return str(olddata)

    if debug:
        print(result.rows)
    if aprint:
        print(base)

    items = []
    for row in result.rows:
        if debug:
            print("Description for " + row["title"])
        description = row['description']  # use beautifulsoup or something to retrieve the info you want
        # Add more tags below if needed in the dest feed
        items.append("""
<item>
<title>{title}</title>
<link>{url}</link>
<guid>{url}</guid>
<description><![CDATA[{description}]]></description>
<pubDate>{date}</pubDate>
<category>{category}</category>
</item>""".format(title=escape(row["title"]),
                  # Drop the query string so <link> and <guid> stay stable.
                  url=row["link"].split("?")[0],
                  # "]]>" inside the text would end the CDATA section early;
                  # split it across two adjacent CDATA sections instead.
                  description=description.replace("]]>", "]]]]><![CDATA[>"),
                  date=row["pubDate"],
                  category=row["category"]))
        if debug:
            # Report the item just finished, not the channel title.
            print("Finished " + row["title"])
        if aprint:
            print(items[-1])
        sleep(1)  # to avoid hammering the site in description lookups

    if aprint:
        print(end)
    return base + "".join(items) + end
96	+
if __name__ == "__main__":
    # Command-line entry point: an optional first argument selects the mode.
    feed = "http://irc.lazle.co/xml/zeropunctuation.xml"
    mode = sys.argv[1] if len(sys.argv) > 1 else ""
    if mode == "debug":
        print(main(feed, debug=True))
    elif mode == "aprint":
        # aprint mode prints every piece as it is built, so the return
        # value is not printed a second time.
        main(feed, aprint=True)
    else:
        # No argument (or an unrecognised one): just print the feed.
        print(main(feed))