""" makeRSS.py Ad hoc script to scrape Folklore.org's Newton stories and make an RSS feed while the one on the site is currently not functioning. 'Ili Butterfield - 2008-04-11 """ import datetime import re import sys import urllib2 import feed.atom from BeautifulSoup import BeautifulSoup # Load the page and parse it. page = urllib2.urlopen("http://www.folklore.org/ProjectView.py?name=Newton&sortOrder=Sort+by+Recent&detail=Show+Titles+and+Info") soup = BeautifulSoup(page) page.close() # Find the number of stories on the site. flavorText = soup.find("td", width="*").table.table.tr.nextSibling.nextSibling.font.contents[0] storyCount = int(re.compile("\(\d+").search(flavorText).group()[1:]) # Open the cache file and read in the saved story data. try: cache = open("/home/chz/www/projects/newton/cache.txt", "r+") except IOError: cache = open("/home/chz/www/projects/newton/cache.txt", "w+") cachedStories = [line.strip().split("\t") for line in cache.readlines()] # Check if any new stories have been added since the last scrape. If there # haven't, then we don't need to update. if len(cachedStories) == storyCount: cache.close() sys.exit(0) # Since the stories aren't stamped with the time of post on the site, # generate a timestamp. timestamp = datetime.datetime.utcnow().isoformat() # Find all stories added since the last update. I'm not convinced that the # sort-by-recent listing of stories actually lists the most recently added # stories first in all cases. For example, the story I believe is the most # recently added as I write this comment, "A Kick Under the Table," appears # second in the list. So, here we go through each story on the page and see # if its title matches that of any currently cached story. The underlying # assumption is that, even if new stories don't end up on the top of the # list, they'll at least actually be on the page. newStories = [] cachedTitles = [story[0] for story in cachedStories] storyTable = soup.form.nextSibling.nextSibling for tr in storyTable("tr"): storyLink = tr.td.contents[1] title = storyLink.string.__unicode__() if title in cachedTitles: continue url = storyLink["href"].replace(" ", "%20").replace("&", "&") fields = tr("font") description = fields[0].string.__unicode__() date = fields[1].contents[1].__unicode__() author = fields[2].contents[1].__unicode__() newStories.append([title, url, description, date, author, timestamp]) # Stories are displayed on the page latest-first (allegedly), but stored in the # cache earliest-first, so if we've scraped more than one story, get them in # the proper order. This is probably most useful just for the initial cache # population. newStories.reverse() # Write out all of the new stories to the cache. map(lambda x: cache.write("\t".join(x) + "\n"), newStories) cache.close() # Create the feed. xmldoc, atomFeed = feed.atom.new_xmldoc_feed() atomFeed.title = "Folklore.org: Newton Stories" atomFeed.id = "tag:quiteajolt.com,2008-04-11:/projects/newton/newton.xml" atomFeed.updated = timestamp atomFeed.links.append(feed.atom.Link("http://www.folklore.org/ProjectView.py?project=Newton")) atomFeed.authors.append(feed.atom.Author("Folklore.org")) # Add the ten most recently scraped stories to the feed. 
cachedStories.extend(newStories)
for story in cachedStories[:-11:-1]:
    entry = feed.atom.Entry()
    entry.title = story[0]
    entry.id = "".join(["tag:folklore.org,", story[5].split("T", 1)[0], ":/", story[1]])
    entry.summary = "".join(["(", story[3], ", ", story[4], ") ", story[2]])
    entry.updated = story[5]
    entry.links.append(feed.atom.Link("http://www.folklore.org/" + story[1]))
    atomFeed.entries.append(entry)

# Write out the feed to disk.
f = open("/home/chz/www/projects/newton/newton.xml", "w")
f.write(str(xmldoc))
f.write("\n")
f.close()