""" makeRSS.py Ad hoc script to scrape Folklore.org's Newton stories and make an RSS feed while the one on the site is currently not functioning. 'Ili Butterfield - 2008-04-11 """ import datetime import re import sys import urllib2 import feed.atom from BeautifulSoup import BeautifulSoup # Load the page and parse it. page = urllib2.urlopen("http://www.folklore.org/ProjectView.py?name=Newton&sortOrder=Sort+by+Recent&detail=Show+Titles+and+Info") soup = BeautifulSoup(page) page.close() # Find the number of stories on the site. flavorText = soup.find("td", width="*").table.table.tr.nextSibling.nextSibling.font.contents[0] storyCount = int(re.compile("\(\d+").search(flavorText).group()[1:]) # Open the cache file and read in the saved story data. try: cache = open("/home/chz/www/projects/newton/cache.txt", "r+") except IOError: cache = open("/home/chz/www/projects/newton/cache.txt", "w+") cachedStories = [line.strip().split("\t") for line in cache.readlines()] # Check if any new stories have been added since the last scrape. If there # haven't, then we don't need to update. if len(cachedStories) == storyCount: cache.close() sys.exit(0) # Since the stories aren't stamped with the time of post on the site, # generate a timestamp. timestamp = datetime.datetime.utcnow().isoformat() # Find all stories added since the last update. I'm not convinced that the # sort-by-recent listing of stories actually lists the most recently added # stories first in all cases. For example, the story I believe is the most # recently added as I write this comment, "A Kick Under the Table," appears # second in the list. So, here we go through each story on the page and see # if its title matches that of any currently cached story. The underlying # assumption is that, even if new stories don't end up on the top of the # list, they'll at least actually be on the page. newStories = [] cachedTitles = [story[0] for story in cachedStories] storyTable = soup.form.nextSibling.nextSibling for tr in storyTable("tr"): storyLink = tr.td.contents[1] title = storyLink.string.__unicode__() if title in cachedTitles: continue url = storyLink["href"].replace(" ", "%20").replace("&", "&") fields = tr("font") description = fields[0].string.__unicode__() date = fields[1].contents[1].__unicode__() author = fields[2].contents[1].__unicode__() newStories.append([title, url, description, date, author, timestamp]) # Stories are displayed on the page latest-first (allegedly), but stored in the # cache earliest-first, so if we've scraped more than one story, get them in # the proper order. This is probably most useful just for the initial cache # population. newStories.reverse() # Write out all of the new stories to the cache. map(lambda x: cache.write("\t".join(x) + "\n"), newStories) cache.close() # Create the feed. xmldoc, atomFeed = feed.atom.new_xmldoc_feed() atomFeed.title = "Folklore.org: Newton Stories" atomFeed.id = "tag:quiteajolt.com,2008-04-11:/projects/newton/newton.xml" atomFeed.updated = timestamp atomFeed.links.append(feed.atom.Link("http://www.folklore.org/ProjectView.py?project=Newton")) atomFeed.authors.append(feed.atom.Author("Folklore.org")) # Add the ten most recently scraped stories to the feed. 
cachedStories.extend(newStories)
for story in cachedStories[:-11:-1]:
    entry = feed.atom.Entry()
    entry.title = story[0]
    entry.id = "".join(["tag:folklore.org,", story[5].split("T", 1)[0], ":/", story[1]])
    entry.summary = "".join(["(", story[3], ", ", story[4], ") ", story[2]])
    entry.updated = story[5]
    entry.links.append(feed.atom.Link("http://www.folklore.org/" + story[1]))
    atomFeed.entries.append(entry)

# Write out the feed to disk.
f = open("/home/chz/www/projects/newton/newton.xml", "w")
f.write(str(xmldoc))
f.write("\n")
f.close()