# Author: Jonathan Ellis
"""Serve an unofficial RSS feed scraped from the python.org job board.

The jobs page is fetched, each ``<div class="section">`` that directly
contains an ``<h3>`` is turned into one RSS item, and the rendered feed is
kept in memcache for an hour.  Run as a CGI script, it prints the feed with
a ``text/xml`` content type.
"""

import datetime
import logging
import re
import time
import traceback

from google.appengine.api import memcache
from google.appengine.api.urlfetch import fetch

import PyRSS2Gen
from BeautifulSoup import BeautifulSoup

logging.getLogger().setLevel(logging.DEBUG)

url = 'http://www.python.org/community/jobs/'

# Posting dates look like "12-Jan-2009": day, three-letter month, year.
_DATE_RE = re.compile(r'(\d+)-(\w\w\w)-(\d+)')
# Text that precedes the first tag on a line; used when a listing has no
# usable <em> element holding the date.
_LEADING_TEXT_RE = re.compile(r'(.*?)<')

_CACHE_KEY = 'xml'
_CACHE_SECONDS = 3600

_FEED_DESCRIPTION = (
    " This RSS feed is autogenerated from the handwritten html that makes "
    "up the python.org jobs page. This page is... irregular in places. "
    "Notably, I don't always get all or even most of the description, but "
    "at least you'll know when a new job goes up. \n"
)


def _title_and_link(div):
    """Return ``(title, href)`` for a job section.

    Anchors are tried in document order and the first one that has an
    ``href`` attribute wins.  Defaults are ``'no title'`` and the job board
    URL itself.
    """
    title = 'no title'
    href = url
    for a in div.fetch('a'):
        try:
            # The title is assigned before the href lookup on purpose (this
            # mirrors the original behaviour): an anchor without an href
            # still supplies the title if no later anchor has one.
            title = a.renderContents()
            href = a['href']
        except Exception:
            # Anchor is unusable as a link; try the next one.
            continue
        break
    return title, href


def _raw_date_text(div):
    """Return the raw text expected to contain the posting date, or None."""
    try:
        return div.first('em').renderContents()
    except Exception:
        # Irregular html (e.g. no <em> to read): fall back to whatever text
        # comes before the first tag on some line of the section.
        match = _LEADING_TEXT_RE.search(div.renderContents())
        if match:
            return match.group(1)
        return None


def _parse_when(when_raw):
    """Convert raw date text into a ``datetime.datetime``.

    ``when_raw`` may be None or may not contain a recognisable date; some
    listings simply do not give one.  In every such case the result is the
    Unix epoch expressed in local time.
    """
    try:
        # Fails if when_raw is None, if no date is found, or if the pieces
        # do not form a valid "%d %b %Y" date.
        day, month, year = _DATE_RE.search(when_raw).groups()
        parsed = time.strptime(
            '%s %s %s' % (day, month.capitalize(), year), '%d %b %Y')
        # Round-trip through mktime/localtime; the page gives no timezone,
        # so the date is simply treated as local time.
        when_t = time.localtime(time.mktime(parsed))
    except Exception:
        # NOTE: the fallback is the Unix epoch (time.localtime(0)), not the
        # current time, so undated jobs get a fixed, very old pubDate.
        when_t = time.localtime(0)
    # Drop the last three struct_time fields (wday, yday, isdst).
    return datetime.datetime(*when_t[:-3])


def items():
    """Yield one ``PyRSS2Gen.RSSItem`` per job listing on the jobs page.

    Only ``div.section`` elements with an ``<h3>`` as a direct child are
    treated as listings.  The description is every ``<p>`` of the section
    except the first, joined with newlines.
    """
    response = fetch(url)
    soup = BeautifulSoup(response.content)
    for div in soup.fetch('div', {'class': 'section'}):
        if not div.fetch('h3', recursive=False):
            continue

        title, href = _title_and_link(div)
        logging.debug('title is %s', title)

        when_d = _parse_when(_raw_date_text(div))
        desc = '\n'.join([str(p) for p in div.fetch('p')[1:]])

        yield PyRSS2Gen.RSSItem(
            title=title,
            link=href,
            description=desc,
            guid=PyRSS2Gen.Guid(href),
            pubDate=when_d)


def xml():
    """Return the feed as an XML string, using memcache as an hourly cache."""
    cached = memcache.get(_CACHE_KEY)
    if cached:
        return cached

    rss = PyRSS2Gen.RSS2(
        title='Unofficial Python Job Board Feed',
        link=url,
        description=_FEED_DESCRIPTION,
        lastBuildDate=datetime.datetime.now(),
        # Materialise the generator so the feed object holds a plain list.
        items=list(items()))
    feed = rss.to_xml()
    memcache.add(_CACHE_KEY, feed, _CACHE_SECONDS)
    return feed


def main():
    """Write the CGI response: content-type header, blank line, feed body."""
    # Single parenthesised argument: same output as the Python 2 print
    # statement, and still valid syntax on Python 3.
    print("Content-Type: text/xml\n\n" + xml())


if __name__ == '__main__':
    main()