2009-03-13 Xan Lopez <xlopez@igalia.com>
[WebKit-https.git] / PlanetWebKit / planet / planet / __init__.py
1 #!/usr/bin/env python
2 # -*- coding: UTF-8 -*-
3 """Planet aggregator library.
4
5 This package is a library for developing web sites or software that
6 aggregate RSS, CDF and Atom feeds taken from elsewhere into a single,
7 combined feed.
8 """
9
10 __version__ = "2.0"
11 __authors__ = [ "Scott James Remnant <scott@netsplit.com>",
12                 "Jeff Waugh <jdub@perkypants.org>" ]
13 __license__ = "Python"
14
15
16 # Modules available without separate import
17 import cache
18 import feedparser
19 import sanitize
20 import htmltmpl
21 import sgmllib
22 try:
23     import logging
24 except:
25     import compat_logging as logging
26
27 # Limit the effect of "from planet import *"
28 __all__ = ("cache", "feedparser", "htmltmpl", "logging",
29            "Planet", "Channel", "NewsItem")
30
31
32 import os
33 import md5
34 import time
35 import dbhash
36 import re
37
38 try: 
39     from xml.sax.saxutils import escape
40 except:
41     def escape(data):
42         return data.replace("&","&amp;").replace(">","&gt;").replace("<","&lt;")
43
# Version information (for generator headers)
VERSION = ("Planet/%s +http://www.planetplanet.org" % __version__)

# Default User-Agent header to send when retrieving feeds
# (our own version string followed by feedparser's)
USER_AGENT = VERSION + " " + feedparser.USER_AGENT

# Default cache directory
CACHE_DIRECTORY = "cache"

# Default number of items to display from a new feed
NEW_FEED_ITEMS = 10

# Useful common date/time formats; the zone offset in both is a literal
# zero, so the time tuple being formatted should already be in UTC
TIMEFMT_ISO = "%Y-%m-%dT%H:%M:%S+00:00"
TIMEFMT_822 = "%a, %d %b %Y %H:%M:%S +0000"


# Log instance to use here
log = logging.getLogger("planet")
# Some logger objects only offer warn(); alias it so the rest of this
# module can always call log.warning()
try:
    log.warning
except:
    log.warning = log.warn

# Defaults for the template file config sections
ENCODING        = "utf-8"
ITEMS_PER_PAGE  = 60
# 0 means no limit on the number of days shown
DAYS_PER_PAGE   = 0
OUTPUT_DIR      = "output"
DATE_FORMAT     = "%B %d, %Y %I:%M %p"
NEW_DATE_FORMAT = "%B %d, %Y"
# 0 disables the "no activity" check on channels
ACTIVITY_THRESHOLD = 0
76
77 class stripHtml(sgmllib.SGMLParser):
78     "remove all tags from the data"
79     def __init__(self, data):
80         sgmllib.SGMLParser.__init__(self)
81         self.result=''
82         self.feed(data)
83         self.close()
84     def handle_data(self, data):
85         if data: self.result+=data
86
def template_info(item, date_format):
    """Build the dictionary of values a template sees for one item.

    Keys whose type is item.DATE are rendered with date_format and are
    additionally exposed as <key>_iso and <key>_822; every other key is
    copied through unchanged.  When the item has a 'title' key, a copy
    with the markup stripped is added as 'title_plain'.
    """
    info = {}
    for name in item.keys():
        if item.key_type(name) != item.DATE:
            info[name] = item[name]
            continue

        stamp = item.get_as_date(name)
        info[name] = time.strftime(date_format, stamp)
        info[name + "_iso"] = time.strftime(TIMEFMT_ISO, stamp)
        info[name + "_822"] = time.strftime(TIMEFMT_822, stamp)

    if 'title' in item.keys():
        plain = stripHtml(info['title'])
        info['title_plain'] = plain.result

    return info
102
103
104 class Planet:
105     """A set of channels.
106
107     This class represents a set of channels for which the items will
108     be aggregated together into one combined feed.
109
110     Properties:
111         user_agent      User-Agent header to fetch feeds with.
112         cache_directory Directory to store cached channels in.
113         new_feed_items  Number of items to display from a new feed.
114         filter          A regular expression that articles must match.
115         exclude         A regular expression that articles must not match.
116     """
117     def __init__(self, config):
118         self.config = config
119
120         self._channels = []
121
122         self.user_agent = USER_AGENT
123         self.cache_directory = CACHE_DIRECTORY
124         self.new_feed_items = NEW_FEED_ITEMS
125         self.filter = None
126         self.exclude = None
127
128     def tmpl_config_get(self, template, option, default=None, raw=0, vars=None):
129         """Get a template value from the configuration, with a default."""
130         if self.config.has_option(template, option):
131             return self.config.get(template, option, raw=raw, vars=None)
132         elif self.config.has_option("Planet", option):
133             return self.config.get("Planet", option, raw=raw, vars=None)
134         else:
135             return default
136
137     def gather_channel_info(self, template_file="Planet"):
138         date_format = self.tmpl_config_get(template_file,
139                                       "date_format", DATE_FORMAT, raw=1)
140
141         activity_threshold = int(self.tmpl_config_get(template_file,
142                                             "activity_threshold",
143                                             ACTIVITY_THRESHOLD))
144
145         if activity_threshold:
146             activity_horizon = \
147                 time.gmtime(time.time()-86400*activity_threshold)
148         else:
149             activity_horizon = 0
150
151         channels = {}
152         channels_list = []
153         for channel in self.channels(hidden=1):
154             channels[channel] = template_info(channel, date_format)
155             channels_list.append(channels[channel])
156
157             # identify inactive feeds
158             if activity_horizon:
159                 latest = channel.items(sorted=1)
160                 if len(latest)==0 or latest[0].date < activity_horizon:
161                     channels[channel]["message"] = \
162                         "no activity in %d days" % activity_threshold
163
164             # report channel level errors
165             if not channel.url_status: continue
166             status = int(channel.url_status)
167             if status == 403:
168                channels[channel]["message"] = "403: forbidden"
169             elif status == 404:
170                channels[channel]["message"] = "404: not found"
171             elif status == 408:
172                channels[channel]["message"] = "408: request timeout"
173             elif status == 410:
174                channels[channel]["message"] = "410: gone"
175             elif status == 500:
176                channels[channel]["message"] = "internal server error"
177             elif status >= 400:
178                channels[channel]["message"] = "http status %s" % status
179
180         return channels, channels_list
181
182     def gather_items_info(self, channels, template_file="Planet", channel_list=None):
183         items_list = []
184         prev_date = []
185         prev_channel = None
186
187         date_format = self.tmpl_config_get(template_file,
188                                       "date_format", DATE_FORMAT, raw=1)
189         items_per_page = int(self.tmpl_config_get(template_file,
190                                       "items_per_page", ITEMS_PER_PAGE))
191         days_per_page = int(self.tmpl_config_get(template_file,
192                                       "days_per_page", DAYS_PER_PAGE))
193         new_date_format = self.tmpl_config_get(template_file,
194                                       "new_date_format", NEW_DATE_FORMAT, raw=1)
195
196         for newsitem in self.items(max_items=items_per_page,
197                                    max_days=days_per_page,
198                                    channels=channel_list):
199             item_info = template_info(newsitem, date_format)
200             chan_info = channels[newsitem._channel]
201             for k, v in chan_info.items():
202                 item_info["channel_" + k] = v
203     
204             # Check for the start of a new day
205             if prev_date[:3] != newsitem.date[:3]:
206                 prev_date = newsitem.date
207                 item_info["new_date"] = time.strftime(new_date_format,
208                                                       newsitem.date)
209     
210             # Check for the start of a new channel
211             if item_info.has_key("new_date") \
212                    or prev_channel != newsitem._channel:
213                 prev_channel = newsitem._channel
214                 item_info["new_channel"] = newsitem._channel.url
215     
216             items_list.append(item_info)
217
218         return items_list
219
220     def run(self, planet_name, planet_link, template_files, offline = False):
221         log = logging.getLogger("planet.runner")
222
223         # Create a planet
224         log.info("Loading cached data")
225         if self.config.has_option("Planet", "cache_directory"):
226             self.cache_directory = self.config.get("Planet", "cache_directory")
227         if self.config.has_option("Planet", "new_feed_items"):
228             self.new_feed_items  = int(self.config.get("Planet", "new_feed_items"))
229         self.user_agent = "%s +%s %s" % (planet_name, planet_link,
230                                               self.user_agent)
231         if self.config.has_option("Planet", "filter"):
232             self.filter = self.config.get("Planet", "filter")
233
234         # The other configuration blocks are channels to subscribe to
235         for feed_url in self.config.sections():
236             if feed_url == "Planet" or feed_url in template_files:
237                 continue
238
239             # Create a channel, configure it and subscribe it
240             channel = Channel(self, feed_url)
241             self.subscribe(channel)
242
243             # Update it
244             try:
245                 if not offline and not channel.url_status == '410':
246                     channel.update()
247             except KeyboardInterrupt:
248                 raise
249             except:
250                 log.exception("Update of <%s> failed", feed_url)
251
252     def generate_all_files(self, template_files, planet_name,
253                 planet_link, planet_feed, owner_name, owner_email):
254         
255         log = logging.getLogger("planet.runner")
256         # Go-go-gadget-template
257         for template_file in template_files:
258             manager = htmltmpl.TemplateManager()
259             log.info("Processing template %s", template_file)
260             try:
261                 template = manager.prepare(template_file)
262             except htmltmpl.TemplateError:
263                 template = manager.prepare(os.path.basename(template_file))
264             # Read the configuration
265             output_dir = self.tmpl_config_get(template_file,
266                                          "output_dir", OUTPUT_DIR)
267             date_format = self.tmpl_config_get(template_file,
268                                           "date_format", DATE_FORMAT, raw=1)
269             encoding = self.tmpl_config_get(template_file, "encoding", ENCODING)
270         
271             # We treat each template individually
272             base = os.path.splitext(os.path.basename(template_file))[0]
273             url = os.path.join(planet_link, base)
274             output_file = os.path.join(output_dir, base)
275
276             # Gather information
277             channels, channels_list = self.gather_channel_info(template_file) 
278             items_list = self.gather_items_info(channels, template_file) 
279
280             # Gather item information
281     
282             # Process the template
283             tp = htmltmpl.TemplateProcessor(html_escape=0)
284             tp.set("Items", items_list)
285             tp.set("Channels", channels_list)
286         
287             # Generic information
288             tp.set("generator",   VERSION)
289             tp.set("name",        planet_name)
290             tp.set("link",        planet_link)
291             tp.set("owner_name",  owner_name)
292             tp.set("owner_email", owner_email)
293             tp.set("url",         url)
294         
295             if planet_feed:
296                 tp.set("feed", planet_feed)
297                 tp.set("feedtype", planet_feed.find('rss')>=0 and 'rss' or 'atom')
298             
299             # Update time
300             date = time.gmtime()
301             tp.set("date",        time.strftime(date_format, date))
302             tp.set("date_iso",    time.strftime(TIMEFMT_ISO, date))
303             tp.set("date_822",    time.strftime(TIMEFMT_822, date))
304
305             try:
306                 log.info("Writing %s", output_file)
307                 output_fd = open(output_file, "w")
308                 if encoding.lower() in ("utf-8", "utf8"):
309                     # UTF-8 output is the default because we use that internally
310                     output_fd.write(tp.process(template))
311                 elif encoding.lower() in ("xml", "html", "sgml"):
312                     # Magic for Python 2.3 users
313                     output = tp.process(template).decode("utf-8")
314                     output_fd.write(output.encode("ascii", "xmlcharrefreplace"))
315                 else:
316                     # Must be a "known" encoding
317                     output = tp.process(template).decode("utf-8")
318                     output_fd.write(output.encode(encoding, "replace"))
319                 output_fd.close()
320             except KeyboardInterrupt:
321                 raise
322             except:
323                 log.exception("Write of %s failed", output_file)
324
325     def channels(self, hidden=0, sorted=1):
326         """Return the list of channels."""
327         channels = []
328         for channel in self._channels:
329             if hidden or not channel.has_key("hidden"):
330                 channels.append((channel.name, channel))
331
332         if sorted:
333             channels.sort()
334
335         return [ c[-1] for c in channels ]
336
337     def find_by_basename(self, basename):
338         for channel in self._channels:
339             if basename == channel.cache_basename(): return channel
340
341     def subscribe(self, channel):
342         """Subscribe the planet to the channel."""
343         self._channels.append(channel)
344
345     def unsubscribe(self, channel):
346         """Unsubscribe the planet from the channel."""
347         self._channels.remove(channel)
348
349     def items(self, hidden=0, sorted=1, max_items=0, max_days=0, channels=None):
350         """Return an optionally filtered list of items in the channel.
351
352         The filters are applied in the following order:
353
354         If hidden is true then items in hidden channels and hidden items
355         will be returned.
356
357         If sorted is true then the item list will be sorted with the newest
358         first.
359
360         If max_items is non-zero then this number of items, at most, will
361         be returned.
362
363         If max_days is non-zero then any items older than the newest by
364         this number of days won't be returned.  Requires sorted=1 to work.
365
366
367         The sharp-eyed will note that this looks a little strange code-wise,
368         it turns out that Python gets *really* slow if we try to sort the
369         actual items themselves.  Also we use mktime here, but it's ok
370         because we discard the numbers and just need them to be relatively
371         consistent between each other.
372         """
373         planet_filter_re = None
374         if self.filter:
375             planet_filter_re = re.compile(self.filter, re.I)
376         planet_exclude_re = None
377         if self.exclude:
378             planet_exclude_re = re.compile(self.exclude, re.I)
379             
380         items = []
381         seen_guids = {}
382         if not channels: channels=self.channels(hidden=hidden, sorted=0)
383         for channel in channels:
384             for item in channel._items.values():
385                 if hidden or not item.has_key("hidden"):
386
387                     channel_filter_re = None
388                     if channel.filter:
389                         channel_filter_re = re.compile(channel.filter,
390                                                        re.I)
391                     channel_exclude_re = None
392                     if channel.exclude:
393                         channel_exclude_re = re.compile(channel.exclude,
394                                                         re.I)
395                     if (planet_filter_re or planet_exclude_re \
396                         or channel_filter_re or channel_exclude_re):
397                         title = ""
398                         if item.has_key("title"):
399                             title = item.title
400                         content = item.get_content("content")
401
402                     if planet_filter_re:
403                         if not (planet_filter_re.search(title) \
404                                 or planet_filter_re.search(content)):
405                             continue
406
407                     if planet_exclude_re:
408                         if (planet_exclude_re.search(title) \
409                             or planet_exclude_re.search(content)):
410                             continue
411
412                     if channel_filter_re:
413                         if not (channel_filter_re.search(title) \
414                                 or channel_filter_re.search(content)):
415                             continue
416
417                     if channel_exclude_re:
418                         if (channel_exclude_re.search(title) \
419                             or channel_exclude_re.search(content)):
420                             continue
421
422                     if not seen_guids.has_key(item.id):
423                         seen_guids[item.id] = 1;
424                         items.append((time.mktime(item.date), item.order, item))
425
426         # Sort the list
427         if sorted:
428             items.sort()
429             items.reverse()
430
431         # Apply max_items filter
432         if len(items) and max_items:
433             items = items[:max_items]
434
435         # Apply max_days filter
436         if len(items) and max_days:
437             max_count = 0
438             max_time = items[0][0] - max_days * 84600
439             for item in items:
440                 if item[0] > max_time:
441                     max_count += 1
442                 else:
443                     items = items[:max_count]
444                     break
445
446         return [ i[-1] for i in items ]
447
448 class Channel(cache.CachedInfo):
449     """A list of news items.
450
451     This class represents a list of news items taken from the feed of
452     a website or other source.
453
454     Properties:
455         url             URL of the feed.
456         url_etag        E-Tag of the feed URL.
457         url_modified    Last modified time of the feed URL.
458         url_status      Last HTTP status of the feed URL.
459         hidden          Channel should be hidden (True if exists).
460         name            Name of the feed owner, or feed title.
461         next_order      Next order number to be assigned to NewsItem
462
463         updated         Correct UTC-Normalised update time of the feed.
464         last_updated    Correct UTC-Normalised time the feed was last updated.
465
466         id              An identifier the feed claims is unique (*).
467         title           One-line title (*).
468         link            Link to the original format feed (*).
469         tagline         Short description of the feed (*).
470         info            Longer description of the feed (*).
471
472         modified        Date the feed claims to have been modified (*).
473
474         author          Name of the author (*).
475         publisher       Name of the publisher (*).
476         generator       Name of the feed generator (*).
477         category        Category name (*).
478         copyright       Copyright information for humans to read (*).
479         license         Link to the licence for the content (*).
480         docs            Link to the specification of the feed format (*).
481         language        Primary language (*).
482         errorreportsto  E-Mail address to send error reports to (*).
483
484         image_url       URL of an associated image (*).
485         image_link      Link to go with the associated image (*).
486         image_title     Alternative text of the associated image (*).
487         image_width     Width of the associated image (*).
488         image_height    Height of the associated image (*).
489
490         filter          A regular expression that articles must match.
491         exclude         A regular expression that articles must not match.
492
493     Properties marked (*) will only be present if the original feed
494     contained them.  Note that the optional 'modified' date field is simply
495     a claim made by the item and parsed from the information given, 'updated'
496     (and 'last_updated') are far more reliable sources of information.
497
498     Some feeds may define additional properties to those above.
499     """
500     IGNORE_KEYS = ("links", "contributors", "textinput", "cloud", "categories",
501                    "url", "href", "url_etag", "url_modified", "tags", "itunes_explicit")
502
    def __init__(self, planet, url):
        """Open the cache for a feed and load the channel from it.

        planet is the owning Planet; its cache_directory and config are
        consulted here.  url is the feed URL, which is also the name of
        the configuration section holding this channel's options.
        """
        # Make sure the cache directory exists before opening the database
        if not os.path.isdir(planet.cache_directory):
            os.makedirs(planet.cache_directory)
        cache_filename = cache.filename(planet.cache_directory, url)
        # "c" opens the database read/write, creating it if necessary
        cache_file = dbhash.open(cache_filename, "c", 0666)

        cache.CachedInfo.__init__(self, cache_file, url, root=1)

        self._items = {}
        self._planet = planet
        # Items removed from the feed, waiting to be cleared from the cache
        self._expired = []
        self.url = url
        # retain the original URL for error reporting
        self.configured_url = url
        self.url_etag = None
        self.url_status = None
        self.url_modified = None
        self.name = None
        self.updated = None
        self.last_updated = None
        self.filter = None
        self.exclude = None
        # Kept as a string; update_entries() converts it to int to count up
        self.next_order = "0"
        # NOTE(review): presumably cache_read() replaces the defaults above
        # with previously cached values -- confirm in cache.CachedInfo
        self.cache_read()
        self.cache_read_entries()

        # Options from this feed's configuration section are applied last,
        # with cached=0
        if planet.config.has_section(url):
            for option in planet.config.options(url):
                value = planet.config.get(url, option)
                self.set_as_string(option, value, cached=0)
533
534     def has_item(self, id_):
535         """Check whether the item exists in the channel."""
536         return self._items.has_key(id_)
537
538     def get_item(self, id_):
539         """Return the item from the channel."""
540         return self._items[id_]
541
542     # Special methods
543     __contains__ = has_item
544
545     def items(self, hidden=0, sorted=0):
546         """Return the item list."""
547         items = []
548         for item in self._items.values():
549             if hidden or not item.has_key("hidden"):
550                 items.append((time.mktime(item.date), item.order, item))
551
552         if sorted:
553             items.sort()
554             items.reverse()
555
556         return [ i[-1] for i in items ]
557
558     def __iter__(self):
559         """Iterate the sorted item list."""
560         return iter(self.items(sorted=1))
561
562     def cache_read_entries(self):
563         """Read entry information from the cache."""
564         keys = self._cache.keys()
565         for key in keys:
566             if key.find(" ") != -1: continue
567             if self.has_key(key): continue
568
569             item = NewsItem(self, key)
570             self._items[key] = item
571
572     def cache_basename(self):
573         return cache.filename('',self._id)
574
575     def cache_write(self, sync=1):
576         """Write channel and item information to the cache."""
577         for item in self._items.values():
578             item.cache_write(sync=0)
579         for item in self._expired:
580             item.cache_clear(sync=0)
581         cache.CachedInfo.cache_write(self, sync)
582
583         self._expired = []
584
585     def feed_information(self):
586         """
587         Returns a description string for the feed embedded in this channel.
588
589         This will usually simply be the feed url embedded in <>, but in the
590         case where the current self.url has changed from the original
591         self.configured_url the string will contain both pieces of information.
592         This is so that the URL in question is easier to find in logging
593         output: getting an error about a URL that doesn't appear in your config
594         file is annoying.
595         """
596         if self.url == self.configured_url:
597             return "<%s>" % self.url
598         else:
599             return "<%s> (formerly <%s>)" % (self.url, self.configured_url)
600
    def update(self):
        """Download the feed to refresh the information.

        This does the actual work of pulling down the feed and if it changes
        updates the cached information about the feed and entries within it.

        The outcome is recorded in self.url_status as a string.  Nothing
        further is updated when the status is 304, 408, 410 or anything
        from 400 upwards; a 410 is still written to the cache.  A 301 that
        came with entries switches self.url to the new address.
        """
        info = feedparser.parse(self.url,
                                etag=self.url_etag, modified=self.url_modified,
                                agent=self._planet.user_agent)
        # Work out a status: use the one reported, otherwise infer one
        # (200 if entries arrived, 408 on a Timeout exception, else 500)
        if info.has_key("status"):
           self.url_status = str(info.status)
        elif info.has_key("entries") and len(info.entries)>0:
           self.url_status = str(200)
        elif info.bozo and info.bozo_exception.__class__.__name__=='Timeout':
           self.url_status = str(408)
        else:
           self.url_status = str(500)

        if self.url_status == '301' and \
           (info.has_key("entries") and len(info.entries)>0):
            log.warning("Feed has moved from <%s> to <%s>", self.url, info.url)
            # Best effort: make the existing cache file reachable under
            # the name derived from the new URL; failure is ignored
            try:
                os.link(cache.filename(self._planet.cache_directory, self.url),
                        cache.filename(self._planet.cache_directory, info.url))
            except:
                pass
            self.url = info.url
        elif self.url_status == '304':
            log.info("Feed %s unchanged", self.feed_information())
            return
        elif self.url_status == '410':
            log.info("Feed %s gone", self.feed_information())
            # Written to the cache before returning, unlike the other
            # early exits
            self.cache_write()
            return
        elif self.url_status == '408':
            log.warning("Feed %s timed out", self.feed_information())
            return
        elif int(self.url_status) >= 400:
            log.error("Error %s while updating feed %s",
                      self.url_status, self.feed_information())
            return
        else:
            log.info("Updating feed %s", self.feed_information())

        # Remember the validators so the next parse() call can pass them
        self.url_etag = info.has_key("etag") and info.etag or None
        self.url_modified = info.has_key("modified") and info.modified or None
        if self.url_etag is not None:
            log.debug("E-Tag: %s", self.url_etag)
        if self.url_modified is not None:
            log.debug("Last Modified: %s",
                      time.strftime(TIMEFMT_ISO, self.url_modified))

        self.update_info(info.feed)
        self.update_entries(info.entries)
        self.cache_write()
656
    def update_info(self, feed):
        """Update information from the feed.

        This reads the feed information supplied by feedparser and updates
        the cached information about the feed.  These are the various
        potentially interesting properties that you might care about.

        feed is the feed-level mapping from feedparser.  The branches
        below are tried in order, so a key is handled by the first one
        that applies.  Note that HTML and plain-text string values are
        sanitised or escaped in place, i.e. feed itself is modified.
        """
        for key in feed.keys():
            if key in self.IGNORE_KEYS or key + "_parsed" in self.IGNORE_KEYS:
                # Ignored fields
                pass
            elif feed.has_key(key + "_parsed"):
                # Ignore unparsed date fields
                pass
            elif key.endswith("_detail"):
                # retain name and email sub-fields, stored as <key>_name
                # and <key>_email with the "_detail" suffix replaced
                if feed[key].has_key('name') and feed[key].name:
                    self.set_as_string(key.replace("_detail","_name"), \
                        feed[key].name)
                if feed[key].has_key('email') and feed[key].email:
                    self.set_as_string(key.replace("_detail","_email"), \
                        feed[key].email)
            elif key == "items":
                # Ignore items field
                pass
            elif key.endswith("_parsed"):
                # Date fields, stored under the name without "_parsed"
                if feed[key] is not None:
                    self.set_as_date(key[:-len("_parsed")], feed[key])
            elif key == "image":
                # Image field: save all the information
                if feed[key].has_key("url"):
                    self.set_as_string(key + "_url", feed[key].url)
                if feed[key].has_key("link"):
                    self.set_as_string(key + "_link", feed[key].link)
                if feed[key].has_key("title"):
                    self.set_as_string(key + "_title", feed[key].title)
                if feed[key].has_key("width"):
                    self.set_as_string(key + "_width", str(feed[key].width))
                if feed[key].has_key("height"):
                    self.set_as_string(key + "_height", str(feed[key].height))
            elif isinstance(feed[key], (str, unicode)):
                # String fields
                try:
                    # The matching <key>_detail entry, if any, gives the
                    # content type: HTML is sanitised, plain text escaped
                    detail = key + '_detail'
                    if feed.has_key(detail) and feed[detail].has_key('type'):
                        if feed[detail].type == 'text/html':
                            feed[key] = sanitize.HTML(feed[key])
                        elif feed[detail].type == 'text/plain':
                            feed[key] = escape(feed[key])
                    self.set_as_string(key, feed[key])
                except KeyboardInterrupt:
                    raise
                except:
                    # A field that cannot be stored is logged and skipped
                    log.exception("Ignored '%s' of <%s>, unknown format",
                                  key, self.url)
713
    def update_entries(self, entries):
        """Update entries from the feed.

        This reads the entries supplied by feedparser and updates the
        cached information about them.  It's at this point we update
        the 'updated' timestamp and keep the old one in 'last_updated',
        these provide boundaries for acceptable entry times.

        If this is the first time a feed has been updated then most of the
        items will be marked as hidden, according to Planet.new_feed_items.

        If the feed does not contain items which, according to the sort order,
        should be there; those items are assumed to have been expired from
        the feed or replaced and are removed from the cache.

        Nothing at all is changed when entries is empty.
        """
        if not len(entries):
            return

        self.last_updated = self.updated
        self.updated = time.gmtime()

        # new_items: items created by this call, in feed order
        # feed_items: ids of every entry seen in this feed, in feed order
        new_items = []
        feed_items = []
        for entry in entries:
            # Try really hard to find some kind of unique identifier:
            # id, then link, then a hash of the title or of the summary
            if entry.has_key("id"):
                entry_id = cache.utf8(entry.id)
            elif entry.has_key("link"):
                entry_id = cache.utf8(entry.link)
            elif entry.has_key("title"):
                entry_id = (self.url + "/"
                            + md5.new(cache.utf8(entry.title)).hexdigest())
            elif entry.has_key("summary"):
                entry_id = (self.url + "/"
                            + md5.new(cache.utf8(entry.summary)).hexdigest())
            else:
                log.error("Unable to find or generate id, entry ignored")
                continue

            # Create the item if necessary and update
            if self.has_item(entry_id):
                item = self._items[entry_id]
            else:
                item = NewsItem(self, entry_id)
                self._items[entry_id] = item
                new_items.append(item)
            item.update(entry)
            feed_items.append(entry_id)

            # Hide excess items the first time through (last_updated is
            # still None then); a new_feed_items of 0 hides nothing
            if self.last_updated is None  and self._planet.new_feed_items \
                   and len(feed_items) > self._planet.new_feed_items:
                item.hidden = "yes"
                log.debug("Marked <%s> as hidden (new feed)", entry_id)

        # Assign order numbers in reverse, so that the first new entry in
        # the feed ends up with the highest number
        new_items.reverse()
        for item in new_items:
            item.order = self.next_order = str(int(self.next_order) + 1)

        # Check for expired or replaced items: walk the items newest first
        # until every feed entry has been accounted for; any item met on
        # the way that the feed no longer lists is dropped, unless the
        # channel's status is '226'
        feed_count = len(feed_items)
        log.debug("Items in Feed: %d", feed_count)
        for item in self.items(sorted=1):
            if feed_count < 1:
                break
            elif item.id in feed_items:
                feed_count -= 1
            elif item._channel.url_status != '226':
                del(self._items[item.id])
                self._expired.append(item)
                log.debug("Removed expired or replaced item <%s>", item.id)
786
787     def get_name(self, key):
788         """Return the key containing the name."""
789         for key in ("name", "title"):
790             if self.has_key(key) and self.key_type(key) != self.NULL:
791                 return self.get_as_string(key)
792
793         return ""
794
795 class NewsItem(cache.CachedInfo):
796     """An item of news.
797
798     This class represents a single item of news on a channel.  They're
799     created by members of the Channel class and accessible through it.
800
801     Properties:
802         id              Channel-unique identifier for this item.
803         id_hash         Relatively short, printable cryptographic hash of id
804         date            Corrected UTC-Normalised update time, for sorting.
805         order           Order in which items on the same date can be sorted.
806         hidden          Item should be hidden (True if exists).
807
808         title           One-line title (*).
809         link            Link to the original format text (*).
810         summary         Short first-page summary (*).
811         content         Full HTML content.
812
813         modified        Date the item claims to have been modified (*).
814         issued          Date the item claims to have been issued (*).
815         created         Date the item claims to have been created (*).
816         expired         Date the item claims to expire (*).
817
818         author          Name of the author (*).
819         publisher       Name of the publisher (*).
820         category        Category name (*).
821         comments        Link to a page to enter comments (*).
822         license         Link to the licence for the content (*).
823         source_name     Name of the original source of this item (*).
824         source_link     Link to the original source of this item (*).
825
826     Properties marked (*) will only be present if the original feed
827     contained them.  Note that the various optional date fields are
828     simply claims made by the item and parsed from the information
829     given, 'date' is a far more reliable source of information.
830
831     Some feeds may define additional properties to those above.
832     """
    # Entry keys that update() never copies into the item.  update() also
    # skips a key when its "<key>_parsed" form is listed here.
    IGNORE_KEYS = ("categories", "contributors", "enclosures", "links",
                   "guidislink", "date", "tags")
835
    def __init__(self, channel, id_):
        """Create the item identified by id_ on the given channel.

        The channel's _cache and id_ are handed to the CachedInfo base
        class, the basic properties are initialised and cache_read() is
        then called.
        """
        cache.CachedInfo.__init__(self, channel._cache, id_)

        self._channel = channel
        self.id = id_
        # Hex MD5 digest of the id
        self.id_hash = md5.new(id_).hexdigest()
        # Start out empty; these are assigned before cache_read() runs
        self.date = None
        self.order = None
        self.content = None
        # NOTE(review): presumably restores previously cached values over
        # the defaults above -- confirm in cache.CachedInfo
        self.cache_read()
846
847     def update(self, entry):
848         """Update the item from the feedparser entry given."""
849         for key in entry.keys():
850             if key in self.IGNORE_KEYS or key + "_parsed" in self.IGNORE_KEYS:
851                 # Ignored fields
852                 pass
853             elif entry.has_key(key + "_parsed"):
854                 # Ignore unparsed date fields
855                 pass
856             elif key.endswith("_detail"):
857                 # retain name, email, and language sub-fields
858                 if entry[key].has_key('name') and entry[key].name:
859                     self.set_as_string(key.replace("_detail","_name"), \
860                         entry[key].name)
861                 if entry[key].has_key('email') and entry[key].email:
862                     self.set_as_string(key.replace("_detail","_email"), \
863                         entry[key].email)
864                 if entry[key].has_key('language') and entry[key].language and \
865                    (not self._channel.has_key('language') or \
866                    entry[key].language != self._channel.language):
867                     self.set_as_string(key.replace("_detail","_language"), \
868                         entry[key].language)
869             elif key.endswith("_parsed"):
870                 # Date fields
871                 if entry[key] is not None:
872                     self.set_as_date(key[:-len("_parsed")], entry[key])
873             elif key == "source":
874                 # Source field: save both url and value
875                 if entry[key].has_key("value"):
876                     self.set_as_string(key + "_name", entry[key].value)
877                 if entry[key].has_key("url"):
878                     self.set_as_string(key + "_link", entry[key].url)
879             elif key == "content":
880                 # Content field: concatenate the values
881                 value = ""
882                 for item in entry[key]:
883                     if item.type == 'text/html':
884                         item.value = sanitize.HTML(item.value)
885                     elif item.type == 'text/plain':
886                         item.value = escape(item.value)
887                     if item.has_key('language') and item.language and \
888                        (not self._channel.has_key('language') or
889                        item.language != self._channel.language) :
890                         self.set_as_string(key + "_language", item.language)
891                     value += cache.utf8(item.value)
892                 self.set_as_string(key, value)
893             elif isinstance(entry[key], (str, unicode)):
894                 # String fields
895                 try:
896                     detail = key + '_detail'
897                     if entry.has_key(detail):
898                         if entry[detail].has_key('type'):
899                             if entry[detail].type == 'text/html':
900                                 entry[key] = sanitize.HTML(entry[key])
901                             elif entry[detail].type == 'text/plain':
902                                 entry[key] = escape(entry[key])
903                     self.set_as_string(key, entry[key])
904                 except KeyboardInterrupt:
905                     raise
906                 except:
907                     log.exception("Ignored '%s' of <%s>, unknown format",
908                                   key, self.id)
909
910         # Generate the date field if we need to
911         self.get_date("date")
912
913     def get_date(self, key):
914         """Get (or update) the date key.
915
916         We check whether the date the entry claims to have been changed is
917         since we last updated this feed and when we pulled the feed off the
918         site.
919
920         If it is then it's probably not bogus, and we'll sort accordingly.
921
922         If it isn't then we bound it appropriately, this ensures that
923         entries appear in posting sequence but don't overlap entries
924         added in previous updates and don't creep into the next one.
925         """
926
927         for other_key in ("updated", "modified", "published", "issued", "created"):
928             if self.has_key(other_key):
929                 date = self.get_as_date(other_key)
930                 break
931         else:
932             date = None
933
934         if date is not None:
935             if date > self._channel.updated:
936                 date = self._channel.updated
937 #            elif date < self._channel.last_updated:
938 #                date = self._channel.updated
939         elif self.has_key(key) and self.key_type(key) != self.NULL:
940             return self.get_as_date(key)
941         else:
942             date = self._channel.updated
943
944         self.set_as_date(key, date)
945         return date
946
947     def get_content(self, key):
948         """Return the key containing the content."""
949         for key in ("content", "tagline", "summary"):
950             if self.has_key(key) and self.key_type(key) != self.NULL:
951                 return self.get_as_string(key)
952
953         return ""