""" parser.http.movieParser module (imdb package). This module provides the classes (and the instances), used to parse the IMDb pages on the akas.imdb.com server about a movie. E.g., for Brian De Palma's "The Untouchables", the referred pages would be: combined details: http://akas.imdb.com/title/tt0094226/combined plot summary: http://akas.imdb.com/title/tt0094226/plotsummary ...and so on... Copyright 2004-2007 Davide Alberani This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation; either version 2 of the License, or (at your option) any later version. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with this program; if not, write to the Free Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA """ import re from imdb import imdbURL_base from imdb.Person import Person from imdb.Movie import Movie from imdb.utils import analyze_title from utils import ParserBase, build_person # Dictionary used to convert some section's names. _SECT_CONV = { 'directed': 'director', 'directed by': 'director', 'directors': 'director', 'writing credits': 'writer', 'writers': 'writer', 'produced': 'producer', 'cinematography': 'cinematographer', 'film editing': 'editor', 'casting': 'casting director', 'costume design': 'costume designer', 'makeup department': 'make up', 'production management': 'production manager', 'second unit director or assistant director': 'assistant director', 'costume and wardrobe department': 'costume department', 'sound department': 'sound crew', 'stunts': 'stunt performer', 'other crew': 'miscellaneous crew', 'also known as': 'akas', 'country': 'countries', 'runtime': 'runtimes', 'language': 'languages', 'certification': 'certificates', 'genre': 'genres', 'created': 'creator', 'color': 'color info', 'seasons': 'number of seasons'} # List of allowed sections. _SECT_KEEP = _SECT_CONV.values() + ['cast', 'original music', 'tv series', 'mpaa', 'non-original music', 'art direction', 'set decoration', 'art department', 'special effects', 'visual effects', 'sound mix', 'camera and electrical department', 'plot outline', 'production notes/status', 'production design', 'transportation department', 'editorial department', 'casting department', 'animation department', 'original air date', 'status', 'comments', 'status updated', 'note'] class HTMLMovieParser(ParserBase): """Parser for the "combined details" (and if instance.mdparse is True also for the "main details") page of a given movie. The page should be provided as a string, as taken from the akas.imdb.com server. The final result will be a dictionary, with a key for every relevant section. Example: mparser = HTMLMovieParser() result = mparser.parse(combined_details_html_string) """ def _reset(self): # If true, we're parsing the "maindetails" page; if false, # the "combined" page is expected. self.mdparse = False self._data = {} # The current section. self._section = u'' # Most of the data are collected in self._cur_txt; some boolean # variable are set to True when we're parsing a significant section. self._cur_txt = u'' self._in_tr = False self._in_info_div = False self._in_li = False # Some variable used to know when to collect data and when to stop. # XXX: things can be made much simpler. self._keep = False self._stop_here = False self._exclude_series = False self._title = u'' # Horrible hack to fix some incongruities of the data. self._in_td = False # Movie status. self._in_production_notes = False self._status_sect = u'' # Last movieID/personID/characterID seen. self._last_movie_id = None self._last_person_id = None self._cids = [] # Counter for the billing position. self._billingPos = 1 # Various information. self._in_h1 = False self._in_h5 = False self._in_b = False self._in_total_episodes = False self._total_episodes = u'' self._in_rating = False self._rating = u'' self._in_top250 = False self._top250 = u'' self._in_poster = False self._in_small = False # Companies information are stored slightly differently. self._in_blackcatheader = False self._in_post_blackcatheader = False def get_data(self): return self._data def start_h1(self, attrs): self._in_h1 = True def end_h1(self): self._in_h1 = False self._title = self._title.strip() seridx = self._title.find(')TV-Series') if seridx != -1: self._data['series years'] = self._title[seridx+10:].lstrip() self._title = self._title[:seridx+1].rstrip() if not self._title: return # The movie's title. self._data.update(analyze_title(self._title, canonical=1)) def _manage_section(self): # Do some transformation on the section name. cs = self._section.strip().lower() # Strip commas and parentheses. if cs[-1:] == ':': cs = cs[:-1].rstrip() if not cs: self._section = u'' return paridx = cs.find('(') if paridx != -1: cs = cs[:paridx].rstrip() cssplit = cs.split() # In tv series, the section name is preceded by 'Series'. if cssplit[0] in ('series', 'episode'): cssplit[:] = cssplit[1:] if cssplit: if cssplit[0] == 'cast': cssplit[:] = ['cast'] elif cssplit[-1] == 'by': cssplit[:] = cssplit[:-1] cs = ' '.join(cssplit) # Convert the section name, if present in _SECT_CONV. cs = _SECT_CONV.get(cs, cs) # Check if this is a section to keep. if cs not in _SECT_KEEP: if cs.endswith('department'): # The IMDb site seems prone to adding 'department' # categories at will. self._section = str(cs) self._keep = True elif not self._in_post_blackcatheader: # This is not a companies information, so it's ok to # discard it. cs = u'' self._keep = False else: # Companies information; do some transformation. if cs == 'special effects': cs = 'special effects companies' elif cs == 'other companies': cs = 'miscellaneous companies' self._section = str(cs) elif cs == 'production notes/status': self._in_production_notes = True self._keep = False else: self._section = str(cs) self._keep = True def start_h5(self, attrs): # Normally section names are enclosed in h5 tags. if self._exclude_series: return self._in_h5 = True self._keep = False self._stop_here = False self._section = u'' self._billingPos = 1 self._last_person_id = None self._cids = [] def end_h5(self): # If self._exclude_series, we're already looking at series-specific # information, while parsing an episode. if self._exclude_series: return self._in_h5 = False self._manage_section() if self.mdparse and self._section in _SECT_KEEP: # Parse also the upper "Directed by" and "Created by", while # httpThin is used (they are the only place these info are). self._in_tr = True start_h3 = start_h5 end_h3 = end_h5 def start_h6(self, attrs): # Production status is in h6 tags. if self._in_production_notes: self._in_h5 = True self.start_h5(attrs) def end_h6(self): if self._in_production_notes: self._in_h5 = False self.end_h5() def do_p(self, attrs): if self._in_production_notes: self._in_production_notes = False def start_div(self, attrs): # Major information sets are enclosed in div tags with class=info. if self._exclude_series: return if self.get_attr_value(attrs, 'class') == 'info': self._in_info_div = True self._cur_txt = u'' def end_div(self): if self._exclude_series: return if not self._keep: return if self._in_info_div: self._add_info() self._in_info_div = False elif self._in_production_notes: # End of 'status note'. self._add_info() self._in_production_notes = False def start_b(self, attrs): # Companies information are stored in section enclosed in b tags # with class=blackcatheader. self._in_b = True self._in_post_blackcatheader = False if self.get_attr_value(attrs, 'class') == 'blackcatheader': self._in_blackcatheader = True self._keep = False self._section = u'' def end_b(self): self._in_b = False if self._in_blackcatheader: self._in_blackcatheader = False self._keep = True self._in_post_blackcatheader = True self._manage_section() def start_li(self, attrs): # Most of companies info are in li tags. self._in_li = True def end_li(self): self._in_li = False if self._in_post_blackcatheader and self._section: self._add_info() def start_small(self, attrs): self._in_small = True def end_small(self): self._in_small = False # Rating and votes. if not self._in_rating: return self._in_rating = False rav = self._rating.strip() if not rav: return i = rav.find('/10') if i != -1: rating = rav[:i] try: rating = float(rating) self._data['rating'] = rating except ValueError: pass i = rav.find('(') if i != -1: votes = rav[i+1:] j = votes.find(' ') votes = votes[:j].replace(',', u'') try: votes = int(votes) self._data['votes'] = votes except ValueError: pass def start_span(self, attrs): pass def end_span(self): # Handle the span for 'status note'. if not self._in_production_notes: return self._add_info() self._in_production_notes = False def _add_info(self): # Used to add information about h5, h6 and b sections. ct = self._cur_txt.strip() if not ct: self._cur_txt = u'' return if self._section in ('director', 'writer'): self._cur_txt = u'' return if self._section in ('status', 'comments', 'status updated', 'note', 'status note'): if not self._section.startswith('status'): self._section = 'status %s' % self._section self._data[self._section] = ct elif self._section == 'plot outline': self._data[self._section] = ct elif self._section == 'mpaa': self._data[self._section] = ct elif self._section == 'number of seasons': self._data[self._section] = ct.count('|') + 1 elif self._section == 'tv series': if self._data.get('kind') == 'episode' and \ self._last_movie_id is not None: m = Movie(title=ct, movieID=self._last_movie_id, accessSystem=self._as, modFunct=self._modFunct) self._data['episode of'] = m self._cur_txt = u'' return elif self._section == 'original air date': aid = self.re_airdate.findall(ct) if aid and len(aid[0]) == 3: date, season, episode = aid[0] date = date.strip() try: season = int(season) except: pass try: episode = int(episode) except: pass if date and date != '????': self._data['original air date'] = date # Handle also "episode 0". if season or type(season) is type(0): self._data['season'] = season if episode or type(season) is type(0): self._data['episode'] = episode elif self._section in ('countries', 'genres', 'languages', 'runtimes', 'color info', 'sound mix', 'certificates'): if self._section == 'runtimes': ct = ct.replace(' min', u'')##.replace(' (', '::(') ##elif self._section == 'certificates': ## ct = ct.replace(' (', '::(') splitted_info = ct.split(' / ') splitted_info[:] = [x.strip() for x in splitted_info] splitted_info[:] = filter(None, splitted_info) splitted_info[:] = [x.replace(' (', '::(', 1) for x in splitted_info] if not self._data.has_key(self._section): self._data[self._section] = splitted_info elif self._section == 'miscellaneous companies': self._data.setdefault(self._section, []).append(ct.replace(' ', '::', 1)) elif self._section != 'cast': self._data.setdefault(self._section, []).append(ct) self._cur_txt = u'' def do_br(self, attrs): # Do some transformation on akas. if not self._keep: return if self._section == 'akas': self._cur_txt = self._cur_txt.replace(' ', ' ') self._cur_txt = self._cur_txt.replace(' ', ' ') self._cur_txt = self._cur_txt.replace(' (', '::(', 1) self._cur_txt = self._cur_txt.replace(' [', '::[', 1) self._add_info() elif self._in_production_notes: self._add_info() self._keep = False if self.mdparse and self._section in _SECT_KEEP: if self._cur_txt.endswith('...'): self._cur_txt = self._cur_txt[:-3] self.end_tr() self._in_tr = True def start_tr(self, attrs): self._in_tr = True def end_tr(self): # Add cast/roles information. self._in_tr = False ct = self._cur_txt = self._cur_txt.strip() if not self._keep: self._cur_txt = u'' return if self._last_person_id is None: self._cur_txt = u'' return if not ct: return if ct[0] == '(' and ct[-1] == ')': self._cur_txt = u'' return if self._section == 'cast' and ct.startswith('rest of cast listed'): self._cur_txt = u'' return cids = self._cids if not cids: cids = None elif len(cids) == 1: cids = cids[0] p = build_person(ct, personID=self._last_person_id, billingPos=self._billingPos, roleID=cids, accessSystem=self._as, modFunct=self._modFunct) self._data.setdefault(self._section, []).append(p) self._billingPos += 1 self._cur_txt = u'' self._last_person_id = None self._cids = [] def start_td(self, attrs): if not (self._keep and self._cur_txt and self._last_person_id): return self._in_td = True def end_td(self): self._in_td = False def start_a(self, attrs): href = self.get_attr_value(attrs, 'href') if self.get_attr_value(attrs, 'title') == 'Full Episode List': self._in_total_episodes = True elif href and href.startswith('/chart/top?tt'): self._in_top250 = True elif self.get_attr_value(attrs, 'name') == 'poster': self._in_poster = True # From here on, we're inside some kind of information and a href. if not (self._keep and href): return # Hack! Keep in mind, if it will ever be needed to know if # we're inside a td tag; that's here to prevent lines like: # Person Name # to trigger some code in the _handle_data method. self._in_td = False # Collect personID, movieID and characterID. if href.startswith('/name/nm'): cur_id = self.re_imdbID.findall(href) if cur_id: self._last_person_id = cur_id[-1] return elif self._data.get('kind') == 'episode' and \ href.startswith('/title/tt'): cur_id = self.re_imdbID.findall(href) if cur_id: self._last_movie_id = cur_id[-1] elif href.startswith('/character/ch'): cur_id = self.re_imdbID.findall(href) if cur_id: lid = cur_id[-1] self._cids.append(lid) elif self.mdparse and href.startswith('fullcredits#'): # The "more" link at the end of the cast. self._in_tr = False if self._in_info_div: # The various "more" links. cls = self.get_attr_value(attrs, 'class') if cls and cls.startswith('tn15more'): self._stop_here = True def end_a(self): if self._in_total_episodes: self._in_total_episodes = False try: te = int(self._total_episodes.strip().split()[0]) self._data['number of episodes'] = te except: pass self._total_episodes = u'' elif self._in_top250: self._in_top250 = False self._top250 = self._top250.strip() posidx = self._top250.find('#') if posidx != -1: top250 = self._top250[posidx+1:] try: self._data['top 250 rank'] = int(top250) except: pass elif self._in_poster: self._in_poster = False def do_img(self, attrs): if self._in_poster: src = self.get_attr_value(attrs, 'src') if src: self._data['cover url'] = src # For some funny reason the cast section is tagged by an image. alt = self.get_attr_value(attrs, 'alt') if alt and alt.lower() == 'cast': self._section = 'cast' self._manage_section() def _handle_data(self, data): if self._in_h5 or self._in_blackcatheader: # Section's name. self._section += data elif self._in_h1 and not self._in_small: self._title += data elif self._in_rating: self._rating += data if self._in_b: sldata = data.strip().lower() if sldata == 'user rating:': self._in_rating = True elif sldata == 'series crew': self._exclude_series = True elif self._in_total_episodes: self._total_episodes += data elif self._in_top250: self._top250 += data if self._stop_here or self._exclude_series or not self._keep: return # Collect the data. if self._in_tr or self._in_info_div or self._in_li or \ self._in_production_notes: if self._in_td: if data == ' ': self._cur_txt += ' ....' if self._section == 'cast': nrSep = data.count(' / ') if nrSep > 0: sdata = data.strip() if sdata == '/' or (sdata.endswith(' /') and sdata.startswith('/ ')): nrSep -= 1 self._cids += [None]*nrSep self._cur_txt += data class HTMLPlotParser(ParserBase): """Parser for the "plot summary" page of a given movie. The page should be provided as a string, as taken from the akas.imdb.com server. The final result will be a dictionary, with a 'plot' key, containing a list of string with the structure: 'summary_author ::summary'. Example: pparser = HTMLPlotParser() result = pparser.parse(plot_summary_html_string) """ _defGetRefs = True def _init(self): self._plot_data = {} def _reset(self): """Reset the parser.""" self._plot_data.clear() self._is_plot = 0 self._stop_plot = 0 self._plot = u'' self._is_plot_writer = 0 self._plot_writer = u'' def get_data(self): """Return the dictionary with the 'plot' key.""" return self._plot_data def start_p(self, attrs): pclass = self.get_attr_value(attrs, 'class') if pclass and pclass.lower() == 'plotpar': self._is_plot = 1 self._stop_plot = 0 def end_p(self): if not self._is_plot: return plot = self._plot.strip() writer = self._plot_writer.strip() if plot: # Replace funny email separators. writer = writer.replace('{', '<').replace('}', '>') txt = plot if writer: txt = writer + '::' + plot self._plot_data.setdefault('plot', []).append(txt) self._is_plot = 0 self._plot_writer = u'' self._plot = u'' def start_a(self, attrs): if not self._is_plot: return link = self.get_attr_value(attrs, 'href') # The next data is the name of the author. if link and link.lower().startswith('/searchplotwriters'): self._is_plot_writer = 1 self._stop_plot = 1 def end_a(self): if self._is_plot_writer: self._is_plot_writer = 0 def start_i(self, attrs): if self._is_plot: self._stop_plot = 1 def end_i(self): pass def _handle_data(self, data): # Store text for plots and authors. if self._is_plot and not self._stop_plot: self._plot += data if self._is_plot_writer: self._plot_writer += data class HTMLAwardsParser(ParserBase): """Parser for the "awards" page of a given person or movie. The page should be provided as a string, as taken from the akas.imdb.com server. The final result will be a dictionary, with a key for every relevant section. Example: awparser = HTMLAwardsParser() result = awparser.parse(awards_html_string) """ def _init(self): self._aw_data = [] # Are we managing awards for a person or a movie? self.subject = 'title' def _reset(self): """Reset the parser.""" self._aw_data = [] self._is_big = 0 self._is_small = 0 self._is_current_assigner = 0 self._begin_aw = 0 self._in_td = 0 self._cur_year = u'' self._cur_result = u'' self._cur_notes = u'' self._cur_category = u'' self._cur_forto = u'' self._cur_assigner = u'' self._cur_award = u'' self._cur_sect = u'' self._no = 0 self._rowspan = 0 self._counter = 1 self._limit = 1 self._is_tn = 0 self._cur_id = u'' self._t_o_n = u'' self._to = [] self._for = [] self._with = [] self._begin_to_for = 0 self._cur_role = u'' self._cur_tn = u'' # XXX: a Person or Movie object is instantiated only once (i.e.: # every reference to a given movie/person is the _same_ # object). self._person_obj_list = [] self._movie_obj_list = [] def get_data(self): """Return the dictionary.""" if not self._aw_data: return {} return {'awards': self._aw_data} def start_big(self, attrs): self._is_big = 1 def end_big(self): self._is_big = 0 def start_td(self, attrs): self._in_td = 1 if not self._begin_aw: return rowspan = self.get_attr_value(attrs, 'rowspan') or '1' try: rowspan = int(rowspan) except (ValueError, OverflowError): rowspan = 1 self._rowspan = rowspan colspan = self.get_attr_value(attrs, 'colspan') or '1' try: colspan = int(colspan) except (ValueError, OverflowError): colspan = 1 if colspan == 4: self._no = 1 def end_td(self): if self._no or not self._begin_aw: return if self._cur_sect == 'year': self._cur_sect = 'res' elif self._cur_sect == 'res': self._limit = self._rowspan self._counter = 1 self._cur_sect = 'award' elif self._cur_sect == 'award': self._cur_sect = 'cat' elif self._cur_sect == 'cat': self._counter += 1 self.store_data() self._begin_to_for = 0 # XXX: if present, the next "Category/Recipient(s)" # has a different "Result", so go back and read it. if self._counter == self._limit+1: self._cur_result = u'' self._cur_award = u'' self._cur_sect = 'res' self._counter = 1 def store_data(self): year = self._cur_year.strip() res = self._cur_result.strip() aw = self._cur_award.strip() notes = self._cur_notes.strip() assign = self._cur_assigner.strip() cat = self._cur_category.strip() d = {'year': year, 'result': res, 'award': aw, 'notes': notes, 'assigner': assign, 'category': cat, 'for': self._for, 'to': self._to, 'with': self._with} # Remove empty keys. for key in d.keys(): if not d[key]: del d[key] self._aw_data.append(d) self._cur_notes = u'' self._cur_category = u'' self._cur_forto = u'' self._with = [] self._to = [] self._for = [] self._cur_role = u'' def start_th(self, attrs): self._begin_aw = 0 def end_th(self): pass def start_table(self, attrs): pass def end_table(self): self._begin_aw = 0 self._in_td = 0 def start_small(self, attrs): self._is_small = 1 def end_small(self): self._is_small = 0 def start_a(self, attrs): href = self.get_attr_value(attrs, 'href') if not href: return if href.startswith('/Sections/Awards'): if self._in_td: try: year = unicode(int(href[-4:])) except (ValueError, TypeError): year = None if year: self._cur_sect = 'year' self._cur_year = year self._begin_aw = 1 self._counter = 1 self._limit = 1 self._no = 0 self._cur_result = u'' self._cur_notes = u'' self._cur_category = u'' self._cur_forto = u'' self._cur_award = u'' self._with = [] self._to = [] self._for = [] if self._is_big: self._is_current_assigner = 1 self._cur_assigner = u'' elif href.startswith('/name') or href.startswith('/title'): if self._is_small: return tn = self.re_imdbID.findall(href) if tn: self._cur_id = tn[-1] self._is_tn = 1 self._cur_tn = u'' if href.startswith('/name'): self._t_o_n = 'n' else: self._t_o_n = 't' def end_a(self): if self._is_current_assigner: self._is_current_assigner = 0 if self._is_tn and self._cur_sect == 'cat': self._cur_tn = self._cur_tn.strip() self._cur_role = self._cur_role.strip() if self.subject == 'name': if self._t_o_n == 't': self._begin_to_for = 1 m = Movie(title=self._cur_tn, movieID=str(self._cur_id), accessSystem=self._as, modFunct=self._modFunct) if m in self._movie_obj_list: ind = self._movie_obj_list.index(m) m = self._movie_obj_list[ind] else: self._movie_obj_list.append(m) self._for.append(m) else: p = Person(name=self._cur_tn, personID=str(self._cur_id), currentRole=self._cur_role, accessSystem=self._as, modFunct=self._modFunct) if p in self._person_obj_list: ind = self._person_obj_list.index(p) p = self._person_obj_list[ind] else: self._person_obj_list.append(p) self._with.append(p) else: if self._t_o_n == 't': m = Movie(title=self._cur_tn, movieID=str(self._cur_id), accessSystem=self._as, modFunct=self._modFunct) if m in self._movie_obj_list: ind = self._movie_obj_list.index(m) m = self._movie_obj_list[ind] else: self._movie_obj_list.append(m) self._with.append(m) else: self._begin_to_for = 1 p = Person(name=self._cur_tn, personID=str(self._cur_id), currentRole=self._cur_role, accessSystem=self._as, modFunct=self._modFunct) if p in self._person_obj_list: ind = self._person_obj_list.index(p) p = self._person_obj_list[ind] else: self._person_obj_list.append(p) self._to.append(p) self._cur_role = u'' self._is_tn = 0 def _handle_data(self, data): if self._is_current_assigner: self._cur_assigner += data if not self._begin_aw or not data or data.isspace() or self._no: return sdata = data.strip() sldata = sdata.lower() if self._cur_sect == 'res': self._cur_result += data elif self._cur_sect == 'award': self._cur_award += data elif self._cur_sect == 'cat': if self._is_tn: self._cur_tn += data elif sldata not in ('for:', 'shared with:'): if self._is_small: self._cur_notes += data elif self._begin_to_for: self._cur_role += data else: self._cur_category += data class HTMLTaglinesParser(ParserBase): """Parser for the "taglines" page of a given movie. The page should be provided as a string, as taken from the akas.imdb.com server. The final result will be a dictionary, with a key for every relevant section. Example: tparser = HTMLTaglinesParser() result = tparser.parse(taglines_html_string) """ def _reset(self): """Reset the parser.""" self._in_tl = 0 self._in_h1 = 0 self._in_tlu2 = 0 self._tl = [] self._ctl = u'' self._seen_left_div = 0 def get_data(self): """Return the dictionary.""" self._tl[:] = [x.strip() for x in self._tl] self._tl[:] = filter(None, self._tl) if not self._tl: return {} return {'taglines': self._tl} def start_h1(self, attrs): self._in_h1 = 1 def end_h1(self): self._in_h1 = 0 if self._in_tlu2: self._in_tl = 1 def _end_content(self): self._in_tl = 1 def start_div(self, attrs): cls = self.get_attr_value(attrs, 'class') if cls and cls.strip().lower() == 'left': self._seen_left_div = 1 def end_div(self): pass def start_table(self, attrs): pass def end_table(self): self._ctl = u'' def start_p(self, attrs): pass def end_p(self): if self._in_tl and self._ctl and not self._seen_left_div: self._tl.append(self._ctl.strip()) self._ctl = u'' def _handle_data(self, data): if self._in_tl and self._in_content: self._ctl += data elif self._in_h1 and data.lower().find('taglines for') != -1: self._in_tlu2 = 1 class HTMLKeywordsParser(ParserBase): """Parser for the "keywords" page of a given movie. The page should be provided as a string, as taken from the akas.imdb.com server. The final result will be a dictionary, with a key for every relevant section. Example: kwparser = HTMLKeywordsParser() result = kwparser.parse(keywords_html_string) """ def _reset(self): """Reset the parser.""" self._in_kw = 0 self._kw = [] self._ckw = u'' def get_data(self): """Return the dictionary.""" if not self._kw: return {} return {'keywords': self._kw} def start_b(self, attrs): if self.get_attr_value(attrs, 'class') == 'keyword': self._in_kw = 1 def end_b(self): if self._in_kw: self._kw.append(self._ckw.strip()) self._ckw = u'' self._in_kw = 0 def start_a(self, attrs): if not self._in_kw: return href = self.get_attr_value(attrs, 'href') if not href: return kwi = href.find('keyword/') if kwi == -1: return kw = href[kwi+8:].strip() if not kw: return if kw[-1] == '/': kw = kw[:-1].strip() if kw: self._ckw = kw def end_a(self): pass class HTMLAlternateVersionsParser(ParserBase): """Parser for the "alternate versions" and "trivia" pages of a given movie. The page should be provided as a string, as taken from the akas.imdb.com server. The final result will be a dictionary, with a key for every relevant section. Example: avparser = HTMLAlternateVersionsParser() result = avparser.parse(alternateversions_html_string) """ _defGetRefs = True def _init(self): self.kind = 'alternate versions' def _reset(self): """Reset the parser.""" self._in_av = 0 self._in_avd = 0 self._av = [] self._cav = u'' self._stlist = [] self._curst = {} self._cur_title = u'' self._curinfo = u'' def get_data(self): """Return the dictionary.""" if self.kind == 'soundtrack': if self._stlist: return {self.kind: self._stlist} else: return {} if not self._av: return {} return {self.kind: self._av} def start_ul(self, attrs): if self.get_attr_value(attrs, 'class') == 'trivia': self._in_av = 1 def end_ul(self): self._in_av = 0 def start_li(self, attrs): if self._in_av: self._in_avd = 1 def end_li(self): if self._in_av and self._in_avd: if self.kind == 'soundtrack': self._stlist.append(self._curst.copy()) self._curst.clear() self._cur_title = u'' self._curinfo = u'' else: self._av.append(self._cav.strip()) self._in_avd = 0 self._cav = u'' def do_br(self, attrs): if self._in_avd and self.kind == 'soundtrack': if not self._cur_title: self._cav = self._cav.strip() if self._cav and self._cav[-1] == '"': self._cav = self._cav[:-1] if self._cav and self._cav[0] == '"': self._cav = self._cav[1:] self._cur_title = self._cav self._curst[self._cur_title] = {} self._cav = u'' else: lcw = self._cav.lower() for i in ('with', 'by', 'from', 'of'): posi = lcw.find(i) if posi != -1: self._curinfo = self._cav[:posi+len(i)] if self.kind == 'soundtrack': self._curinfo = self._curinfo.lower().strip() rest = self._cav[posi+len(i)+1:] self._curst[self._cur_title][self._curinfo] = \ rest break else: if not lcw.strip(): return if not self._curst[self._cur_title].has_key('misc'): self._curst[self._cur_title]['misc'] = u'' if self._curst[self._cur_title]['misc'] and \ self._curst[self._cur_title]['misc'][-1] != ' ': self._curst[self._cur_title]['misc'] += ' ' self._curst[self._cur_title]['misc'] += self._cav self._cav = u'' def _handle_data(self, data): if self._in_avd: self._cav += data class HTMLCrazyCreditsParser(ParserBase): """Parser for the "crazy credits" page of a given movie. The page should be provided as a string, as taken from the akas.imdb.com server. The final result will be a dictionary, with a key for every relevant section. Example: ccparser = HTMLCrazyCreditsParser() result = ccparser.parse(crazycredits_html_string) """ _defGetRefs = True def _reset(self): """Reset the parser.""" self._cc = [] self._in_cc = False self._ccc = u'' def get_data(self): """Return the dictionary.""" if not self._cc: return {} return {'crazy credits': self._cc} def start_ul(self, attrs): if self._in_content: self._in_cc = True def end_ul(self): self._in_cc = False def do_br(self, attrs): if not self._in_cc: return if self._ccc: self._ccc += u' ' def start_li(self, attrs): self._ccc = u'' def end_li(self): if not self._in_cc: return self._ccc = self._ccc.strip() if self._ccc: self._ccc = self._ccc.replace('\n', ' ').replace(' ', ' ') self._cc.append(self._ccc) self._ccc = u'' def _handle_data(self, data): if self._in_cc: self._ccc += data class HTMLGoofsParser(ParserBase): """Parser for the "goofs" page of a given movie. The page should be provided as a string, as taken from the akas.imdb.com server. The final result will be a dictionary, with a key for every relevant section. Example: gparser = HTMLGoofsParser() result = gparser.parse(goofs_html_string) """ _defGetRefs = True def _reset(self): """Reset the parser.""" self._in_go = 0 self._in_go2 = 0 self._go = [] self._cgo = u'' self._in_gok = 0 self._cgok = u'' def get_data(self): """Return the dictionary.""" if not self._go: return {} return {'goofs': self._go} def start_ul(self, attrs): if self.get_attr_value(attrs, 'class') == 'trivia': self._in_go = 1 def end_ul(self): self._in_go = 0 def start_b(self, attrs): if self._in_go2: self._in_gok = 1 def end_b(self): self._in_gok = 0 def start_li(self, attrs): if self._in_go: self._in_go2 = 1 def end_li(self): if self._in_go and self._in_go2: self._in_go2 = 0 self._go.append('%s:%s' % (self._cgok.strip().lower(), self._cgo.strip())) self._cgo = u'' self._cgok = u'' def _handle_data(self, data): if self._in_gok: self._cgok += data elif self._in_go2: self._cgo += data class HTMLQuotesParser(ParserBase): """Parser for the "memorable quotes" page of a given movie. The page should be provided as a string, as taken from the akas.imdb.com server. The final result will be a dictionary, with a key for every relevant section. Example: qparser = HTMLQuotesParser() result = qparser.parse(quotes_html_string) """ _defGetRefs = True def _reset(self): """Reset the parser.""" self._in_quo2 = 0 self._quo = [] self._cquo = u'' def get_data(self): """Return the dictionary.""" if not self._quo: return {} quo = [] for q in self._quo: if q.endswith('::'): q = q[:-2] quo.append(q) return {'quotes': quo} def start_a(self, attrs): if not self._in_content: return name = self.get_attr_value(attrs, 'name') if name and name.startswith('qt'): self._in_quo2 = 1 def end_a(self): pass def start_h3(self, attrs): self._in_quo2 = 0 self._cquo = u'' def end_h3(self): pass def do_hr(self, attrs): if self._in_content and self._in_quo2 and self._cquo: self._cquo = self._cquo.strip() if self._cquo.endswith('::'): self._cquo = self._cquo[:-2] self._quo.append(self._cquo.strip()) self._cquo = u'' def start_div(self, attrs): if self._in_content and self._in_quo2: self.do_hr([]) self._in_content = 0 def end_div(self): pass def start_img(self, attrs): self._in_quo2 = 0 def end_img(self): pass def do_br(self, attrs): if self._in_content and self._in_quo2 and self._cquo: self._cquo = '%s::' % self._cquo.strip() def _handle_data(self, data): if self._in_content and self._in_quo2: data = data.replace('\n', ' ') if self._cquo.endswith('::'): data = data.lstrip() self._cquo += data class HTMLReleaseinfoParser(ParserBase): """Parser for the "release dates" page of a given movie. The page should be provided as a string, as taken from the akas.imdb.com server. The final result will be a dictionary, with a key for every relevant section. Example: rdparser = HTMLReleaseinfoParser() result = rdparser.parse(releaseinfo_html_string) """ def _reset(self): """Reset the parser.""" self._in_rl = 0 self._in_rl2 = 0 self._rl = [] self._crl = u'' self._is_country = 0 def get_data(self): """Return the dictionary.""" if not self._rl: return {} return {'release dates': self._rl} def start_th(self, attrs): if self.get_attr_value(attrs, 'class') == 'xxxx': self._in_rl = 1 def end_th(self): pass def start_a(self, attrs): if self._in_rl: href = self.get_attr_value(attrs, 'href') if href and href.startswith('/Recent'): self._in_rl2 = 1 self._is_country = 1 def end_a(self): if self._is_country: if self._crl: self._crl += '::' self._is_country = 0 def start_tr(self, attrs): pass def end_tr(self): if self._in_rl2: self._in_rl2 = 0 self._rl.append(self._crl) self._crl = u'' def _handle_data(self, data): if self._in_rl2: if self._crl and self._crl[-1] not in (' ', ':') \ and not data.isspace(): self._crl += ' ' self._crl += data.strip() class HTMLRatingsParser(ParserBase): """Parser for the "user ratings" page of a given movie. The page should be provided as a string, as taken from the akas.imdb.com server. The final result will be a dictionary, with a key for every relevant section. Example: rparser = HTMLRatingsParser() result = rparser.parse(userratings_html_string) """ re_means = re.compile('mean\s*=\s*([0-9]\.[0-9])\.\s*median\s*=\s*([0-9])', re.I) def _reset(self): """Reset the parser.""" self._in_t = 0 self._in_total = 0 self._in_b = 0 self._cur_nr = u'' self._in_cur_vote = 0 self._cur_vote = u'' self._in_weighted = 0 self._weighted = None self._first = 0 self._votes = {} self._rank = {} self._demo = {} self._in_p = 0 self._in_demo = 0 self._in_demo_t = 0 self._cur_demo_t = u'' self._cur_demo_av = u'' self._next_is_demo_vote = 0 self._next_demo_vote = u'' self._in_td = 0 def get_data(self): """Return the dictionary.""" data = {} if self._votes: data['number of votes'] = self._votes if self._demo: data['demographic'] = self._demo tot_votes = self._demo.get('all votes') if tot_votes: data['votes'] = tot_votes[0] if self._weighted is not None: data['rating'] = self._weighted data.update(self._rank) return data def start_table(self, attrs): self._in_t = 1 def end_table(self): self._in_t = 0 self._in_total = 0 def start_b(self, attrs): self._in_b = 1 def end_b(self): self._in_b = 0 def start_td(self, attrs): self._in_td = 1 def end_td(self): self._in_td = 0 if self._in_total: if self._first: self._first = 0 def start_tr(self, attrs): if self._in_total: self._first = 1 def end_tr(self): if self._in_total: if self._cur_nr: try: c = int(self._cur_vote) n = int(self._cur_nr.replace(',', '')) self._votes[c] = n except (ValueError, OverflowError): pass self._cur_nr = u'' self._cur_vote = u'' if self._in_demo: self._in_demo = 0 try: av = float(self._cur_demo_av) dv = int(self._next_demo_vote.replace(',', '')) self._demo[self._cur_demo_t] = (dv, av) except (ValueError, OverflowError): pass self._cur_demo_av = u'' self._next_demo_vote = u'' self._cur_demo_t = u'' def start_p(self, attrs): self._in_p = 1 def end_p(self): self._in_p = 0 def start_a(self, attrs): href = self.get_attr_value(attrs, 'href') if href: if href.startswith('ratings-'): self._in_demo = 1 self._in_demo_t = 1 elif href.startswith('/List?ratings='): self._in_weighted = 1 def end_a(self): self._in_demo_t = 0 self._in_weighted = 0 def _handle_data(self, data): if self._in_b and data == 'Rating': self._in_total = 1 sdata = data.strip() if not sdata: return if self._first: self._cur_nr = sdata else: self._cur_vote = sdata if self._in_p: if self._in_weighted: try: # The 'weighted average' is the usual rating. self._weighted = float(data.strip()) except (ValueError, OverflowError): pass elif sdata.startswith('Ranked #'): sd = sdata[8:] i = sd.find(' ') if i != -1: sd = sd[:i] try: sd = int(sd) except (ValueError, OverflowError): pass if type(sd) is type(0): self._rank['top 250 rank'] = sd elif sdata.startswith('Arithmetic mean = '): means = self.re_means.findall(sdata) if means and len(means[0]) == 2: am, med = means[0] try: am = float(am) except (ValueError, OverflowError): pass if type(am) is type(1.0): self._rank['arithmetic mean'] = am try: med = int(med) except (ValueError, OverflowError): pass if type(med) is type(0): self._rank['median'] = med if self._in_demo: if self._next_is_demo_vote: self._next_demo_vote = sdata self._next_is_demo_vote = 0 elif self._in_demo_t: self._cur_demo_t = sdata.lower() self._next_is_demo_vote = 1 else: self._cur_demo_av = sdata elif self._in_td and sdata.startswith('All votes'): self._in_demo = 1 self._next_is_demo_vote = 1 self._cur_demo_t = 'all votes' class HTMLEpisodesRatings(ParserBase): """Parser for the "episode ratings ... by date" page of a given movie. The page should be provided as a string, as taken from the akas.imdb.com server. The final result will be a dictionary, with a key for every relevant section. Example: erparser = HTMLEpisodesRatings() result = erparser.parse(eprating_html_string) """ def _reset(self): """Reset the parser.""" self._res = [] self._cur_data = {} self._in_h4 = 0 self._in_rating = 0 self._cur_info = 'season.episode' self._cur_info_txt = u'' self._cur_id = u'' self._in_title = 0 self._series_title = u'' self._series_obj = None self._in_td = 0 def get_data(self): """Return the dictionary.""" if not self._res: return {} return {'episodes rating': self._res} def start_title(self, attrs): self._in_title = 1 def end_title(self): self._in_title = 0 self._series_title = self._series_title.strip() if self._series_title: self._series_obj = Movie(title=self._series_title, accessSystem=self._as, modFunct=self._modFunct) def start_h4(self, attrs): self._in_h4 = 1 def end_h4(self): self._in_h4 = 0 def start_table(self, attrs): pass def end_table(self): self._in_rating = 0 def start_tr(self, attrs): if self._in_rating: self._cur_info = 'season.episode' self._cur_info_txt = u'' def end_tr(self): if not self._in_rating: return if self._series_obj is None: return if self._cur_data and self._cur_id: ep_title = self._series_title ep_title += 'u {%s' % self._cur_data['episode'] if self._cur_data.has_key('season.episode'): ep_title += ' (#%s)' % self._cur_data['season.episode'] del self._cur_data['season.episode'] ep_title += '}' m = Movie(title=ep_title, movieID=self._cur_id, accessSystem=self._as, modFunct=self._modFunct) m['episode of'] = self._series_obj self._cur_data['episode'] = m if self._cur_data.has_key('rating'): try: self._cur_data['rating'] = float(self._cur_data['rating']) except ValueError: pass if self._cur_data.has_key('votes'): try: self._cur_data['votes'] = int(self._cur_data['votes']) except (ValueError, OverflowError): pass self._res.append(self._cur_data) self._cur_data = {} self._cur_id = u'' def start_td(self, attrs): self._in_td = 1 def end_td(self): self._in_td = 0 if not self._in_rating: return self._cur_info_txt = self._cur_info_txt.strip() if self._cur_info_txt: self._cur_data[self._cur_info] = self._cur_info_txt self._cur_info_txt = u'' if self._cur_info == 'season.episode': self._cur_info = 'episode' elif self._cur_info == 'episode': self._cur_info = 'rating' elif self._cur_info == 'rating': self._cur_info = 'votes' elif self._cur_info == 'votes': self._cur_info = 'season.episode' def start_a(self, attrs): if not (self._in_rating and self._cur_info == 'episode'): return href = self.get_attr_value(attrs, 'href') if not href: return curid = self.re_imdbID.findall(href) if not curid: return self._cur_id = curid[-1] def end_a(self): pass def _handle_data(self, data): if self._in_rating and self._in_td: self._cur_info_txt += data if self._in_h4 and data.strip().lower() == 'rated episodes': self._in_rating = 1 if self._in_title: self._series_title += data class HTMLOfficialsitesParser(ParserBase): """Parser for the "official sites", "external reviews", "newsgroup reviews", "miscellaneous links", "sound clips", "video clips" and "photographs" pages of a given movie. The page should be provided as a string, as taken from the akas.imdb.com server. The final result will be a dictionary, with a key for every relevant section. Example: osparser = HTMLOfficialsitesParser() result = osparser.parse(officialsites_html_string) """ def _init(self): self.kind = 'official sites' def _reset(self): """Reset the parser.""" self._in_os2 = 0 self._in_os3 = 0 self._os = [] self._cos = u'' self._cosl = u'' def get_data(self): """Return the dictionary.""" if not self._os: return {} return {self.kind: self._os} def start_ol(self, attrs): if self._in_content: self._in_os2 = 1 def end_ol(self): if self._in_os2: self._in_os2 = 0 def start_li(self, attrs): if self._in_os2: self._in_os3 = 1 def end_li(self): if self._in_os3: self._in_os3 = 0 if self._cosl and self._cos: self._os.append((self._cos.strip(), self._cosl.strip())) self._cosl = u'' self._cos = u'' def start_a(self, attrs): if self._in_os3: href = self.get_attr_value(attrs, 'href') if href: if not href.lower().startswith('http://'): if href.startswith('/'): href = href[1:] href = '%s%s' % (imdbURL_base, href) self._cosl = href def end_a(self): pass def _handle_data(self, data): if self._in_os3: self._cos += data class HTMLConnectionParser(ParserBase): """Parser for the "connections" page of a given movie. The page should be provided as a string, as taken from the akas.imdb.com server. The final result will be a dictionary, with a key for every relevant section. Example: connparser = HTMLConnectionParser() result = connparser.parse(connections_html_string) """ def _reset(self): """Reset the parser.""" self._connections = {} self._in_conn_type = False self._conn_type = u'' self._in_cur_title = False self._cur_title = u'' self._cur_id = u'' self._cur_note = u'' self._seen_br = False self._stop = False def get_data(self): """Return the dictionary.""" if not self._connections: return {} return {'connections': self._connections} def start_h5(self, attrs): if not self._in_content: return self._add_info() self._in_conn_type = True self._conn_type = u'' def end_h5(self): self._conn_type = self._conn_type.strip().lower() self._in_conn_type = False def start_div(self, attrs): pass def end_div(self): if self._stop: return self._add_info() def do_img(self, attrs): src = self.get_attr_value(attrs, 'src') if src and src.find('header_relatedlinks') != -1: self._stop = True def start_a(self, attrs): if not self._in_content: return href = self.get_attr_value(attrs, 'href') self._add_info() if not href: return imdbID = self.re_imdbID.findall(href) if imdbID: self._cur_id = str(imdbID[0]) self._in_cur_title = True def end_a(self): pass def do_br(self, attrs): if not self._in_content: return self._seen_br = True def _add_info(self): self._cur_title = self._cur_title.strip() self._cur_note = self._cur_note.strip() if self._cur_title and self._cur_id and self._conn_type: if self._cur_note and self._cur_note[0] == '-': self._cur_note = self._cur_note[1:].lstrip() m = Movie(movieID=str(self._cur_id), title=self._cur_title, accessSystem=self._as, notes=self._cur_note, modFunct=self._modFunct) self._connections.setdefault(self._conn_type, []).append(m) self._in_cur_title = False self._cur_title = u'' self._cur_id = u'' self._cur_note = u'' self._seen_br = False self._in_cur_title = False def _handle_data(self, data): if not self._in_content: return if self._stop: return if self._in_conn_type: self._conn_type += data elif self._in_cur_title: if self._seen_br: self._cur_note += data else: self._cur_title += data class HTMLLocationsParser(ParserBase): """Parser for the "locations" page of a given movie. The page should be provided as a string, as taken from the akas.imdb.com server. The final result will be a dictionary, with a key for every relevant section. Example: lparser = HTMLLocationsParser() result = lparser.parse(locations_html_string) """ def _reset(self): """Reset the parser.""" self._tc = {} self._dotc = 0 self._indt = 0 self._indd = 0 self._cur_sect = u'' self._curdata = [u''] self._cur_data = u'' self._locations = [] def get_data(self): """Return the dictionary.""" rl = self._locations rl[:] = [x.replace(':: ', '::').replace(' ::', '::') for x in rl] if rl: return {'locations': rl} return {} def start_dl(self, attrs): self._dotc = 1 def end_dl(self): self._dotc = 0 self._cur_data = self._cur_data.strip().strip(':').strip() if self._cur_data: self._locations.append(self._cur_data) def start_dt(self, attrs): self._cur_data = self._cur_data.strip().strip(':').strip() if self._cur_data: self._locations.append(self._cur_data) self._cur_data = u'' if self._dotc: self._indt = 1 def end_dt(self): self._indt = 0 def start_dd(self, attrs): if self._dotc: self._indd = 1 self._cur_data = self._cur_data.strip() if self._cur_data: if self._cur_data[-2:] != '::': self._cur_data += '::' def end_dd(self): pass def do_br(self, attrs): if self._indd: self._cur_data = self._cur_data.strip() ##if self._cur_data and 0: ## if self._cur_data[-2:] != '::': ## self._cur_data += '::' self._curdata += [u''] def _handle_data(self, data): if self._indd or self._indt: self._cur_data += data class HTMLTechParser(ParserBase): """Parser for the "technical", "business", "literature", "publicity" (for people) pages of a given movie. The page should be provided as a string, as taken from the akas.imdb.com server. The final result will be a dictionary, with a key for every relevant section. Example: tparser = HTMLTechParser() result = tparser.parse(technical_html_string) """ def _init(self): self.kind = 'tech' def _reset(self): """Reset the parser.""" self._tc = {} self._in_sect_title = 0 self._in_data = 0 self._cur_sect = u'' self._curdata = [u''] self._stop_collecting = False def get_data(self): """Return the dictionary.""" if self.kind in ('literature', 'business', 'contacts') and self._tc: return {self.kind: self._tc} return self._tc def _end_content(self): self._add_entry() def start_h5(self, attrs): if self._in_content: self._add_entry() self._in_sect_title = 1 def end_h5(self): self._in_sect_title = 0 self._in_data = 1 def start_div(self, attrs): cls = self.get_attr_value(attrs, 'class') if cls and cls == 'left': self._stop_collecting = True def end_div(self): pass def start_tr(self, attrs): pass def end_tr(self): if self._in_data and self.kind == 'publicity': if self._curdata: self.do_br([]) def start_td(self, attrs): pass def end_td(self): if self._in_data and self._curdata and self.kind == 'publicity': if self._curdata[-1].find('::') == -1: self._curdata[-1] += '::' def start_p(self, attrs): pass def end_p(self): if self._in_data and self.kind == 'publicity': if self._curdata: self._curdata[-1] += '::' self.do_br([]) def do_hr(self, attrs): self._stop_collecting = True def start_form(self, attrs): if self._in_data and self.kind == 'contacts': self._stop_collecting = True self._add_entry() def end_form(self): pass def _add_entry(self): self._curdata = [x.strip(':').strip() for x in self._curdata] self._curdata = filter(None, self._curdata) if self._cur_sect and self._curdata: self._tc[self._cur_sect] = self._curdata[:] self._curdata[:] = [u''] self._cur_sect = u'' self._in_data = 0 def do_br(self, attrs): if self._in_data: self._curdata += [u''] def _handle_data(self, data): if self._stop_collecting: return if self._in_data: self._curdata[-1] += data elif self._in_sect_title: data = data.lower() self._cur_sect += data class HTMLDvdParser(ParserBase): """Parser for the "dvd" page of a given movie. The page should be provided as a string, as taken from the akas.imdb.com server. The final result will be a dictionary, with a key for every relevant section. Example: dparser = HTMLDvdParser() result = dparser.parse(dvd_html_string) """ _defGetRefs = True def _init(self): self._dvd = [] def _reset(self): """Reset the parser.""" self._cdvd = {} self._indvd = 0 self._cur_sect = u'' self._cur_data = u'' self._insect = 0 self._intitle = 0 self._cur_title = u'' self._seencover = 0 def get_data(self): """Return the dictionary.""" if self._dvd: return {'dvd': self._dvd} return {} def start_table(self, attrs): cls = self.get_attr_value(attrs, 'class') if cls == 'dvd_section': self._indvd = 1 self._seencover = 0 self._processDataSet() def end_table(self): if self._indvd: self._processInfo() else: self._processDataSet() def do_hr(self, attrs): if not self._indvd: return self._processDataSet() def _processDataSet(self): self._processInfo() self._cur_title = self._cur_title.strip() if self._cdvd and self._cur_title: self._cdvd['title'] = self._cur_title self._dvd.append(self._cdvd) self._cdvd = {} self._cur_title = u'' def _processInfo(self): self._cur_sect = self._cur_sect.replace(':', u'').strip().lower() self._cur_data = self._cur_data.strip() if self._cur_sect and self._cur_data: self._cdvd[self._cur_sect] = self._cur_data self._cur_sect = u'' self._cur_data = u'' def do_img(self, attrs): if not self._indvd: return alt = self.get_attr_value(attrs, 'alt') if alt and alt.startswith('Rating: '): rating = alt[8:].strip() if rating: self._cdvd['rating'] = rating elif alt and not self._seencover: self._seencover = 1 src = self.get_attr_value(attrs, 'src') if src and src.find('noposter') == -1: if src[0] == '/': src = '%s%s' % (imdbURL_base, src[1:]) self._cdvd['cover'] = src def start_p(self, attrs): cls = self.get_attr_value(attrs, 'class') if cls == 'data_contents': self._processInfo() self._insect = 1 def end_p(self): pass def start_h3(self, attrs): if not self._indvd: return self._intitle = 1 def end_h3(self): self._intitle = 0 def start_b(self, attrs): pass def end_b(self): self._insect = 0 def start_span(self, attrs): if not self._indvd: return cls = self.get_attr_value(attrs, 'class') if cls == 'expand_icon': self._processInfo() self._insect = 1 def end_span(self): pass def start_div(self, attrs): if not self._indvd: return cls = self.get_attr_value(attrs, 'class') if cls in ('dvd_row_alt', 'dvd_row'): self._insect = 0 if self._cur_data: self._cur_data += '::' elif cls == 'dvd_section': self._processInfo() self._insect = 1 def end_div(self): pass def _handle_data(self, data): if not self._indvd: return if self._intitle: self._cur_title += data elif self._insect: self._cur_sect += data else: self._cur_data += data class HTMLRecParser(ParserBase): """Parser for the "recommendations" page of a given movie. The page should be provided as a string, as taken from the akas.imdb.com server. The final result will be a dictionary, with a key for every relevant section. Example: rparser = HTMLRecParser() result = rparser.parse(recommendations_html_string) """ def _reset(self): """Reset the parser.""" self._rec = {} self._firsttd = 0 self._curlist = u'' self._curtitle = u'' self._startgath = 0 self._intable = 0 self._inb = 0 self._cur_id = u'' self._no_more = 0 def get_data(self): if not self._rec: return {} return {'recommendations': self._rec} def start_a(self, attrs): href = self.get_attr_value(attrs, 'href') if href and href.find('RemoveRecommendations') != -1: self._no_more = 1 if self._firsttd: if href: tn = self.re_imdbID.findall(href) if tn: self._cur_id = tn[-1] def end_a(self): pass def start_table(self, attrs): self._intable = 1 def end_table(self): self._intable = 0 self._startgath = 0 def start_tr(self, attrs): self._firsttd = 1 def end_tr(self): pass def start_td(self, attrs): if self._firsttd and not self._no_more: span = self.get_attr_value(attrs, 'colspan') if span: self._firsttd = 0 def end_td(self): if self._firsttd and not self._no_more: self._curtitle = self._curtitle.strip() if self._curtitle: if self._curlist: if self._cur_id: m = Movie(movieID=str(self._cur_id), title=self._curtitle, accessSystem=self._as, modFunct=self._modFunct) self._rec.setdefault(self._curlist, []).append(m) self._cur_id = u'' self._curtitle = u'' self._firsttd = 0 def start_b(self, attrs): self._inb = 1 def end_b(self): self._inb = 0 def _handle_data(self, data): if self._no_more: return ldata = data.lower() if self._intable and self._inb: if ldata.find('suggested by the database') != -1: self._startgath = 1 self._curlist = 'database' elif ldata.find('imdb users recommend') != -1: self._startgath = 1 self._curlist = 'users' elif self._firsttd and self._curlist: self._curtitle += data class HTMLNewsParser(ParserBase): """Parser for the "news" page of a given movie or person. The page should be provided as a string, as taken from the akas.imdb.com server. The final result will be a dictionary, with a key for every relevant section. Example: nwparser = HTMLNewsParser() result = nwparser.parse(news_html_string) """ _defGetRefs = True def _reset(self): """Reset the parser.""" self._cur_news = {} self._news = [] self._cur_stage = 'title' self._cur_text = u'' self._cur_link = u'' self._in_font = 0 def get_data(self): """Return the dictionary.""" if not self._news: return {} return {'news': self._news} def start_p(self, attrs): if self._in_font: return if self._in_content: self._cur_stage = 'title' def end_p(self): if not self._in_content: return if self._in_font: return self.do_br([]) if self._cur_news: self._news.append(self._cur_news) self._cur_news = {} self._cur_stage = 'title' self._cur_text = u'' def start_font(self, attrs): # An hack to prevent IMDbPro sign-up for a two-week free trial # to appear in the title of the first news. self._in_font = 1 def end_font(self): self._in_font = 0 self._cur_text = u'' def do_br(self, attrs): if not self._in_content: return if self._in_font: return self._cur_text = self._cur_text.strip() if self._cur_text: if self._cur_stage == 'body': if self._cur_news.has_key('body'): bodykey = self._cur_news['body'] if bodykey and not bodykey[0].isspace(): self._cur_news['body'] += ' ' self._cur_news['body'] += self._cur_text else: self._cur_news['body'] = self._cur_text else: self._cur_news[self._cur_stage] = self._cur_text self._cur_text = u'' if self._cur_stage == 'title': self._cur_stage = 'date' elif self._cur_stage == 'date': self._cur_stage = 'body' def start_a(self, attrs): if self._in_font: return if self._in_content and self._cur_stage == 'date': href = self.get_attr_value(attrs, 'href') if href: if not href.startswith('http://'): if href[0] == '/': href = href[1:] href = '%s%s' % (imdbURL_base, href) self._cur_news['link'] = href def _handle_data(self, data): if self._in_font: return if self._in_content and not self._in_font: self._cur_text += data class HTMLAmazonReviewsParser(ParserBase): """Parser for the "amazon reviews" page of a given movie. The page should be provided as a string, as taken from the akas.imdb.com server. The final result will be a dictionary, with a key for every relevant section. Example: arparser = HTMLAmazonReviewsParser() result = arparser.parse(amazonreviews_html_string) """ def _reset(self): """Reset the parser.""" self._in_amazonrev = 0 self._inh3 = 0 self._inreview = 0 self._in_kind = 0 self._reviews = [] self._cur_title = u'' self._cur_text = u'' self._cur_link = u'' self._cur_revkind = u'' def get_data(self): """Return the dictionary.""" if not self._reviews: return {} return {'amazon reviews': self._reviews} def start_table(self, attrs): self._in_amazonrev = 1 def end_table(self): if self._inreview: self._add_info() self._cur_title = u'' self._cur_link = u'' self._in_amazonrev = 0 self._inreview = 0 def start_div(self, attrs): theid = self.get_attr_value(attrs, 'id') if theid and theid.find('content') != -1: self._in_amazonrev = 1 def end_div(self): if self._in_amazonrev: self._in_amazonrev = 0 def start_h3(self, attrs): self._inh3 = 1 self._cur_link = u'' self._cur_title = u'' def end_h3(self): self._inh3 = 0 def start_a(self, attrs): if self._inh3: href = self.get_attr_value(attrs, 'href') if href: if not href.startswith('http://'): if href[0] == '/': href = href[1:] href = '%s%s' % (imdbURL_base, href) self._cur_link = href.strip() def end_a(self): pass def start_b(self, attrs): if self._inreview: self._in_kind = 1 def end_b(self): self._in_kind = 0 def start_p(self, attrs): if self._inreview: self._add_info() def end_p(self): self._inreview = 0 self._cur_title = u'' self._cur_link = u'' def _add_info(self): self._cur_title = self._cur_title.replace('\n', ' ').strip() self._cur_text = self._cur_text.replace('\n', ' ').strip() self._cur_link = self._cur_link.strip() self._cur_revkind = self._cur_revkind.replace('\n', ' ').strip() entry = {} if not self._cur_text: return ai = self._cur_text.rfind(' --', -30) author = u'' if ai != -1: author = self._cur_text[ai+3:] self._cur_text = self._cur_text[:ai-1] if self._cur_title and self._cur_title[-1] == ':': self._cur_title = self._cur_title[:-1] if self._cur_revkind and self._cur_revkind[-1] == ':': self._cur_revkind = self._cur_revkind[:-1] if self._cur_title: entry['title'] = self._cur_title if self._cur_text: entry['review'] = self._cur_text if self._cur_link: entry['link'] = self._cur_link if self._cur_revkind: entry['review kind'] = self._cur_revkind if author: entry['review author'] = author if entry: self._reviews.append(entry) self._cur_text = u'' self._cur_revkind = u'' def _handle_data(self, data): if self._inreview: if self._in_kind: self._cur_revkind += data else: self._cur_text += data elif self._in_content and self._inh3: self._inreview = 1 self._cur_title += data class HTMLSalesParser(ParserBase): """Parser for the "merchandising links" page of a given movie. The page should be provided as a string, as taken from the akas.imdb.com server. The final result will be a dictionary, with a key for every relevant section. Example: sparser = HTMLSalesParser() result = sparser.parse(sales_html_string) """ # XXX: crap! This parser must be rewritten from scratch. def _reset(self): self._sales = {} self._cur_type = u'' self._cur_info = {} self._in_h5 = 0 self._in_dt = 0 self._get_img = 0 self._get_link = 0 self._cur_descr = u'' self._get_descr = 0 self._in_a = 0 self._in_dd = 0 self._cur_link_text = u'' self._in_table = 0 self._seen_br = 0 self._in_layer = 0 def get_data(self): if not self._sales: return {} return {'merchandising links': self._sales} def _add_entry(self): self._cur_type = self._cur_type.strip() ln = self._cur_info.get('link') descr = self._cur_descr.strip() if self._cur_type and ln and descr and ln[0] != '#': self._cur_info['description'] = descr.replace('\n', '::') self._sales.setdefault(self._cur_type, []).append(self._cur_info) self._cur_info = {} self._cur_descr = u'' def start_h5(self, attrs): self._in_h5 = 1 if not self._in_table: self._get_link = 1 self._add_entry() self._get_descr = 1 self._cur_type = u'' self._seen_br = 0 def end_h5(self): if self._in_h5: self._in_h5 = 0 def start_table(self, attrs): self._in_table = 1 def end_table(self): self._in_table = 0 def start_td(self, attrs): cls = self.get_attr_value(attrs, 'class') if cls: clsl = cls.lower() if clsl == 'w_rowtable_colcover': self._get_img = 1 elif clsl in ('w_rowtable_colshop', 'w_rowtable_coldetails'): self._get_descr = 1 self._cur_descr = u'' def end_td(self): self._get_descr = 0 def start_layer(self, attrs): self._in_layer = 1 def end_layer(self): self._in_layer = 0 def do_hr(self, attrs): self._in_layer = 0 def do_img(self, attrs): if self._get_descr and (self._cur_type and self._in_table == 0 and not self._in_a) and self._cur_descr.strip(): self._add_entry() self._cur_descr = u'' self._get_link = 1 return if self._get_img: self._get_img = 0 src = self.get_attr_value(attrs, 'src') if src: self._cur_info['cover'] = src if self._get_descr: alttxt = self.get_attr_value(attrs, 'alt') if alttxt: self._cur_link_text = alttxt def start_tr(self, attrs): pass def end_tr(self): self._cur_descr = self._cur_descr.strip() if self._cur_descr: self._cur_info['description'] = self._cur_descr self._cur_descr = u'' self._cur_link_text = self._cur_link_text.strip() if self._cur_link_text: self._cur_info['link-text'] = self._cur_link_text self._cur_link_text = u'' ln = self._cur_info.get('link', u'') if ln[0:1] == '#': if self._cur_info.has_key('description'): del self._cur_info['description'] if self._cur_info and ln[0:1] != '#': self._sales.setdefault(self._cur_type, []).append(self._cur_info) self._cur_info = {} def start_dt(self, attrs): self._in_dt = 1 self._cur_type = u'' def end_dt(self): if self._in_dt: self._in_dt = 0 def start_dd(self, attrs): self._in_dd = 1 self._get_link = 1 def end_dd(self): self._in_dd = 0 if self._cur_info.has_key('cover'): del self._cur_info['cover'] self.end_tr() def start_a(self, attrs): self._in_a = 1 href = self.get_attr_value(attrs, 'href') if href: if self._get_img or self._get_link: if href[0] == '/': href = href[1:] href = '%s%s' % (imdbURL_base, href) self._cur_info['link'] = href self._get_link = 0 def end_a(self): self._in_a = 0 def _handle_data(self, data): if not self._in_layer: return if self._in_h5 or self._in_dt: self._cur_type += data.lower() elif self._get_descr or (self._cur_type and self._in_table == 0 and not self._in_a): self._cur_descr += data class HTMLEpisodesParser(ParserBase): """Parser for the "episode list" and "episodes cast" pages of a given movie. The page should be provided as a string, as taken from the akas.imdb.com server. The final result will be a dictionary, with a key for every relevant section. Example: eparser = HTMLEpisodesParser() result = eparser.parse(episodes_html_string) """ def _reset(self): self._in_html_title = 0 self._series = None self._series_id = None self._html_title = u'' self._episodes = {} self._in_h1 = 0 self._in_h3 = 0 self._in_h4 = 0 self._cur_season = 0 self._eps_counter = 1 self._in_episodes = 1 self._cur_year = u'????' self._cur_id = u'' self._in_eps_title = 1 self._eps_number = u'' self._eps_title = u'' self._next_is_oad = 0 self._in_oad = 0 self._oad = u'' self._in_plot = 0 self._never_again_in_plot = 0 self._plot = u'' self._in_cast = 0 self._cast = [] self._cur_person = u'' self._cur_person_id = None def get_data(self): if self._episodes: return {'episodes': self._episodes} else: return {} def start_title(self, attrs): self._in_html_title = 1 def end_title(self): self._in_html_title = 0 title = self._html_title.replace('- Episode list', u'').strip() title = title.replace('- Episodes cast', u'').strip() if title: self._series = Movie(title=title, accessSystem=self._as, modFunct=self._modFunct) self._html_title = u'' def start_h1(self, attrs): self._in_h1 = 1 def end_h1(self): self._in_h1 = 0 def start_h3(self, attrs): self._in_h3 = 1 def end_h3(self): self._in_h3 = 0 def start_h4(self, attrs): self._in_h4 = 1 def end_h4(self): if self._in_h4: self._in_h4 = 0 self._next_is_oad = 1 def _add_episode(self): self._eps_title = self._eps_title.strip() if not (self._eps_title and self._cur_id): return epnidx = self._eps_number.find('Episode ') if epnidx != -1: self._eps_number = self._eps_number[epnidx+8:] self._eps_number = self._eps_number.strip().strip(':').strip() try: self._eps_number = int(self._eps_number) except: pass else: self._eps_number = max(self._episodes.get(self._cur_season, {-1: None}).keys()) + 1 eps = Movie(movieID=self._cur_id, title=self._eps_title, accessSystem=self._as, modFunct=self._modFunct) eps['year'] = self._cur_year eps['kind'] = u'episode' eps['season'] = self._cur_season eps['episode'] = self._eps_number eps['episode of'] = self._series self._oad = self._oad.strip() if self._oad.lower().startswith('original air date:'): self._oad = self._oad[18:].lstrip() if self._oad and self._oad != '????': eps['original air date'] = self._oad self._plot = self._plot.strip() if self._plot: eps['plot'] = self._plot if not self._episodes.has_key(self._cur_season): self._episodes[self._cur_season] = {} if self._cast: eps['cast'] = self._cast self._cast = [] self._episodes[self._cur_season][self._eps_number] = eps self._eps_title = u'' self._eps_number = u'' self._cur_id = u'' self._cur_year = u'????' self._oad = u'' self._in_plot = 0 self._never_again_in_plot = 0 self._plot = u'' def start_a(self, attrs): if self._in_h1: href = self.get_attr_value(attrs, 'href') if href and href.startswith('/title/tt'): curid = self.re_imdbID.findall(href) if curid: self._series_id = curid[0] if self._series: self._series.movieID = str(self._series_id) elif self._in_h3: self._add_episode() name = self.get_attr_value(attrs, 'name') if name and name.lower().startswith('season-'): cs = name[7:] try: cs = int(cs) except: pass self._cur_season = cs self._eps_counter = 1 self._in_episodes = 1 elif self._in_episodes: name = self.get_attr_value(attrs, 'name') if name and name.lower().startswith('year-'): self._add_episode() year = name[5:] if year == 'unknown': year = u'????' self._cur_year = year href = self.get_attr_value(attrs, 'href') if href and href.lower().startswith('/title/tt'): curid = self.re_imdbID.findall(href) if curid: self._cur_id = curid[0] self._in_eps_title = 1 elif href and href.lower().startswith('/name/nm'): curid = self.re_imdbID.findall(href) if curid: self._cur_person_id = curid[0] def end_a(self): if self._in_eps_title: self._in_eps_title = 0 def start_b(self, attrs): if self._next_is_oad: self._in_oad = 1 def end_b(self): if self._in_oad: self._next_is_oad = 0 self._in_oad = 0 def start_div(self, attrs): if self._in_episodes: self._add_episode() self._in_episodes = 0 def end_div(self): pass def do_br(self, attrs): if not self._in_episodes: return if not self._never_again_in_plot: self._in_plot = 1 self._never_again_in_plot = 1 def start_table(self, attrs): if not self._in_episodes: return if self._in_plot: self._in_plot = 0 def end_table(self): pass def start_tr(self, attrs): self._in_cast = 1 def end_tr(self): if not self._in_cast: return self._in_cast = 0 name = self._cur_person.strip() if name and self._cur_person_id: note = u'' bni = name.find('(') if bni != -1: eni = name.rfind(')') if eni != -1: note = name[bni:] name = name[:bni].strip() sn = name.split(' ... ') name = sn[0] role = ' '.join(sn[1:]).strip() p = Person(name=name, personID=str(self._cur_person_id), currentRole=role, accessSystem=self._as, notes=note, billingPos=len(self._cast)+1, modFunct=self._modFunct) self._cast.append(p) self._cur_person = u'' self._cur_person_id = None def _handle_data(self, data): if self._in_h1: sldata = data.strip().lower() if sldata .startswith('episodes'): self._in_episodes_h1 = 1 elif self._in_html_title: self._html_title += data elif self._in_eps_title: self._eps_title += data elif self._in_h4: self._eps_number += data elif self._in_oad: self._oad += data elif self._in_plot: self._plot += data elif self._in_cast: self._cur_person += data class HTMLFaqsParser(ParserBase): """Parser for the "FAQ" page of a given movie. The page should be provided as a string, as taken from the akas.imdb.com server. The final result will be a dictionary, with a key for every relevant section. Example: fparser = HTMLFaqsParser() result = fparser.parse(faqs_html_string) """ _defGetRefs = True def _reset(self): self._faqs = [] self._in_wiki_cont = 0 self._in_question = 0 self._in_answer = 0 self._question = u'' self._answer = u'' self._in_spoiler = 0 self._in_pre = 0 def get_data(self): if not self._faqs: return {} return {'faqs': self._faqs} def start_pre(self, attrs): self._in_pre = 1 def end_pre(self): self._in_pre = 0 def start_ul(self, attrs): self._in_wiki_cont = 0 self._question = u'' self._answer = u'' self._in_question = 0 self._in_answer = 0 def end_ul(self): pass def start_div(self, attrs): cls = self.get_attr_value(attrs, 'class') if cls and cls.strip().lower() == 'section': self._in_wiki_cont = 1 self._in_question = 1 self._in_answer = 0 def end_div(self): if not self._in_wiki_cont: return self._question = self._question.strip() self._answer = self._answer.strip() if self._question and self._answer: self._faqs.append('%s::%s' % (self._question, self._answer)) self._in_wiki_cont = 0 self._in_question = 0 self._in_answer = 0 self._question = u'' self._answer = u'' def start_h3(self, attrs): if not self._in_wiki_cont: return self._in_question = 1 def end_h3(self): if not self._in_wiki_cont: return self._in_question = 0 self._in_answer = 1 def do_br(self, attrs): if self._in_answer and self._answer: self._answer += '\n' def start_span(self, attrs): if not self._in_wiki_cont: return cls = self.get_attr_value(attrs, 'class') if cls and cls.strip().lower(): self._in_spoiler = 1 else: return if self._in_answer: self._answer += '[spoiler]' elif self._in_question: self._question += '[spoiler]' def end_span(self): if not self._in_spoiler: return self._in_spoiler = 0 if self._in_answer: self._answer += '[/spoiler]' elif self._in_question: self._question += '[/spoiler]' def _handle_data(self, data): if not self._in_wiki_cont: return if self._in_answer: if not self._in_pre: data = data.replace('\n', ' ') self._answer += data elif self._in_question: self._question += data class HTMLAiringParser(ParserBase): """Parser for the "airing" page of a given movie. The page should be provided as a string, as taken from the akas.imdb.com server. The final result will be a dictionary, with a key for every relevant section. Example: aparser = HTMLAiringParser() result = aparser.parse(airing_html_string) """ def _reset(self): self._air = [] self._in_air_info = 0 self._in_ch = 0 self._cur_info = 'date' self._cur_data = {} self._cur_txt = u'' self._in_html_title = 0 self._title = u'' self._title_kind = u'' self._cur_id = u'' def get_data(self): if not self._air: return {} return {'airing': self._air} def start_title(self, attrs): self._in_html_title = 1 def end_title(self): self._in_html_title = 0 self._title = self._title.strip() self._title_kind = analyze_title(self._title, canonical=1)['kind'] def start_a(self, attrs): href = self.get_attr_value(attrs, 'href') if not href: return ids = self.re_imdbID.findall(href) if ids: self._cur_id = ids[-1] def end_a(self): pass def start_b(self, attrs): cls = self.get_attr_value(attrs, 'class') if cls and cls.strip().lower() == 'ch': self._in_ch = 1 def end_b(self): pass def start_table(self, attrs): pass def end_table(self): self._in_air_info = 0 def start_td(self, attrs): if not self._in_air_info: return self._cur_txt = u'' def end_td(self): if not self._in_air_info: return self._cur_txt = self._cur_txt.strip() if self._cur_txt and self._cur_info != 'episode': self._cur_data[self._cur_info] = self._cur_txt self._cur_txt = u'' if self._cur_info == 'date': self._cur_info = 'time' elif self._cur_info == 'time': self._cur_info = 'channel' elif self._cur_info == 'channel': self._cur_info = 'episode' elif self._cur_info == 'episode': if self._title_kind == 'episode': self._cur_info = 'date' else: self._cur_info = 'season' if self._cur_txt and self._title: m = Movie(title='%s {%s}' % (self._title, self._cur_txt), movieID=str(self._cur_id), accessSystem=self._as, modFunct=self._modFunct) self._cur_data['episode'] = m elif self._cur_info == 'season': self._cur_info = 'episode' def start_tr(self, attrs): if not self._in_air_info: return self._cur_txt = u'' self._cur_data = {} self._cur_info = 'date' def end_tr(self): if not self._in_air_info: return if self._cur_data: if 'episode' in self._cur_data: self._air.append(self._cur_data) self._cur_data = {} def _handle_data(self, data): if self._in_ch and data.lower().startswith('next us tv airing'): self._in_air_info = 1 if self._in_html_title: self._title += data if not self._in_air_info: return self._cur_txt += data class HTMLSynopsisParser(ParserBase): """Parser for the "synopsis" page of a given movie. The page should be provided as a string, as taken from the akas.imdb.com server. The final result will be a dictionary, with a key for every relevant section. Example: sparser = HTMLSynopsisParser() result = sparser.parse(synopsis_html_string) """ def _reset(self): self._synops = u'' self._in_synops = False def get_data(self): self._synops = self._synops.strip() if not self._synops: return {} return {'synopsis': self._synops} def start_div(self, attrs): cls = self.get_attr_value(attrs, 'class') if not cls: return style = self.get_attr_value(attrs, 'style') if style: return if cls.strip().lower() != 'display': return # Here we are: a div section with 'class' set to "display" and # no 'style' attribute. self._in_synops = True def end_div(self): if self._in_synops: self._in_synops = False def do_br(self, attrs): if not self._in_synops: return self._synops += '\n' def _handle_data(self, data): if not self._in_synops: return self._synops += data class HTMLParentsGuideParser(ParserBase): """Parser for the "parents guide" page of a given movie. The page should be provided as a string, as taken from the akas.imdb.com server. The final result will be a dictionary, with a key for every relevant section. Example: pgparser = HTMLParentsGuideParser() result = pgparser.parse(parentsguide_html_string) """ def _reset(self): self._pg = {} self._in_section = False self._in_h3 = False self._in_display = False self._cur_sect = u'' self._cur_txt = u'' self._seen_br = False def get_data(self): if not self._pg: return {} return {'parents guide': self._pg} def start_div(self, attrs): cls = self.get_attr_value(attrs, 'class') if not cls: return cls = cls.strip().lower() if cls == 'section': self._in_section = True self._in_display = False self._cur_sect = u'' return if cls == 'display': style = self.get_attr_value(attrs, 'style') if not style: self._in_display = True def end_div(self): if self._in_section: self._in_section = False self._cur_sect = self._cur_sect.strip().lower() return if self._in_display: self._in_display = False def start_h3(self, attrs): self._in_h3 = True def end_h3(self): self._in_h3 = False def start_p(self, attrs): pass def end_p(self): if not self._in_display: return self._add_pg() def _add_pg(self): self._cur_txt = self._cur_txt.strip() if not (self._cur_txt and self._cur_sect): self._cur_txt = u'' return self._pg.setdefault(self._cur_sect, []).append(self._cur_txt) self._cur_txt = u'' def do_br(self, attrs): if not self._in_display: return if self._seen_br: self._seen_br = False self._add_pg() if not self._seen_br: self._cur_txt += '\n' self._seen_br = True def _handle_data(self, data): if self._in_section and self._in_h3: self._cur_sect += data elif self._in_display: if self._seen_br and data.strip(): self._seen_br = False data = data.replace('\n', ' ') self._cur_txt += data _OBJECTS = { 'movie_parser': (HTMLMovieParser, None), 'plot_parser': (HTMLPlotParser, None), 'movie_awards_parser': (HTMLAwardsParser, None), 'taglines_parser': (HTMLTaglinesParser, None), 'keywords_parser': (HTMLKeywordsParser, None), 'crazycredits_parser': (HTMLCrazyCreditsParser, None), 'goofs_parser': (HTMLGoofsParser, None), 'alternateversions_parser': (HTMLAlternateVersionsParser, None), 'trivia_parser': (HTMLAlternateVersionsParser, {'kind': 'trivia'}), 'soundtrack_parser': (HTMLAlternateVersionsParser, {'kind': 'soundtrack'}), 'quotes_parser': (HTMLQuotesParser, None), 'releasedates_parser': (HTMLReleaseinfoParser, None), 'ratings_parser': (HTMLRatingsParser, None), 'officialsites_parser': (HTMLOfficialsitesParser, None), 'externalrev_parser': (HTMLOfficialsitesParser, {'kind': 'external reviews'}), 'newsgrouprev_parser': (HTMLOfficialsitesParser, {'kind': 'newsgroup reviews'}), 'misclinks_parser': (HTMLOfficialsitesParser, {'kind': 'misc links'}), 'soundclips_parser': (HTMLOfficialsitesParser, {'kind': 'sound clips'}), 'videoclips_parser': (HTMLOfficialsitesParser, {'kind': 'video clips'}), 'photosites_parser': (HTMLOfficialsitesParser, {'kind': 'photo sites'}), 'connections_parser': (HTMLConnectionParser, None), 'tech_parser': (HTMLTechParser, None), 'business_parser': (HTMLTechParser, {'kind': 'business', '_defGetRefs': 1}), 'literature_parser': (HTMLTechParser, {'kind': 'literature'}), 'locations_parser': (HTMLLocationsParser, None), 'dvd_parser': (HTMLDvdParser, None), 'rec_parser': (HTMLRecParser, None), 'news_parser': (HTMLNewsParser, None), 'amazonrev_parser': (HTMLAmazonReviewsParser, None), 'sales_parser': (HTMLSalesParser, None), 'episodes_parser': (HTMLEpisodesParser, None), 'eprating_parser': (HTMLEpisodesRatings, None), 'movie_faqs_parser': (HTMLFaqsParser, None), 'airing_parser': (HTMLAiringParser, None), 'synopsis_parser': (HTMLSynopsisParser, None), 'parentsguide_parser': (HTMLParentsGuideParser, None) }