""" parser.http.searchMovieParser module (imdb package). This module provides the HTMLSearchMovieParser class (and the search_movie_parser instance), used to parse the results of a search for a given title. E.g., for when searching for the title "the passion", the parsed page would be: http://akas.imdb.com/find?q=the+passion&tt=on&mx=20 Copyright 2004-2007 Davide Alberani This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation; either version 2 of the License, or (at your option) any later version. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with this program; if not, write to the Free Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA """ from imdb.utils import analyze_title, analyze_name from utils import ParserBase from imdb.Movie import Movie class BasicMovieParser(ParserBase): """Simply get the title of a movie and the imdbID. It's used by the HTMLSearchMovieParser class to return a result for a direct match (when a search on IMDb results in a single movie, the web server sends directly the movie page.""" def _reset(self): """Reset the parser.""" self._result = {} self._movieID = None self._reading_page_title = 0 self._page_title = u'' self._inbch = 0 self._in_series_title = 0 self._in_series_info = 0 self.__seriesID = None self._series_title = u'' self._series_info = u'' def get_data(self): """Return a list with a single tuple ('movieID', {title_dict}) where movieID is the imdbID. """ if self._result and self._movieID: return [(self._movieID, self._result)] return self._result def start_title(self, attrs): self._reading_page_title = 1 def end_title(self): self._reading_page_title = 0 t = self._page_title.strip() if t.find('IMDb Title') != -1 and t.find('Search') != -1: return self._result = analyze_title(t, canonical=1) def start_input(self, attrs): # XXX: read the movieID from the "send this page to a friend" form. t = self.get_attr_value(attrs, 'type') if t and t.strip().lower() == 'hidden': n = self.get_attr_value(attrs, 'name') if n: n = n.strip().lower() if n in ('arg', 'auto'): val = self.get_attr_value(attrs, 'value') or u'' # XXX: use re_imdbIDonly because in the input field # the movieID is not preceded by 'tt'. if n == 'arg': nr = self.re_imdbIDonly.findall(val) else: nr = self.re_imdbID.findall(val) if not nr: return imdbID = str(nr[0]) self._movieID = imdbID def end_input(self): pass def start_b(self, attrs): cls = self.get_attr_value(attrs, 'class') if cls and cls.lower() == 'ch': self._inbch = 1 def end_b(self): if self._inbch: self._inbch = 0 def start_a(self, attrs): if self._in_series_title: href = self.get_attr_value(attrs, 'href') if not href: return ids = self.re_imdbID.findall(href) if ids: self.__seriesID = ids[-1] def end_a(self): pass def do_br(self, attrs): if self._in_series_title: self._in_series_title = 0 st = self._series_title.strip() if st and self.__seriesID: d_title = analyze_title(st, canonical=1) m = Movie(movieID=str(self.__seriesID), data=d_title, accessSystem=self._as, modFunct=self._modFunct) self._result['kind'] = u'episode' self._result['episode of'] = m self._series_title = u'' elif self._in_series_info: self._in_series_info = 0 si = ' '.join([x for x in self._series_info.split() if x]) if si: aid = self.re_airdate.findall(si) if aid and len(aid[0]) == 3: date, season, episode = aid[0] date = date.strip() try: season = int(season) except: pass try: episode = int(episode) except: pass if date and date != '????': self._result['original air date'] = date # Handle also "episode 0". if season or type(season) is type(0): self._result['season'] = season if episode or type(season) is type(0): self._result['episode'] = episode self._series_info = u'' def _handle_data(self, data): if self._reading_page_title: self._page_title += data elif self._in_series_title: self._series_title += data elif self._in_series_info: self._series_info += data elif self._inbch: sldata = data.strip().lower() if sldata.startswith('tv series:'): self._in_series_title = 1 elif sldata.startswith('original air date'): self._in_series_info = 1 def _dontChange(s, *args, **kwds): """Return the name (useful for characters objects).""" return {'name': s} class HTMLSearchMovieParser(ParserBase): """Parse the html page that the IMDb web server shows when the "new search system" is used, for both movies and persons.""" # Customizations for movie, person and character parsers. _k = { 'movie': {'analyze_f': analyze_title, 'link': '/title', 'in title': 'imdb title'}, 'person': {'analyze_f': analyze_name, 'link': '/name', 'in title': 'imdb name'}, 'character': {'analyze_f': _dontChange, 'link': '/character', 'in title': 'imdb search'} } def _init(self): """Initialize the parser.""" self.kind = 'movie' self._basic_parser = BasicMovieParser def _reset(self): """Reset the parser.""" self._results = [] self._is_title = False self._reading_page_title = False self._current_imdbID = u'' self._current_ton = u'' self._no_more = False self._stop = False self._in_table = False self._col_nr = 0 def parse(self, cont, results=None, **kwds): self.maxres = results return ParserBase.parse(self, cont) def get_data(self): """Return a list of ('imdbID', {title_dict/name_dict}) tuples.""" return self._results def start_title(self, attrs): self._reading_page_title = True def end_title(self): self._reading_page_title = False def start_table(self, attrs): self._in_table = True def end_table(self): self._in_table = False def start_tr(self, attrs): if not self._in_table: return self._col_nr = 0 self._no_more = False def end_tr(self): pass def start_td(self, attrs): if not self._in_table: return self._col_nr += 1 self._is_title = False self._current_imdbID = None def end_td(self): if self._in_table and self._is_title and self._current_imdbID and \ self._col_nr == 3: # We should have got the title/name. title = self._current_ton.strip() tup = (self._current_imdbID, self._k[self.kind]['analyze_f'](title, canonical=1)) self._results.append(tup) if self.maxres is not None and self.maxres <= len(self._results): self._stop = True self._current_ton = u'' self._current_imdbID = u'' self._is_title = False self._no_more = 0 def start_a(self, attrs): # Prevent tv series to get the (wrong) movieID from the # last episode, sometimes listed in the ... tag # along with the series' title. if self._current_imdbID: return if not self._in_table and self._col_nr == 3: return link = self.get_attr_value(attrs, 'href') # The next data is a movie title/person name; now store the imdbID. if link and link.lower().startswith(self._k[self.kind]['link']): nr = self.re_imdbID.findall(link[6:]) if not nr: return self._current_imdbID = str(nr[0]) self._is_title = True def end_a(self): pass def start_small(self, attrs): self._no_more = True def end_small(self): pass def do_br(self, attrs): self._no_more = True def _handle_data(self, data): if self._stop: res = self._results self.reset() self._results = res return if self._in_table and self._col_nr == 3 and not self._no_more: self._current_ton += data elif self._reading_page_title: dls = data.strip().lower().replace(' ', ' ') if not dls.startswith(self._k[self.kind]['in title']): # XXX: a direct result! # Interrupt parsing, and retrieve data using a # BasicMovieParser/BasicPersonParser object. rawdata = self.rawdata # XXX: it' would be much better to move this code to # the end_title() method, but it would raise an # exception... self.reset() # Get imdbID and title directly from the "main details" page. bmp = self._basic_parser() self._results = bmp.parse(rawdata)['data'] _OBJECTS = { 'search_movie_parser': (HTMLSearchMovieParser, None) }