""" parser.http.characterParser module (imdb package). This module provides the classes (and the instances), used to parse the IMDb pages on the akas.imdb.com server about a character. E.g., for "Jesse James" the referred pages would be: main details: http://www.imdb.com/character/ch0000001/ biography: http://www.imdb.com/character/ch0000001/bio ...and so on... Copyright 2007 Davide Alberani This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation; either version 2 of the License, or (at your option) any later version. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with this program; if not, write to the Free Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA """ from utils import ParserBase from imdb.Movie import Movie class HTMLCharacterBioParser(ParserBase): """Parser for the "biography" page of a given character. The page should be provided as a string, as taken from the akas.imdb.com server. The final result will be a dictionary, with a key for every relevant section. Example: bparser = HTMLCharacterBioParser() result = bparser.parse(character_biography_html_string) """ _defGetRefs = True def _reset(self): self._bios = [] self._intro = u'' self._in_wiki_cont = False self._in_h4 = False self._cur_bio = u'' self._title = u'' self._in_table = False def get_data(self): res = {} if self._bios: res['biography'] = self._bios if self._intro: res['introduction'] = self._intro if not res: return {} return res def start_div(self, attrs): if not self._in_content: return cls = self.get_attr_value(attrs, 'class') if not cls: return clss = cls.strip().lower() if clss == 'display': style = self.get_attr_value(attrs, 'style') if not style: self._in_wiki_cont = True else: self._in_wiki_cont = False self._add_item() def start_h4(self, attrs): if not self._in_wiki_cont: return self._in_h4 = True self._add_item() def end_h4(self): if not self._in_wiki_cont: return self._in_h4 = False def _add_item(self): self._title = self._title.strip() self._cur_bio = self._cur_bio.strip() if not (self._title and self._cur_bio): if self._cur_bio and not self._title: self._intro = self._cur_bio self._title = u'' self._cur_bio = u'' return self._bios.append('%s::%s' % (self._title, self._cur_bio)) self._title = u'' self._cur_bio = u'' def start_table(self, attrs): if not self._in_wiki_cont: return self._in_table = True def end_table(self): if not self._in_wiki_cont: return self._in_table = False def do_br(self, attrs): if not self._in_wiki_cont: return if self._in_table: return self._cur_bio += '\n' def _handle_data(self, data): if not self._in_wiki_cont: return if self._in_h4: self._title += data elif not self._in_table: self._cur_bio += data.replace('\n', ' ') class HTMLCharacterQuotesParser(ParserBase): """Parser for the "quotes" page of a given character. The page should be provided as a string, as taken from the akas.imdb.com server. The final result will be a dictionary, with a key for every relevant section. Example: qparser = HTMLCharacterQuotesParser() result = qparser.parse(character_quotes_html_string) """ _defGetRefs = True def _reset(self): """Reset the parser.""" self._tot_quotes = {} self._quotes = [u''] self._cur_title = u'' self._cur_titleID = None self._in_quote = False self._in_title = False self._seen_br = False def get_data(self): """Return the dictionary.""" if not self._tot_quotes: return {} return {'quotes': self._tot_quotes} def start_a(self, attrs): if not self._in_quote: return if not self._in_title: return href = self.get_attr_value(attrs, 'href') if not href: return curid = self.re_imdbID.findall(href) if curid: self._cur_titleID = curid[-1] def end_a(self): pass def start_h5(self, attrs): if not self._in_content: return self._add_items() self._in_title = True self._cur_title = u'' self._cur_titleID = None self._in_quote = True def end_h5(self): self._in_title = False def _add_items(self): self._quotes = [x.replace(':: ', '::').replace(' ::', '::').rstrip(':') for x in self._quotes] self._quotes = [x.replace(' ', ' ').replace(' ', ' ').strip() for x in self._quotes] self._quotes = filter(None, self._quotes) if not (self._cur_title and self._cur_titleID and self._quotes): self._quotes = [u''] return movie = Movie(title=self._cur_title, movieID=self._cur_titleID, accessSystem=self._as, modFunct=self._modFunct) self._tot_quotes[movie] = self._quotes[:] self._quotes = [u''] def do_br(self, attrs): if not self._in_quote: return clear = self.get_attr_value(attrs, 'clear') if clear and clear.lower() == 'both': self._add_items() self._in_quote = False if self._seen_br: self._quotes.append(u'') self._seen_br = False else: self._quotes[-1] = self._quotes[-1].rstrip() if not self._quotes[-1].endswith('::'): self._quotes[-1] += '::' self._seen_br = True def _handle_data(self, data): if not self._in_quote: return self._seen_br = False if self._in_title: self._cur_title += data else: self._quotes[-1] += data from personParser import HTMLMaindetailsParser from personParser import HTMLSeriesParser _OBJECTS = { 'character_main_parser': (HTMLMaindetailsParser, {'kind': 'character'}), 'character_series_parser': (HTMLSeriesParser, None), 'character_bio_parser': (HTMLCharacterBioParser, None), 'character_quotes_parser': (HTMLCharacterQuotesParser, None) }