"""
parser.http.personParser module (imdb package).

This module provides the classes (and the instances), used to parse
the IMDb pages on the akas.imdb.com server about a person.
E.g., for "Mel Gibson" the referred pages would be:
    categorized:    http://akas.imdb.com/name/nm0000154/maindetails
    biography:      http://akas.imdb.com/name/nm0000154/bio
    ...and so on...

Copyright 2004-2007 Davide Alberani <da@erlug.linux.it>

This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.

This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
GNU General Public License for more details.

You should have received a copy of the GNU General Public License
along with this program; if not, write to the Free Software
Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
"""

from imdb.Movie import Movie
from imdb.utils import analyze_name, canonicalName, \
                        normalizeName, analyze_title, date_and_notes
from utils import ParserBase, build_movie


class HTMLMaindetailsParser(ParserBase):
    """Parser for the "categorized" (maindetails) page of a given person.
    The page should be provided as a string, as taken from
    the akas.imdb.com server.  The final result will be a
    dictionary, with a key for every relevant section.

    Example:
        cparser = HTMLMaindetailsParser()
        result = cparser.parse(categorized_html_string)

    Note: attributes and helpers such as _in_content, _modFunct, _as,
    re_imdbID and get_attr_value are not defined in this class; they are
    presumably provided by ParserBase.
    """

    def _init(self):
        """Set the kind of parsed object; the 'character' kind is
        special-cased by some of the handlers below."""
        self.kind = 'person'

    def _reset(self):
        """Reset every piece of parsing state to its initial value."""
        self._data = {}
        self._in_title = False
        self._title = u''
        # Get section names.
        self._in_h5 = False
        self._section = u''
        # Sections are ended by br tags.
        self._in_post_section = False
        # Stop before the Additional Details section.
        self._stop_here = False
        self._in_tn15more = False
        # Most of the information is stored here.
        self._cur_txt = u''
        # Handle filmography.
        self._in_ol = False
        self._in_i = False
        self._in_movie = False
        # The movie title/role/notes is stored here.
        self._movie = u''
        # Stop before akas/episodes information.
        self._seen_br = False
        self._in_headshot = False
        self._cur_status = u''
        # Get a movieID.
        self._last_imdbID = None
        self._last_nameIDs = []
        self._get_imdbID = False
        self._cids = []
        self._seen_movie_sep = False

    def get_data(self):
        """Return the dictionary of collected information."""
        return self._data

    def start_title(self, attrs):
        """Begin collecting the text of the page title."""
        self._in_title = True

    def end_title(self):
        """Turn the collected page title into name information."""
        self._in_title = False
        self._title = self._title.strip()
        if self._title:
            if self.kind != 'character':
                self._data.update(analyze_name(self._title, canonical=1))
            else:
                # For characters, the name is the title without the
                # '(Character)' marker.
                self._title = self._title.replace('(Character)', '').strip()
                self._data['name'] = self._title

    def start_h5(self, attrs):
        """A h5 tag opens a new section: reset the per-section state."""
        if self._stop_here or not self._in_content: return
        self._seen_br = False
        self._in_h5 = True
        self._in_post_section = False
        self._section = u''
        self._cur_txt = u''
        self._in_movie = False

    def end_h5(self):
        """Normalize the section name collected inside the h5 tag."""
        if self._stop_here or not self._in_content: return
        self._in_h5 = False
        # NOTE(review): under Python 2, str() on a unicode heading holding
        # non-ASCII characters would raise UnicodeEncodeError - confirm
        # that headings are always ASCII.
        self._section = str(self._section.strip().lower())
        if self._section[-1:] == ':':
            self._section = self._section[:-1].rstrip()
        # XXX: I don't like this way at all: here we're excluding some
        #      useless section, but who knows how many other there can be?
        if self._section in (u'', 'genres', 'awards', 'publicity listings',
                            'other works', 'trivia'):
            return
        # Do some basic transformation.
        if self._section in ('sometimes credited as', 'alternate names'):
            self._section = 'akas'
        elif self._section == 'date of birth':
            self._section = 'birth date'
        elif self._section == 'date of death':
            self._section = 'death date'
        # From now on, text is accumulated as the content of this section.
        self._in_post_section = True

    def do_img(self, attrs):
        """Store the headshot URL and detect the image that marks the
        "Additional Details" section, where parsing stops."""
        if self._in_headshot:
            src = self.get_attr_value(attrs, 'src')
            if src:
                self._data['headshot'] = src
        if self._stop_here or not self._in_content: return
        if self.get_attr_value(attrs, 'alt') == 'Additional Details':
            self._stop_here = True

    def do_input(self, attrs):
        """Read the name from the hidden input field called 'primary'."""
        itype = self.get_attr_value(attrs, 'type')
        if itype is None or itype.lower() != 'hidden': return
        iname = self.get_attr_value(attrs, 'name')
        if iname is None or iname != 'primary': return
        ivalue = self.get_attr_value(attrs, 'value')
        if ivalue is None: return
        # It's hard to catch the correct 'Surname, Name' from the
        # title, so if the "credited alongside another name" form
        # is found, use it.
        self._data.update(analyze_name(ivalue, canonical=0))

    def start_div(self, attrs): pass

    def end_div(self):
        """The end of a div closes a section just like a br tag does."""
        self.do_br([])

    def do_br(self, attrs):
        """Close the current informational section, storing its text."""
        if self._stop_here or not self._in_content: return
        # Inside li tags in filmography, some useless information after a br.
        self._seen_br = True
        self._cur_txt = self._cur_txt.strip()
        if not (self._in_post_section and self._section and self._cur_txt):
            self._in_post_section = False
            self._cur_txt = u''
            return
        # We're at the end of a section.
        if self._section == 'birth date':
            date, notes = date_and_notes(self._cur_txt)
            if date:
                self._data['birth date'] = date
            if notes:
                self._data['birth notes'] = notes
        elif self._section == 'death date':
            date, notes = date_and_notes(self._cur_txt)
            if date:
                self._data['death date'] = date
            if notes:
                self._data['death notes'] = notes
        elif self._section == 'akas':
            akas = self._cur_txt.split(' / ')
            if akas: self._data['akas'] = akas
        # XXX: not providing an 'else', we're deliberately ignoring
        #      other sections.
        self._in_post_section = False
        if self.kind == 'character':
            # XXX: I'm not confident this is the best place for this...
            self._section = 'filmography'
        self._cur_txt = u''

    def start_a(self, attrs):
        """Collect the IDs found in links: characters, names and, while
        reading a filmography entry, the movie itself."""
        name = self.get_attr_value(attrs, 'name')
        if name and name.lower() == 'headshot':
            self._in_headshot = True
        if self._stop_here or not self._in_content: return
        cls = self.get_attr_value(attrs, 'class')
        # Detect "more" links.
        if cls and cls.startswith('tn15more'):
            self._in_tn15more = True
        href = self.get_attr_value(attrs, 'href')
        if href and href.find('/character/ch') != -1:
            imdbID = self.re_imdbID.findall(href)
            if imdbID:
                self._cids.append(imdbID[-1])
        elif href and href.find('/name/nm') != -1:
            imdbID = self.re_imdbID.findall(href)
            if imdbID:
                self._last_nameIDs.append(imdbID[-1])
        if not (self._in_movie and self._get_imdbID): return
        # A movie title.
        if href and href.find('/title/tt') != -1:
            imdbID = self.re_imdbID.findall(href)
            if imdbID:
                self._last_imdbID = imdbID[-1]
                # Only the first title link of an entry is the movie.
                self._get_imdbID = False

    def end_a(self):
        """Leave the headshot anchor and/or the "more" link."""
        if self._in_headshot:
            self._in_headshot = False
        if self._in_tn15more:
            self._in_tn15more = False

    def start_ol(self, attrs):
        """An ordered list opens a filmography list."""
        if self._stop_here or not self._in_content: return
        # We're not in a informational section, but in a filmography list.
        self._in_post_section = False
        self._in_ol = True
        self._cur_txt = u''

    def end_ol(self):
        """Leave the filmography list."""
        if self._stop_here or not self._in_content: return
        self._in_ol = False

    def start_li(self, attrs):
        """Begin a new filmography entry, clearing the per-entry state."""
        if self._stop_here or not self._in_content: return
        # We're reading a movie title/role/notes.
        if self._in_ol:
            self._get_imdbID = True
            self._in_movie = True
        self._last_imdbID = None
        self._last_nameIDs = []
        self._cids = []
        self._cur_status = u''
        self._movie = u''
        self._seen_movie_sep = False
        self._seen_br = False

    def _collapse_ids(self, ids):
        """Return None for an empty list of IDs, the lone item for a
        one-element list, otherwise a copy of the list."""
        if not ids:
            return None
        if len(ids) == 1:
            return ids[0]
        return ids[:]

    def end_li(self):
        """Build a movie out of the collected entry and append it to the
        list stored under the current section name."""
        if self._stop_here or not self._in_content: return
        self._get_imdbID = False
        if not self._in_movie: return
        self._movie = self._movie.strip()
        self._cur_status = self._cur_status.strip()
        if not (self._movie and self._last_imdbID and self._section): return
        # Work on a local value: self._cids must remain a list, because
        # start_a() and _handle_data() may still extend it before the
        # next start_li() resets it.
        roleID = self._collapse_ids(self._cids)
        # Add this movie to the list.
        kwds = {'movieID': self._last_imdbID, 'status': self._cur_status,
                'roleID': roleID, 'modFunct': self._modFunct,
                'accessSystem': self._as}
        if self.kind == 'character':
            # For a character page, the role IDs are the IDs of the
            # names linked in the entry.
            kwds['_parsingCharacter'] = True
            kwds['roleID'] = self._collapse_ids(self._last_nameIDs)
        movie = build_movie(self._movie, **kwds)
        self._data.setdefault(self._section, []).append(movie)

    def start_i(self, attrs):
        """Enter an i tag (text inside it is the status of a movie)."""
        self._in_i = True

    def end_i(self):
        """Leave an i tag."""
        self._in_i = False

    def _handle_data(self, data):
        """Dispatch a chunk of text to the title, the section name, the
        section content or the current filmography entry."""
        if self._in_title:
            self._title += data
        if self._stop_here or not self._in_content: return
        if self._in_h5:
            self._section += data
        elif self._in_post_section and not self._in_tn15more:
            self._cur_txt += data
        elif self._in_movie and not self._seen_br:
            if self._in_i:
                self._cur_status += data
            else:
                # XXX: keeps multiple characterIDs separated; quite a mess.
                if self._section in ('actor', 'actress', 'self'):
                    ldata = data
                    if not self._seen_movie_sep:
                        # Consider only ' / ' after the separator.
                        sepIdx = ldata.find(' ....')
                        if sepIdx != -1:
                            ldata = ldata[sepIdx+5:]
                    nrSep = ldata.count(' / ')
                    if nrSep > 0:
                        sdata = data.strip()
                        if sdata.endswith(' /') and sdata.startswith('/ '):
                            nrSep -= 1
                        # One None placeholder is added to the list of
                        # character IDs for every separator counted.
                        self._cids += [None]*nrSep
                if not self._seen_movie_sep and data.find(' ....') != -1:
                    self._seen_movie_sep = True
                self._movie += data


class HTMLBioParser(ParserBase):
    """Parser for the "biography" page of a given person.
    The page should be provided as a string, as taken from
    the akas.imdb.com server.  The final result will be a
    dictionary, with a key for every relevant section.

    Example:
        bioparser = HTMLBioParser()
        result = bioparser.parse(biography_html_string)

    Inside a section, items are accumulated in a single string,
    separated by '::'; cells of the salary table are separated
    by '@@@@'.
    """
    # XXX: this parser is really old; consider a rewrite.
    _defGetRefs = True

    # Section names (lowercase, without the trailing colon) that are
    # stored under a different key in the returned dictionary.
    _SECTION_ALIASES = {
        'salary': 'salary history',
        'nickname': 'nick names',
        'where are they now': 'where now',
        'personal quotes': 'quotes',
        'date of birth': 'birth date',
        'date of death': 'death date',
    }

    def _init(self):
        # This is the dictionary that will be returned by the parse() method.
        self._bio_data = {}

    def _reset(self):
        """Reset the parser."""
        # Bind a fresh dictionary instead of clearing the old one, so that
        # the object returned by a previous get_data() call is left intact.
        self._bio_data = {}
        self._sect_name = u''
        self._sect_data = u''
        self._in_sect = 0
        self._in_sect_name = 0

    def get_data(self):
        """Return the dictionary."""
        return self._bio_data

    def _end_content(self):
        """Store the pending section and rewrite every mini biography
        as 'author::text'.

        NOTE(review): presumably called by ParserBase when the content
        of the page ends - confirm.
        """
        self._add_items()
        if 'mini biography' in self._bio_data:
            nl = []
            for bio in self._bio_data['mini biography']:
                byidx = bio.rfind('IMDb Mini Biography By')
                if byidx != -1:
                    # Skip the marker and the character that follows it.
                    bio = u'%s::%s' % (bio[byidx+23:].lstrip(),
                                        bio[:byidx].rstrip())
                nl.append(bio)
            self._bio_data['mini biography'][:] = nl

    def do_br(self, attrs):
        """A br tag closes every section except the mini biography, where
        it only counts as a blank between words."""
        snl = self._sect_name.lower()
        if snl != 'mini biography':
            self._add_items()
        if snl == 'mini biography' and self._sect_data \
                and  not self._sect_data[-1].isspace():
            self._sect_data += ' '

    def start_a(self, attrs): pass

    def end_a(self): pass

    def start_h5(self, attrs):
        """A h5 tag holds the name of a new section; the pending section,
        if any, is stored first."""
        if self._in_content:
            self._add_items()
            self._in_sect = 1
            self._in_sect_name = 1
            self._sect_name = u''

    def end_h5(self):
        """The name of the section is complete."""
        if self._in_sect_name:
            self._in_sect_name = 0

    def do_hr(self, attrs):
        """A hr tag marks the end of the interesting content."""
        if self._in_content: self._in_content = 0

    def start_dd(self, attrs): pass

    def _add_items(self):
        """Store the section collected so far (if any) in the dictionary,
        then clear the per-section state."""
        # Add a new section in the biography.
        if self._in_content and self._sect_name and self._sect_data:
            sect = self._sect_name.strip().lower()
            # XXX: to get rid of the last colons and normalize section names.
            # The slice (instead of an index) is safe on an empty string.
            if sect[-1:] == ':':
                sect = sect[:-1].rstrip()
            sect = self._SECTION_ALIASES.get(sect, sect)
            d_split = [x.strip() for x in self._sect_data.strip().split('::')]
            d_split = [x for x in d_split if x]
            # A section without a name or without any item is skipped;
            # every branch of _store_section relies on d_split[0] existing.
            if sect and d_split:
                self._store_section(sect, d_split)
        self._sect_name = u''
        self._sect_data = u''
        self._in_sect = 0

    def _store_section(self, sect, d_split):
        """Store the non-empty list of items d_split under the key sect,
        after the transformation required by some special sections."""
        # Birth/death date are in both maindetails and bio pages;
        # it's safe to collect both of them.
        if sect == 'birth date':
            date, notes = date_and_notes(d_split[0])
            if date:
                self._bio_data['birth date'] = date
            if notes:
                self._bio_data['birth notes'] = notes
            return
        if sect == 'death date':
            date, notes = date_and_notes(d_split[0])
            if date:
                self._bio_data['death date'] = date
            if notes:
                self._bio_data['death notes'] = notes
            return
        # Do some transformation on some special cases.
        if sect == 'salary history':
            newdata = []
            for row in d_split:
                cells = [x.strip() for x in row.split('@@@@')]
                newdata.append('::'.join([x for x in cells if x]))
            d_split = newdata
        elif sect == 'nick names':
            d_split = [normalizeName(x) for x in d_split]
        elif sect == 'birth name':
            d_split = canonicalName(d_split[0])
        elif sect == 'height':
            d_split = d_split[0]
        elif sect == 'spouse':
            d_split = [x.replace(' (', '::(', 1).replace(' ::', '::')
                        for x in d_split]
        if d_split:
            # Multiple items are added separately (e.g.: 'trivia' is
            # a list of strings).
            self._bio_data[sect] = d_split

    def _mark_item_break(self):
        """Inside a section that already has some data, drop the trailing
        blanks and append the '::' items separator."""
        if self._in_sect:
            if self._sect_data:
                if self._sect_data[-1].isspace():
                    self._sect_data = self._sect_data.rstrip()
                self._sect_data += '::'

    def start_p(self, attrs):
        """A paragraph begins a new item of the current section."""
        self._mark_item_break()

    def end_p(self): pass

    def start_tr(self, attrs):
        """A table row begins a new item of the current section."""
        self._mark_item_break()

    def end_tr(self): pass

    def start_td(self, attrs): pass

    def end_td(self):
        """In the salary table, separate the cells with '@@@@'."""
        if self._in_sect and \
                self._sect_name.strip().lower() in \
                ('salary', 'salary history'):
            if self._sect_data: self._sect_data += '@@@@'

    def _handle_data(self, data):
        """Add the text to the section name or to the section data."""
        if self._in_sect_name:
            self._sect_name += data
        elif self._in_sect:
            self._sect_data += data.replace('\n', ' ')


class HTMLOtherWorksParser(ParserBase):
    """Parser for the "other works" and "agent" pages of a given person.
    The page should be provided as a string, as taken from
    the akas.imdb.com server.  The final result will be a
    dictionary, with a key for every relevant section.

    Example:
        owparser = HTMLOtherWorksParser()
        result = owparser.parse(otherworks_html_string)
    """
    _defGetRefs = True

    def _init(self):
        # Key under which the collected items are returned; the 'agent'
        # kind is special-cased in end_b().
        self.kind = 'other works'

    def _reset(self):
        """Reset the parser."""
        # Completed items, and the item currently being collected.
        self._ow = []
        self._cow = u''
        # Set when the blanks at the start of the next text must be dropped.
        self._dostrip = 0
        # Tags already met in the page.
        self._seen_hr = 0
        self._seen_h5 = 0
        self._seen_left_div = 0

    def get_data(self):
        """Return the dictionary."""
        if self._ow:
            return {self.kind: self._ow}
        return {}

    def start_b(self, attrs): pass

    def end_b(self):
        """For the 'agent' kind, the end of a bold text inside a
        non-empty item is marked with '::'."""
        if self._seen_hr:
            return
        if self.kind != 'agent':
            return
        if self._in_content and self._cow:
            self._cow += '::'
            self._dostrip = 1

    def start_h5(self, attrs): pass

    def end_h5(self):
        """Text is collected only after the first h5 tag is closed."""
        self._seen_h5 = 1

    def start_div(self, attrs):
        """Detect the div of class 'left', after which text is ignored."""
        cls = self.get_attr_value(attrs, 'class')
        if not cls:
            return
        if cls.strip().lower() == 'left':
            self._seen_left_div = 1

    def end_div(self): pass

    def do_hr(self, attrs):
        """Everything after a hr tag is ignored."""
        self._seen_hr = 1

    def do_br(self, attrs):
        """A br tag closes the current item, which is stored if not empty."""
        if self._seen_hr:
            return
        self._cow = self._cow.strip()
        if not self._in_content:
            return
        if self._cow:
            self._ow.append(self._cow)
            self._cow = u''

    def _handle_data(self, data):
        """Add the text to the current item."""
        if not self._seen_h5:
            return
        if self._seen_hr or self._seen_left_div:
            return
        if not self._in_content:
            return
        if self._dostrip:
            data = data.lstrip()
            # Keep stripping until some real text is found.
            if data:
                self._dostrip = 0
        self._cow += data


class HTMLSeriesParser(ParserBase):
    """Parser for the "by TV series" page of a given person.
    The page should be provided as a string, as taken from
    the akas.imdb.com server.  The final result will be a
    dictionary, with a key for every relevant section.

    Example:
        sparser = HTMLSeriesParser()
        result = sparser.parse(filmoseries_html_string)
    """
    def _reset(self):
        """Reset the parser."""
        # Maps a series to the list of its episodes.
        self._episodes = {}
        self._seen_h1 = 0
        self._in_episodes = 0
        self._in_ol = 0
        self._in_li = 0
        # The series currently handled.
        self._in_series_title = 0
        self._series_id = None
        self._cur_series_title = u''
        self._cur_series = None
        # The episode currently handled.
        self._in_episode_title = 0
        self._episode_id = None
        self._cur_episode_title = u''
        # Text that follows the title of the episode.
        self._in_misc_info = 0
        self._misc_info = u''
        self._in_i = 0
        self._got_i_info = 0
        self._in_span = 0

    def get_data(self):
        """Return the dictionary."""
        if not self._episodes: return {}
        return {'episodes': self._episodes}

    def start_h1(self, attrs): self._seen_h1 = 1

    def end_h1(self): self._seen_h1 = 0

    def start_i(self, attrs): self._in_i = 1

    def end_i(self): self._in_i = 0

    def start_div(self, attrs):
        """The div of class 'filmo' contains the episodes."""
        if not self._in_content: return
        if self.get_attr_value(attrs, 'class') == 'filmo':
            self._in_episodes = 1

    def end_div(self):
        """Any closed div ends the list of episodes."""
        if self._in_episodes: self._in_episodes = 0

    def start_ol(self, attrs): self._in_ol = 1

    def end_ol(self):
        """The end of a list is the end of the current series."""
        self._in_ol = 0
        self._cur_series_title = u''
        self._cur_series = None
        self._series_id = None

    def start_li(self, attrs):
        """A list item begins a new episode."""
        self._in_li = 1
        self._got_i_info = 0

    def end_li(self):
        """Store the episode collected in the list item, if it has both
        a title and an ID, then clear the per-episode state."""
        self._in_li = 0
        if self._in_episodes:
            et = self._cur_episode_title.strip()
            minfo = self._misc_info.strip()
            if et and self._episode_id:
                e = self._build_episode(et, minfo)
                self._episodes.setdefault(self._cur_series, []).append(e)
            self._cur_episode_title = u''
            self._episode_id = None
        self._in_misc_info = 0
        self._misc_info = u''

    def _build_episode(self, et, minfo):
        """Return the Movie object for the episode titled et, completed
        with what is found in the minfo string."""
        eps_data = analyze_title(et, canonical=1)
        eps_data['kind'] = u'episode'
        e = Movie(movieID=str(self._episode_id), data=eps_data,
                    accessSystem=self._as, modFunct=self._modFunct)
        e['episode of'] = self._cur_series
        self._set_air_date(e, eps_data, minfo)
        self._set_role_and_notes(e, minfo)
        return e

    def _set_air_date(self, e, eps_data, minfo):
        """Read the '(date)' at the beginning of minfo: it becomes the
        original air date and, when the title gave no year, a source
        for the year."""
        if not minfo.startswith('('): return
        pe = minfo.find(')')
        if pe == -1: return
        date = minfo[1:pe]
        if date == '????': return
        e['original air date'] = date
        if eps_data.get('year', '????') == '????':
            # The list is empty for a blank date like '()': no year then.
            date_parts = date.split()
            if date_parts and date_parts[-1].isdigit():
                e['year'] = date_parts[-1]

    def _set_role_and_notes(self, e, minfo):
        """Read what follows ' - ' in minfo: a role with an optional
        note or, if italic text was met, only a note."""
        rolei = minfo.find(' - ')
        if rolei == -1: return
        if not self._got_i_info:
            role = minfo[rolei+3:].strip()
            notei = role.rfind('(')
            note = u''
            # A parenthesis closing the string is a note about the role.
            if notei != -1 and role and role[-1] == ')':
                note = role[notei:]
                role = role[:notei].strip()
            e.notes = note
            e.currentRole = role
        else:
            # The first word goes between square brackets, the other
            # words follow it.
            randn = minfo[rolei+3:].strip().split()
            note = '[%s]' % randn[0]
            note += ' '.join(randn[1:])
            e.notes = note

    def start_span(self, attrs):
        self._in_span = 1

    def end_span(self):
        self._in_span = 0

    def start_a(self, attrs):
        """Get the ID of an episode (link inside a list item) or of
        a series (link inside a span)."""
        if not self._in_episodes: return
        if self._in_ol:
            if not self._in_li: return
            href = self.get_attr_value(attrs, 'href')
            if not href: return
            # Links to characters are not episodes.
            if 'character/ch' in href: return
            mid = self.re_imdbID.findall(href)
            if not mid: return
            self._in_episode_title = 1
            self._episode_id = mid[0]
            return
        elif self._in_span:
            href = self.get_attr_value(attrs, 'href')
            if not href: return
            mid = self.re_imdbID.findall(href)
            if not mid: return
            self._in_series_title = 1
            self._series_id = mid[0]

    def end_a(self):
        """After the title of an episode, miscellaneous information is
        collected; after the title of a series, the series is built."""
        if self._in_episode_title:
            self._in_episode_title = 0
            self._in_misc_info = 1
        elif self._in_series_title:
            self._in_series_title = 0
            st = self._cur_series_title.strip()
            if st and self._series_id is not None:
                series_data = analyze_title(st, canonical=1)
                s = Movie(movieID=str(self._series_id), data=series_data,
                            accessSystem=self._as, modFunct=self._modFunct)
                self._cur_series = s

    def _handle_data(self, data):
        """Add the text to the episode title, to the series title or
        to the miscellaneous information."""
        if self._in_episode_title:
            self._cur_episode_title += data
        elif self._in_series_title:
            self._cur_series_title += data
        elif self._in_misc_info:
            # Handles roles like "director".
            if self._in_i:
                # Put these info in the "notes" property.
                data = data.lower()
                self._got_i_info = 1
            self._misc_info += data


class HTMLPersonGenresParser(ParserBase):
    """Parser for the "by genre" and "by keywords" pages of a given person.
    The page should be provided as a string, as taken from
    the akas.imdb.com server.  The final result will be a
    dictionary, with a key for every relevant section.

    Example:
        gparser = HTMLPersonGenresParser()
        result = gparser.parse(bygenre_html_string)
    """
    kind = 'genres'

    def _reset(self):
        """Reset the parser."""
        self._info = {}
        self._cur_key = ''
        self._in_table = False
        self._in_li = False
        self._forget_current_item()

    def _forget_current_item(self):
        """Clear the title and the IDs of the item being collected."""
        self._cur_title = u''
        self._cur_movieID = None
        self._cur_characterID = None

    def get_data(self):
        """Return the dictionary."""
        if self._info:
            return {self.kind: self._info}
        return {}

    def start_table(self, attrs):
        """Remember that a table inside the content was opened."""
        if self._in_content:
            self._in_table = True

    def end_table(self):
        self._in_table = False

    def start_a(self, attrs):
        """Inside a list item, get the ID of a movie or of a character;
        inside a table, a named anchor gives the current key."""
        if not self._in_content:
            return
        if self._in_li:
            href = self.get_attr_value(attrs, 'href')
            found = href and self.re_imdbID.findall(href)
            if found:
                if 'title/tt' in href:
                    self._cur_movieID = found[-1]
                elif 'character/ch' in href:
                    self._cur_characterID = found[-1]
                # A link with an ID is never considered as an anchor.
                return
        if self._in_table:
            anchor = self.get_attr_value(attrs, 'name')
            if anchor:
                self._cur_key = anchor

    def end_a(self): pass

    def start_li(self, attrs):
        """A list item is an entry, but only once a key is known."""
        if self._in_content and self._cur_key:
            self._in_li = True

    def end_li(self):
        """The end of the list item completes the entry."""
        self._in_li = False
        self._add_info()

    def do_br(self, attrs):
        """A br tag inside a list item also completes the entry."""
        if self._in_li:
            self._in_li = False
            self._add_info()

    def _add_info(self):
        """Build a movie out of the collected entry and append it to the
        list of the current key; the per-entry state is always cleared."""
        self._cur_key = self._cur_key.strip()
        self._cur_title = self._cur_title.strip()
        if self._cur_key and self._cur_title and self._cur_movieID:
            # Whatever starts at the first '[' is a note, not the title.
            notes = u''
            bracket = self._cur_title.find('[')
            if bracket != -1:
                notes = self._cur_title[bracket:].lstrip()
                self._cur_title = self._cur_title[:bracket].rstrip()
            movie = build_movie(self._cur_title, movieID=self._cur_movieID,
                                roleID=self._cur_characterID,
                                modFunct=self._modFunct,
                                accessSystem=self._as)
            movie.notes = notes
            # In the key, 'X2D' stands for a dash.
            key = self._cur_key.replace('X2D', '-')
            self._info.setdefault(key, []).append(movie)
        self._forget_current_item()

    def _handle_data(self, data):
        """Text inside a list item belongs to the title of the entry."""
        if self._in_li:
            self._cur_title += data


from movieParser import HTMLOfficialsitesParser
from movieParser import HTMLAwardsParser
from movieParser import HTMLTechParser
from movieParser import HTMLNewsParser
from movieParser import HTMLLocationsParser
from movieParser import HTMLSalesParser

# Every entry maps the name of a parser to a (class, keywords) tuple, where
# keywords is either None or a dictionary (e.g. a different 'kind').
# NOTE(review): presumably the keywords are used as arguments/attributes of
# the parser instance built by the code that reads this table - confirm.
_OBJECTS = {
    'maindetails_parser': (HTMLMaindetailsParser, None),
    'bio_parser': (HTMLBioParser, None),
    'otherworks_parser': (HTMLOtherWorksParser, None),
    'agent_parser': (HTMLOtherWorksParser, {'kind': 'agent'}),
    'person_officialsites_parser': (HTMLOfficialsitesParser, None),
    'person_awards_parser': (HTMLAwardsParser, {'subject': 'name'}),
    'publicity_parser': (HTMLTechParser, {'kind': 'publicity'}),
    'person_series_parser': (HTMLSeriesParser, None),
    'person_contacts_parser': (HTMLTechParser, {'kind': 'contacts'}),
    'person_genres_parser': (HTMLPersonGenresParser, None),
    'person_keywords_parser': (HTMLPersonGenresParser, {'kind': 'keywords'}),
    'news_parser': (HTMLNewsParser, None),
    'sales_parser': (HTMLSalesParser, None),
}