#!/usr/bin/env python
"""
gtparse -- A gettext parsing module in Python
Copyright (C) 2007-2008  Ask Hjorth Larsen <asklarsen@gmail.com>

This program is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.

This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
GNU General Public License for more details.

You should have received a copy of the GNU General Public License
along with this program.  If not, see <http://www.gnu.org/licenses/>.
"""

import sys
import traceback
import codecs
import re
import xml.sax
from optparse import OptionParser


# Version string of this module; passed to OptionParser in build_parser().
version = '0.15'


class EntrySet:
    """A collection of Entry objects with filtering and statistics helpers.

    Attributes:
      entries   -- list of the (non-obsolete) entries
      obsoletes -- list of entries that consist of comments only
    """
    def __init__(self, entries, obsoletes=()):
        """Create an EntrySet from iterables of entries and obsoletes.

        Both arguments are copied into new lists, so the caller's
        containers are never shared with this object."""
        self.entries = list(entries)
        self.obsoletes = list(obsoletes)
        self.__stats__ = None # Cache for stats(); created on first use

    def get(self, propertyname):
        """Return the value of the attribute named 'propertyname' for
        every entry, as a list in entry order."""
        return [getattr(entry, propertyname) for entry in self.entries]

    def getfuzzy(self):
        """Return a new EntrySet holding the entries flagged as fuzzy."""
        return EntrySet([entry for entry in self.entries
                         if entry.isfuzzy])

    def gettranslated(self):
        """Return a new EntrySet holding the translated entries."""
        return EntrySet([entry for entry in self.entries
                         if entry.istranslated])

    def getuntranslated(self):
        """Return a new EntrySet holding the entries that are neither
        fuzzy nor translated."""
        return EntrySet([entry for entry in self.entries
                         if not entry.isfuzzy and not entry.istranslated])

    def getobsolete(self):
        """Return a new EntrySet with no regular entries and with this
        set's obsolete entries as its obsoletes."""
        return EntrySet([], self.obsoletes)

    def stats(self):
        """Return the Stats object for this EntrySet.

        The object is created on the first call and cached, so later
        changes to self.entries are not reflected in it."""
        # Identity test: an equality test could be fooled by a cached
        # object that defines its own comparison.
        if self.__stats__ is None:
            self.__stats__ = Stats(self.entries)
        return self.__stats__

    def iter(self):
        """Yield the tuples (entry, msgid, msgstr).

        Every entry yields (entry, msgid, first msgstr).  Entries with
        more than one msgstr additionally yield
        (entry, msgid_plural, msgstr) for each remaining msgstr."""
        for entry in self.entries:
            yield entry, entry.msgid, entry.msgstr
            for msgstr in entry.msgstrs[1:]:
                yield entry, entry.msgid_plural, msgstr


class PoFile(EntrySet):
    """Represents a po-file. Contains a list of entries plus some high-level
    information pertaining to the header.

    Attributes (in addition to those of EntrySet):
      headercomments   -- comments of the first entry starting with '# '
      headerproperties -- dict of the 'Key: value' fields found in the
                          msgstr of the first entry
      name             -- the 'Project-Id-Version' field, or None
      lasttranslator   -- the 'Last-Translator' field, or None
    """
    def __init__(self, lines):
        """Initializes this PoFile from the provided list of lines.

        The first parsed entry is taken to be the header.  If no entry
        could be parsed at all, the header information is left empty."""
        entries, obsoletes = parselines(lines)
        EntrySet.__init__(self, entries, obsoletes)
        props = {}
        self.headerproperties = props
        self.headercomments = []

        if self.entries:
            header = self.entries[0]
            self.headercomments = header.getcomments('# ')

            # The msgstr is stored unescaped, so header fields are separated
            # by the two characters backslash + n rather than real newlines.
            for line in header.msgstr.split('\\n'):
                # Split on the first colon only: values such as dates with
                # a time of day or URLs contain colons themselves.
                kv = line.split(':', 1)
                if len(kv) == 2:
                    props[kv[0].strip()] = kv[1].strip()

        self.name = props.get('Project-Id-Version')
        self.lasttranslator = props.get('Last-Translator')

class Stats:
    """Class for managing statistics for a list of Entries.

    Attributes: fuzzy, untranslated, translated, total, pluralentries
    (entry counts), msgid_chars, msgstr_chars, msgid_words, msgstr_words
    (totals over all counted entries), avg_msgid_chars and
    avg_msgstr_chars (characters per counted entry).
    """
    def __init__(self, entries):
        """Computes the statistics for the given list of Entries.

        The first element of 'entries' is always skipped; it is expected
        to be the po-file header.  Words are counted by splitting on
        whitespace.  For plural entries the msgid_plural and all msgstrs
        are included in the character and word totals.

        If no entries remain after skipping the first one, the averages
        are set to 0 instead of dividing by zero."""
        fuzzy = untranslated = total = translated = pluralentries = 0
        msgid_chars = msgstr_chars = 0
        msgid_words = msgstr_words = 0

        for entry in entries[1:]:
            total += 1
            # An entry is counted in exactly one of the three categories
            if entry.istranslated:
                translated += 1
            elif entry.isfuzzy:
                fuzzy += 1
            else:
                untranslated += 1

            msgid_chars += len(entry.msgid)
            msgid_words += len(entry.msgid.split())
            if entry.hasplurals:
                msgid_chars += len(entry.msgid_plural)
                msgid_words += len(entry.msgid_plural.split())
                msgstr_chars += sum([len(string) for string in entry.msgstrs])
                msgstr_words += sum([len(string.split()) for string
                                     in entry.msgstrs])
                pluralentries += 1
            else:
                msgstr_chars += len(entry.msgstr)
                msgstr_words += len(entry.msgstr.split())

        self.fuzzy = fuzzy
        self.untranslated = untranslated
        self.total = total
        self.pluralentries = pluralentries
        self.translated = translated

        self.msgid_chars = msgid_chars
        self.msgstr_chars = msgstr_chars
        self.msgid_words = msgid_words
        self.msgstr_words = msgstr_words

        if total:
            self.avg_msgid_chars = msgid_chars / total
            self.avg_msgstr_chars = msgstr_chars / total
        else:
            # Nothing to average over (header-only or empty input)
            self.avg_msgid_chars = 0
            self.avg_msgstr_chars = 0

    def __str__(self):
        """Return one 'name: value' line per attribute, sorted by name."""
        keyvalstrings = [''.join([key, ': ', str(val), '\n'])
                         for key, val in self.__dict__.items()]
        keyvalstrings.sort()
        return ''.join(keyvalstrings)
        

class Entry:
    """This class represents a po-file entry. Contains fields that describe:

    * comments (translator-, automatic, reference and flag types)
    * msgid
    * msgstr(s)
    * miscellaneous informations (line count, translation status)
    """

    def __init__(self):
        """This will only initialize all the fields of an Entry object.
        Invoke the 'load' method to load information into it."""
        self.comments = () # All lines starting with '#', in file order
        self.msgctxt = None
        self.msgid = None
        self.msgid_plural = None
        self.msgstr = None # This is ONLY the first, if there is more than one
        self.msgstrs = []
        self.hasplurals = False
        self.hascontext = False
        self.entryline = None # Line number of first comment
        self.linenumber = None # Line number of msgid
        self.rawlines = [] # A list of the actual lines of this entry
        self.istranslated = False # Translated: not fuzzy, and no empty msgstr
        self.isfuzzy = False # Marked as fuzzy (having possibly empty msgstr)

    def load(self, lines, entryline=None):
        """Initializes the variables of this Entry according to the contents
        of the 'lines' parameter.  If entryline is specified, this will be
        stored as the line number of the entry in the po-file, and the
        line number of the msgid is derived from it; otherwise both stay
        None.

        'lines' may be any iterable of strings; it is copied and never
        modified.  All comment lines are assumed to precede the msgctxt/
        msgid/msgstr lines.

        Returns False if all lines are comments (such as for obsolete
        entries), otherwise True.  Raises an exception if the non-comment
        lines do not form a msgid/msgstr block."""
        # Copy first: callers may pass a tuple (see copy()) or an iterator
        lines = list(lines)
        self.entryline = entryline
        self.rawlines = tuple(lines)

        # Note: comment order has NOT been verified.
        comments = [line for line in lines if line.startswith('#')]
        self.comments = tuple(comments)
        commentcount = len(comments)

        if commentcount == len(lines):
            return False

        self.isfuzzy = False
        for comment in self.getcomments('#, '):
            if 'fuzzy' in comment:
                # There might be trouble with strings that are not translated,
                # but marked as fuzzy nonetheless.
                self.isfuzzy = True

        # Store the actual line number of the msgid, when it can be known
        if entryline is None:
            self.linenumber = None
        else:
            self.linenumber = entryline + commentcount

        index = commentcount
        # Optional context
        self.msgctxt = None
        self.hascontext = lines[index].startswith('msgctxt ')
        if self.hascontext:
            self.msgctxt, index = extract_string('msgctxt ', lines, index)

        # Next thing should be the msgid
        self.msgid, index = extract_string('msgid ', lines, index)

        # Check for plural entries
        self.msgid_plural = None
        self.hasplurals = lines[index].startswith('msgid_plural ')
        if self.hasplurals:
            self.msgid_plural, index = extract_string('msgid_plural ',
                                                      lines, index)

            # Collect into a fresh list so that loading twice does not
            # append to the strings of a previous load.
            msgstrs = []
            while index < len(lines) and lines[index].startswith('msgstr['):
                string, index = extract_string('msgstr[%d] ' % len(msgstrs),
                                               lines, index)
                msgstrs.append(string)

            self.msgstrs = msgstrs
            self.msgstr = msgstrs[0]

        else:
            self.msgstr, index = extract_string('msgstr ', lines, index)
            self.msgstrs = [self.msgstr]

        self.istranslated = (not self.isfuzzy) and ('' not in self.msgstrs)

        return True

    def getcomments(self, pattern='', strip=False):
        """Return comments, optionally starting with a particular pattern.

        Returns all the comments for this entry that start with the
        given pattern, useful for extracting, say, translator-comments
        ('# '), previous msgid ('#| msgid ') and so on.  Default pattern
        will return all comment strings.  If strip is True,
        the pattern is removed from the returned strings; otherwise pattern
        is included.  Before 'load' has been called the result is empty."""
        striplength = 0
        if strip:
            striplength = len(pattern)
        return [line[striplength:] for line in self.comments
                if line.startswith(pattern)]

    def __str__(self):
        """Return the raw lines of this entry joined into one string."""
        return ''.join(self.rawlines)

    def copy(self):
        """Return a new Entry loaded from the same raw lines and entry
        line number as this one."""
        other = Entry()
        # rawlines is a tuple; hand over a list so that parsing code which
        # expects a mutable sequence keeps working.
        other.load(list(self.rawlines), self.entryline)
        return other


def _unquote(line):
    """Strip the opening quote, trailing whitespace/line ending and the
    closing quote (when present) from one quoted po-file line."""
    text = line.rstrip(' \t\r\n')
    if len(text) > 1 and text.endswith('"'):
        return text[1:-1]
    return text[1:]


def extract_string(pattern, lines, index=0):
    """Extracts the text of an msgid or msgstr, not including
    "msgid"/"msgstr", quotation marks or newlines.

    'pattern' is the keyword including its trailing space (for example
    'msgid '), which must be found at the start of lines[index].  The
    quoted string on that line and on all directly following lines that
    start with a quotation mark are concatenated.  Escape sequences are
    left untouched.

    'lines' may be any indexable sequence (list or tuple) and is not
    modified.

    Returns the tuple (text, nextindex) where nextindex is the index of
    the first line that is not part of the string.

    Raises ValueError if index is past the end of 'lines' or if the line
    does not start with 'pattern'.
    """
    if index >= len(lines):
        raise ValueError('Pattern "' + pattern + '" expected, but there '
                         'are no more lines.')

    first = lines[index]
    if not first.startswith(pattern):
        raise ValueError('Pattern "'+pattern+'" not found at start of string "'
                         + first + '".')

    msglines = []
    position = index
    line = first[len(pattern):] # Strip pattern
    while line.startswith('"'):
        msglines.append(_unquote(line))
        position += 1
        if position >= len(lines):
            break
        line = lines[position]

    return ''.join(msglines), position

def sortcomments(comments):
    """Deprecated; always raises DeprecationWarning.

    This function used to split a list of comment lines by comment type.
    Use Entry.getcomments with the wanted prefix ('# ', '#. ', '#: ',
    '#, ' or '#| ') instead."""
    # The old implementation followed this statement and could never run,
    # so it has been removed.
    raise DeprecationWarning('use Entry.getcomments(self, pattern, ...)')

def grab_sub_string(string, pattern, terminator=None, start=0):
    """From the given string, returns the text enclosed within pattern and
    terminator (which is the start pattern unless otherwise specified).
    The search for pattern begins at index 'start'.

    The return value is a tuple with the enclosed text, start index and end
    index, where the indices delimit the enclosed text within 'string'.

    Raises ValueError if pattern or terminator cannot be found.
    """
    # Honour 'start'; it used to be accepted but ignored.
    startindex = string.index(pattern, start) + len(pattern)
    if terminator is None:
        terminator = pattern
    endindex = string.index(terminator, startindex)

    return (string[startindex:endindex], startindex, endindex)


def parselines(lines):
    """Parses the supplied lines of a po-file.

    The lines are split into chunks at every line consisting of a single
    newline, and one Entry is loaded from each non-empty chunk.

    'lines' may be any iterable of strings; it is copied and never
    modified.

    Returns the tuple (entries, obsoletes): two lists of Entry objects,
    where obsoletes holds the entries consisting of comments only.

    If an entry cannot be parsed, the traceback is printed and the
    program exits with status 1."""
    # Work on a copy so that tidying up below does not alter the caller's
    # list.
    lines = list(lines)
    if not lines:
        return [], []

    # Make sure the last entry is terminated by an empty line as well
    if not lines[-1].endswith('\n'):
        lines[-1] += '\n'
    if lines[-1] != '\n':
        lines.append('\n')

    entries = []
    obsoletes = []

    start = 0 # Index of the first line of the current chunk
    for end, line in enumerate(lines):
        if line != '\n':
            continue

        chunk = lines[start:end]
        # Line numbering starts at 1 while list indices start at 0
        linecount = start + 1
        start = end + 1

        if not chunk:
            # Several empty lines in a row; there is no entry here
            continue

        try:
            entry = Entry()
            successful = entry.load(chunk, linecount)
        except Exception:
            traceback.print_exc()
            sys.exit(1)

        if successful:
            entries.append(entry)
        else:
            obsoletes.append(entry)

    return entries, obsoletes


def colorize(string, id):
    """Wrap 'string' in the terminal escape sequence selected by 'id'.

    'id' is inserted between ESC[ and m in front of the string, and the
    sequence ESC[0m is appended.  If 'id' is None the string is returned
    unchanged."""
    wrapped = string
    if id is not None:
        wrapped = '\x1b[%sm%s\x1b[0m' % (id, string)
    return wrapped


class Printer:
    """Writes entries in po-file syntax to a file-like object."""
    def __init__(self, out):
        """'out' is the object whose write method receives all output."""
        self.out = out

    def w(self, string):
        """Write 'string' to the output exactly as given.

        The stream's write method is used rather than a print statement
        with a trailing comma: the latter inserts a space in front of the
        next output whenever the previous string did not end in a
        newline, which garbles output ending in an escape sequence."""
        self.out.write(string)

    def write_entry(self, entry):
        """Write the comments, the optional msgctxt, the msgid and the
        msgstr(s) of 'entry', followed by an empty line.

        Every string is written on a single line; the line wrapping of
        the original file is not reproduced."""
        self.write_comments(entry)
        if entry.hascontext:
            self.write_block('msgctxt', entry.msgctxt)
        self.write_block('msgid', entry.msgid)
        if entry.hasplurals:
            self.write_block('msgid_plural', entry.msgid_plural)
            for i, msgstr in enumerate(entry.msgstrs):
                self.write_block('msgstr[%d]' % i, msgstr)
        else:
            self.write_block('msgstr', entry.msgstr)
        self.write_terminator()

    def write_comments(self, entry):
        """Write all comment lines of 'entry'."""
        for comment in entry.comments:
            self.write_comment(comment)

    def write_comment(self, comment):
        """Write one comment line; 'comment' is written as it is, so it
        should include its line ending."""
        self.w(comment)

    def write_block(self, identifier, string):
        """Write the line: identifier "string"."""
        self.w('%s "%s"\n' % (identifier, string))

    def write_terminator(self):
        """Write the empty line that separates entries."""
        self.w('\n')

# Maps color names to the codes that colorize() places inside the terminal
# escape sequence.  The None entry lets a Scheme use None for "no color",
# in which case colorize() returns the string unchanged.
col = {'blue': '0;34', 'light red': '1;31', 'light purple': '1;35', 
       'brown': '0;33', 'purple': '0;35', 'yellow': '1;33', 
       'dark gray': '1;30', 'light cyan': '1;36', 'black': '0;30', 
       'light green': '1;32', 'cyan': '0;36', 'green': '0;32', 
       'light blue': '1;34', 'light gray': '0;37', 'white': '1;37', 
       'red': '0;31', None: None}


class Scheme:
    """A set of color codes for the different parts of a printed entry.

    The constructor takes color names (keys of 'col') and stores the
    corresponding codes in the attributes type, msg, comment_id, comment
    and notice."""
    def __init__(self, type, msg, comment_id, comment, notice):
        """Look up every color name in 'col' and store the resulting
        code; raises KeyError for an unknown name."""
        names = (type, msg, comment_id, comment, notice)
        codes = [col[name] for name in names]
        (self.type, self.msg, self.comment_id,
         self.comment, self.notice) = codes


# The color schemes that can be selected by name on the command line.
# Scheme arguments are, in order: type, msg, comment_id, comment, notice.
schemes = {'greenish' : Scheme('green', 'light blue', 'light cyan', 'cyan', 
                          'yellow'),
           'simple' : Scheme(None, 'red', None, 'blue',
                           'green')}


class PrettyPrinter(Printer):
    """A Printer which colors its output according to a Scheme."""
    def __init__(self, out, scheme):
        """'out' receives the output; 'scheme' is the Scheme whose color
        codes are used."""
        Printer.__init__(self, out)
        self.scheme = scheme

    def write_comment(self, comment):
        """Write a comment line in color.

        The first two characters use the comment_id color and the
        remainder the comment color, except that the remainder of a flag
        comment ('#, ') containing 'fuzzy' uses the notice color."""
        scheme = self.scheme
        c1, c2 = scheme.comment_id, scheme.comment
        if comment.startswith('#, ') and 'fuzzy' in comment:
            c2 = scheme.notice
        self.w(colorize(comment[:2], c1) +
               colorize(comment[2:], c2))

    def write_block(self, identifier, string):
        """Write the line: identifier "string", in color.

        The identifier and the quotation marks use the type color, or the
        notice color when the string is empty.  The string itself uses
        the msg color, or the notice color for a msgctxt block."""
        scheme = self.scheme
        secondary = scheme.type
        if string == '':
            secondary = scheme.notice
        # Decide on the plain identifier, before it is wrapped in escape
        # sequences below.
        if identifier == 'msgctxt':
            primary = scheme.notice
        else:
            primary = scheme.msg
        label = colorize(identifier, secondary)
        text = colorize(string, primary)
        quote = colorize('"', secondary)
        self.w('%s %s%s%s\n' % (label, quote, text, quote))

def print_pofile(pofile, printer, include_obsolete):
    """Write all entries of 'pofile' using 'printer'.

    If include_obsolete is true, the comments of every obsolete entry
    are written afterwards, each followed by a terminator."""
    for current in pofile.entries:
        printer.write_entry(current)

    if not include_obsolete:
        return

    for stale in pofile.obsoletes:
        printer.write_comments(stale)
        printer.write_terminator()


def build_parser():
    """Create the OptionParser describing the command line interface.

    General options are added to the parser itself; the remaining ones
    are placed in the groups 'Entrywise printing' and 'Stringwise
    printing'."""
    def flag(target, names, helptext, **extra):
        # Register a boolean switch (action 'store_true') on the parser
        # or option group 'target'.
        extra['action'] = 'store_true'
        extra['help'] = helptext
        target.add_option(*names, **extra)

    parser = OptionParser(usage='%prog [options] [file]',
                          version=version,
                          description='Parse gettext po-files and '
                          'print/reformat/grep their entries.')

    flag(parser, ('-p', '--pipe'), 'read from standard input')
    flag(parser, ('--stats',), 'print stats')

    entrywise = parser.add_option_group('Entrywise printing')
    flag(entrywise, ('-C', '--color'), 'print fancy colors')
    schemenames = ', '.join(schemes)
    entrywise.add_option('-S', '--color-scheme', metavar='<name>',
                         dest='scheme', default='simple',
                         help='color scheme to use - one of: %s %s'
                         % (schemenames, '[default: %default]'))
    flag(entrywise, ('-o', '--exclude-obsolete'),
         'exclude obsolete entries from output', dest='exclude_obsolete')
    flag(entrywise, ('-x', '--xml'), 'parse xml in strings')

    stringwise = parser.add_option_group('Stringwise printing')
    flag(stringwise, ('-i', '--msgid'), 'print msgids, one per line')
    flag(stringwise, ('-s', '--msgstr'), 'print msgstrs, one per line')
    flag(stringwise, ('-n', '--line-numbers'), 'print line numbers')
    stringwise.add_option('--msgid-contains', metavar='<string>',
                          dest='msgid_pattern',
                          help='print only entries whose msgids '
                          'contain string')
    stringwise.add_option('--msgstr-contains', metavar='<string>',
                          dest='msgstr_pattern',
                          help='print only entries whose msgstrs '
                          'contain string')
    flag(stringwise, ('-I', '--ignore-case'),
         'ignore case distinctions when matching patterns')
    return parser

def get_pofiles(opts, args):
    """Yield one PoFile for every input selected on the command line.

    Each file name in 'args' is read as UTF-8 and closed again before
    its PoFile is yielded.  If opts.pipe is set, standard input is read
    and decoded as UTF-8 after all named files.

    If there is no file name and opts.pipe is not set, a message is
    written to standard output and the program exits with status 1.
    Since this is a generator, that happens when the first item is
    requested."""
    if not args and not opts.pipe:
        sys.stdout.write('No file to read from, either designate a file '
                         'or stdin\n')
        sys.exit(1)

    for filename in args:
        infile = codecs.open(filename, 'r', 'utf8')
        # Close the file even if reading or decoding fails
        try:
            lines = infile.readlines()
        finally:
            infile.close()
        yield PoFile(lines)
    if opts.pipe:
        yield PoFile([codecs.decode(line, 'utf8')
                      for line in sys.stdin.readlines()])

def msg_summary(pofile, opts, out):
    """Print msgids and/or msgstrs of 'pofile' to 'out', one per line.

    opts.msgid and opts.msgstr select what is printed; if both are set,
    an empty line follows each pair.  If opts.line_numbers is set, the
    first line printed for an entry is prefixed with the line number of
    the entry's msgid and the second with equally many spaces.

    opts.msgid_pattern and opts.msgstr_pattern, when not None, are
    decoded as UTF-8 and restrict the output to strings containing them;
    opts.ignore_case makes that test case-insensitive."""
    if opts.line_numbers:
        fmt = u'%5d: '
        space = u' ' * len(fmt % 0)
        def line_number_format(entry):
            # Number first, then padding of the same width
            yield fmt % entry.linenumber
            while True:
                yield space
    else:
        def line_number_format(entry):
            while True:
                yield u''

    match_msgid = (opts.msgid_pattern is not None)
    match_msgstr = (opts.msgstr_pattern is not None)
    # Patterns are decoded so that they can be compared with the decoded
    # strings of the po-file.
    if match_msgid:
        msgid_pattern = codecs.decode(opts.msgid_pattern, 'utf8')
    if match_msgstr:
        msgstr_pattern = codecs.decode(opts.msgstr_pattern, 'utf8')

    if opts.ignore_case:
        def match(pattern, string):
            return pattern.lower() in string.lower()
    else:
        def match(pattern, string):
            return pattern in string

    for entry, msgid, msgstr in pofile.iter():
        if match_msgid and not match(msgid_pattern, msgid):
            continue
        if match_msgstr and not match(msgstr_pattern, msgstr):
            continue

        lnf = line_number_format(entry)
        if opts.msgid:
            print >> out, lnf.next() + msgid
        if opts.msgstr:
            print >> out, lnf.next() + msgstr
        if opts.msgid and opts.msgstr:
            print >> out # Add spacing if both are printed


class XMLParser(xml.sax.handler.ContentHandler):
    """Checks the markup in the strings of a set of entries by running
    them through the SAX parser, with this object as content handler."""
    def __init__(self, entries, printer):
        """'entries' must provide iter() yielding (entry, msgid, msgstr)
        tuples; 'printer' is used to write the entries found to be bad.

        NOTE(review): the constructor of ContentHandler is not invoked
        here."""
        self.entries = entries
        self.printer = printer

    def filter(self, string):
        """Prepare 'string' for parsing.

        Strings containing '<' get backslash-escaped quotation marks
        unescaped, are wrapped in a single <xml> element and returned
        encoded as UTF-8.  For all other strings the empty string is
        returned."""
        if '<' in string:
            # Note: this local name hides the xml module inside this method
            xml = u''.join([u'<xml>', 
                            string.replace(u'\\"', u'"'), u'</xml>'])
            return xml.encode('utf8')
        else:
            return ''
        
    def check(self):
        """Parse the msgid and msgstr of every entry.

        Entries whose msgid cannot be parsed are skipped; the parser
        message and the filtered msgid are printed unless the filtered
        msgid is empty.  If the msgid parses but the msgstr does not, the
        line number and parser message are printed and the entry is
        written with the printer."""
        for (entry, msgid, msgstr) in self.entries.iter():
            try:
                xml_msgid = self.filter(msgid)
                xml.sax.parseString(str(xml_msgid), self)
            except xml.sax.SAXParseException, msg:
                # Presumably parsing '' (a msgid without markup) also ends
                # up here, which is why that case is not reported.
                if xml_msgid != '':
                    print msg
                    print xml_msgid
                continue # msgid is bad xml, don't bother
            # The msgid is okay - this means we must complain if msgstr isn't
            # NOTE(review): a msgstr without '<' is filtered to '' as well;
            # confirm whether such entries are meant to be reported.
            try:
                xml.sax.parseString(str(self.filter(msgstr)), self)
            except xml.sax.SAXParseException, msg:
                print '--- At line %d: %s' % (entry.linenumber, msg)
                self.printer.write_entry(entry)

def get_printer(out, opts):
    """Return the printer selected by the options, writing to 'out'.

    Without opts.color this is a plain Printer; otherwise it is a
    PrettyPrinter using the scheme named by opts.scheme."""
    if not opts.color:
        return Printer(out)
    chosen = schemes[opts.scheme]
    return PrettyPrinter(out, chosen)

def main():
    """Method for testing things."""
    parser = build_parser()
    opts, args = parser.parse_args()
    pofiles = get_pofiles(opts, args)

    sys.stdout = codecs.getwriter('utf8')(sys.stdout)
    out = sys.stdout
    entrywise_mode = not (opts.msgid or opts.msgstr or opts.stats or opts.xml)
    
    for pofile in pofiles:
        if entrywise_mode:
            printer = get_printer(out, opts)
            print_pofile(pofile, printer, not opts.exclude_obsolete)
        elif opts.msgid or opts.msgstr:
            msg_summary(pofile, opts, out)
        elif opts.xml:
            printer = get_printer(out, opts)
            xmlparser = XMLParser(pofile, printer)
            xmlparser.check()

        if opts.stats:
            stats = pofile.stats()

            ftotal = float(stats.total)
            upct = 100 * stats.untranslated / ftotal
            fpct = 100 * stats.fuzzy / ftotal
            tpct = 100 * stats.translated / ftotal

            print
            print 'Statistics'
            print '----------'
            print
            print 'Project:', pofile.name
            print 'Last translator:', pofile.lasttranslator
            print 'Language team:', pofile.headerproperties.get('Language-Team')
            print
            print 'Untranslated      %6d [%3.f%%]' % (stats.untranslated, upct)
            print 'Fuzzy             %6d [%3.f%%]' % (stats.fuzzy, fpct)
            print 'Translated        %6d [%3.f%%]' % (stats.translated, tpct)
            print 'Total             %6d' % stats.total
            print
            print 'Plural entries    %6d' % stats.pluralentries
            print 'Total msgid chars %6d' % stats.msgid_chars
            print 'Total msgstr chars%6d' % stats.msgstr_chars
            print

# Run the command line interface when executed as a script
if __name__ == '__main__':
    main()