#!/usr/bin/env python
"""
gtparse -- A gettext parsing module in Python
Copyright (C) 2007-2008  Ask Hjorth Larsen

This program is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.

This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
GNU General Public License for more details.

You should have received a copy of the GNU General Public License
along with this program.  If not, see <http://www.gnu.org/licenses/>.
"""

import sys
import traceback
import codecs
import re
import xml.sax
import xml.sax.handler
from optparse import OptionParser

version = '0.15'


def _to_unicode(value):
    """Return value as text, decoding byte strings as UTF-8.

    Text that is already decoded is returned unchanged, so the helper is
    safe for both command-line arguments and lines read from stdin.
    """
    if isinstance(value, bytes):
        return value.decode('utf8')
    return value


def _utf8_stdout():
    """Return a writer that encodes everything sent to stdout as UTF-8.

    When sys.stdout exposes an underlying binary 'buffer' the writer wraps
    that; otherwise it wraps sys.stdout itself.
    """
    stream = getattr(sys.stdout, 'buffer', sys.stdout)
    return codecs.getwriter('utf8')(stream)


def _percentage(count, total):
    """Return count as a percentage of total, or 0.0 when total is zero."""
    if not total:
        return 0.0
    return 100 * count / float(total)


class EntrySet:
    """This class represents a list of Entries, and exposes methods for
    managing statistics."""

    def __init__(self, entries, obsoletes=()):
        """Creates a new EntrySet from the provided list of entries.

        'obsoletes' optionally holds the comment-only (obsolete) entries."""
        self.entries = list(entries)
        self.obsoletes = list(obsoletes)
        self.__stats__ = None  # Lazily created by stats()

    def get(self, propertyname):
        """Given a string which is the name of a field of the Entry class,
        returns the list of values of this field for all entries."""
        return [getattr(entry, propertyname) for entry in self.entries]

    def getfuzzy(self):
        """Returns a new EntrySet consisting of all fuzzy Entries."""
        return EntrySet([entry for entry in self.entries if entry.isfuzzy])

    def gettranslated(self):
        """Returns a new EntrySet consisting of all translated Entries."""
        return EntrySet([entry for entry in self.entries
                         if entry.istranslated])

    def getuntranslated(self):
        """Returns a new EntrySet consisting of all untranslated Entries,
        i.e. those that are neither fuzzy nor translated."""
        return EntrySet([entry for entry in self.entries
                         if not entry.isfuzzy and not entry.istranslated])

    def getobsolete(self):
        """Returns a new EntrySet with no regular entries and this set's
        obsolete entries."""
        return EntrySet([], self.obsoletes)

    def stats(self):
        """Returns a lazily-initialized Stats-object for this EntrySet."""
        if self.__stats__ is None:
            self.__stats__ = Stats(self.entries)
        return self.__stats__

    def iter(self):
        """Yield the tuples (entry, msgid, msgstr).

        The msgid and msgstrs will loop over plural forms as appropriate
        for the entry: the first msgstr is paired with msgid, every
        further msgstr with msgid_plural."""
        for entry in self.entries:
            yield entry, entry.msgid, entry.msgstr
            for msgstr in entry.msgstrs[1:]:
                yield entry, entry.msgid_plural, msgstr


class PoFile(EntrySet):
    """Represents a po-file.  Contains a list of entries plus some
    high-level information pertaining to the header."""

    def __init__(self, lines):
        """Initializes this PoFile from the provided list of lines.

        The first parsed entry is treated as the header.  Its msgstr is
        split on the escaped newline sequence and every 'Key: value' line
        is stored in self.headerproperties."""
        entries, obsoletes = parselines(lines)
        EntrySet.__init__(self, entries, obsoletes)
        self.headercomments = []
        props = {}
        self.headerproperties = props
        if self.entries:
            header = self.entries[0]
            self.headercomments = header.getcomments('# ')
            # msgstr still contains the two-character escape '\n'
            for line in header.msgstr.split('\\n'):
                # Split on the first colon only, so that values containing
                # colons (dates, URLs) are kept instead of being dropped
                kv = line.split(':', 1)
                if len(kv) == 2:
                    props[kv[0].strip()] = kv[1].strip()
        self.name = props.get('Project-Id-Version')
        self.lasttranslator = props.get('Last-Translator')


class Stats:
    """Class for managing statistics for a list of Entries."""

    def __init__(self, entries):
        """Initializes a number of fields with various statistical
        information about the given list of Entries.

        The first entry is skipped (PoFile treats it as the header).  The
        averages use integer division and are 0 when nothing is counted."""
        fuzzy = untranslated = total = translated = pluralentries = 0
        msgid_chars = msgstr_chars = 0
        msgid_words = msgstr_words = 0
        for entry in entries[1:]:
            total += 1
            if entry.istranslated:
                translated += 1
            elif entry.isfuzzy:
                fuzzy += 1
            else:
                untranslated += 1
            msgid_chars += len(entry.msgid)
            msgid_words += len(entry.msgid.split())
            if entry.hasplurals:
                msgid_chars += len(entry.msgid_plural)
                msgid_words += len(entry.msgid_plural.split())
                msgstr_chars += sum(len(string) for string in entry.msgstrs)
                msgstr_words += sum(len(string.split())
                                    for string in entry.msgstrs)
                pluralentries += 1
            else:
                msgstr_chars += len(entry.msgstr)
                msgstr_words += len(entry.msgstr.split())
        self.fuzzy = fuzzy
        self.untranslated = untranslated
        self.total = total
        self.pluralentries = pluralentries
        self.translated = translated
        self.msgid_chars = msgid_chars
        self.msgstr_chars = msgstr_chars
        self.msgid_words = msgid_words
        self.msgstr_words = msgstr_words
        # Guard against catalogs holding nothing but the header
        divisor = total or 1
        self.avg_msgid_chars = msgid_chars // divisor
        self.avg_msgstr_chars = msgstr_chars // divisor

    def __str__(self):
        """Returns one 'name: value' line per field, sorted."""
        keyvalstrings = ['%s: %s\n' % (key, val)
                         for key, val in self.__dict__.items()]
        keyvalstrings.sort()
        return ''.join(keyvalstrings)


class Entry:
    """This class represents a po-file entry.  Contains fields that describe:

    * comments (translator-, automatic, reference and flag types)
    * msgid
    * msgstr(s)
    * miscellaneous informations (line count, translation status)
    """

    def __init__(self):
        """This will only initialize all the fields of an Entry object.
        Invoke the 'load' method to load information into it."""
        # All comment lines; use getcomments(pattern) to select a kind
        # ('# ' translator, '#. ' extracted, '#: ' reference, '#, ' flag)
        self.comments = ()
        self.msgctxt = None
        self.msgid = None
        self.msgid_plural = None
        self.msgstr = None  # This is ONLY the first, if there is more than one
        self.msgstrs = []
        self.hasplurals = False
        self.hascontext = False
        self.entryline = None  # Line number of first comment
        self.linenumber = None  # Line number of msgid
        self.rawlines = []  # A list of the actual lines of this entry
        self.istranslated = False  # Translated: not fuzzy, and no empty msgstr
        self.isfuzzy = False  # Marked as fuzzy (having possibly empty msgstr)

    def load(self, lines, entryline=None):
        """Initializes the variables of this Entry according to the contents
        of the 'lines' parameter.  If entryline is specified, this will be
        stored as the line number of the entry in the po-file.

        'lines' may be a list or a tuple.  Returns False if all lines are
        comments (such as for obsolete entries), otherwise True."""
        self.entryline = entryline
        self.rawlines = tuple(lines)

        # Note: comment order has NOT been verified.  The code below
        # assumes that all comment lines precede the msgctxt/msgid.
        comments = [line for line in lines if line.startswith('#')]
        self.comments = tuple(comments)
        commentcount = len(comments)
        if commentcount == len(lines):
            return False

        # There might be trouble with strings that are not translated,
        # but marked as fuzzy nonetheless.
        self.isfuzzy = any('fuzzy' in comment
                           for comment in self.getcomments('#, '))

        # Store the actual line number of the msgid (unknown without
        # an entryline)
        if entryline is None:
            self.linenumber = None
        else:
            self.linenumber = entryline + commentcount
        index = commentcount

        # Optional context
        self.hascontext = lines[commentcount].startswith('msgctxt ')
        if self.hascontext:
            self.msgctxt, index = extract_string('msgctxt ', lines, index)

        # Next thing should be the msgid
        self.msgid, index = extract_string('msgid ', lines, index)

        # Check for plural entries
        self.hasplurals = lines[index].startswith('msgid_plural ')
        if self.hasplurals:
            self.msgid_plural, index = extract_string('msgid_plural ',
                                                      lines, index)
            # Start from scratch so that loading twice does not
            # accumulate duplicate plural forms
            self.msgstrs = []
            plurcount = 0
            while index < len(lines) and lines[index].startswith('msgstr['):
                string, index = extract_string(
                    'msgstr[' + str(plurcount) + '] ', lines, index)
                plurcount += 1
                self.msgstrs.append(string)
            self.msgstr = self.msgstrs[0]
        else:
            self.msgstr, index = extract_string('msgstr ', lines, index)
            self.msgstrs = [self.msgstr]
        self.istranslated = (not self.isfuzzy) and '' not in self.msgstrs
        return True

    def getcomments(self, pattern='', strip=False):
        """Return comments, optionally starting with a particular pattern.

        Returns all the comments for this entry that start with the
        given pattern, useful for extracting, say, translator-comments
        ('# '), previous msgid ('#| msgid ') and so on.  Default pattern
        will return all comment strings.  If strip is True,
        the pattern is removed from the returned strings; otherwise pattern
        is included."""
        striplength = 0
        if strip:
            striplength = len(pattern)
        return [line[striplength:] for line in self.comments
                if line.startswith(pattern)]

    def __str__(self):
        """Returns the raw lines of this entry joined together."""
        return ''.join(self.rawlines)

    def copy(self):
        """Returns a new Entry loaded from this entry's raw lines."""
        other = Entry()
        other.load(self.rawlines, self.entryline)
        return other


def extract_string(pattern, lines, index=0):
    """Extracts the text of an msgid or msgstr, not including
    "msgid"/"msgstr", quotation marks or newlines.

    'lines' (a list or tuple, never modified) must have 'pattern' at the
    start of lines[index]; otherwise an Exception is raised.  Continuation
    lines starting with a quotation mark are appended.  Every quoted line
    is assumed to end with a quotation mark followed by a newline.

    Returns the tuple (text, index of the first line not consumed).
    """
    first = lines[index]
    if not first.startswith(pattern):
        raise Exception('Pattern "' + pattern + '" not found at start of string "'
                        + first + '".')
    # Work on a fresh list; 'lines' may be an immutable tuple
    candidates = [first[len(pattern):]]  # Strip pattern
    candidates.extend(lines[index + 1:])
    msglines = []
    for line in candidates:
        if not line.startswith('"'):
            break
        msglines.append(line[1:-2])  # Strip quotation marks and newline
    return ''.join(msglines), index + len(msglines)


def sortcomments(comments):
    """Deprecated; always raises DeprecationWarning.

    This function used to sort comments by their kind.  Use
    Entry.getcomments(pattern) with the wanted prefix ('# ', '#. ', '#: ',
    '#, ' or '#| ') instead."""
    raise DeprecationWarning('use Entry.getcomments(self, pattern, ...)')


def grab_sub_string(string, pattern, terminator=None, start=0):
    """From the given string, returns the text enclosed within pattern and
    terminator (which is the start pattern unless otherwise specified).
    The search for pattern begins at index 'start'.

    The return value is a tuple with the enclosed text, start index and
    end index.  Raises ValueError if pattern or terminator is not found.
    """
    startindex = string.index(pattern, start) + len(pattern)
    if terminator is None:
        terminator = pattern
    endindex = string.index(terminator, startindex)
    return (string[startindex:endindex], startindex, endindex)


def parselines(lines):
    """Parses the supplied list of lines, returning the tuple
    (entries, obsoletes) of Entry-objects.

    Entries are the chunks of lines separated by empty lines.  Chunks
    consisting of comments only end up in 'obsoletes'; empty chunks
    (consecutive empty lines) are skipped.  The supplied list is not
    modified.  If an entry cannot be parsed, the traceback is printed and
    the program exits with status 1."""
    # Work on a copy so that tidying up does not alter the caller's list
    lines = list(lines)
    if not lines:
        return [], []

    # Make sure the file is nice and tidy: the final chunk must be
    # terminated by an empty line just like all the others
    if not lines[-1].endswith('\n'):
        lines[-1] = lines[-1] + '\n'
    if lines[-1] != '\n':
        lines.append('\n')

    entries = []
    obsoletes = []
    start = 0  # Index of the first line of the current chunk
    for end, line in enumerate(lines):
        if line != '\n':
            continue
        chunk = lines[start:end]
        # Line numbering starts at 1 while list indices start at 0
        linecount = start + 1
        start = end + 1
        if not chunk:
            continue
        try:
            entry = Entry()
            successful = entry.load(chunk, linecount)
        except Exception:
            traceback.print_exc()
            sys.exit(1)
        if successful:
            entries.append(entry)
        else:
            obsoletes.append(entry)
    return entries, obsoletes


def colorize(string, id):
    """Wraps string in the terminal escape sequences for the color code
    'id'.  The string is returned unchanged when id is None."""
    if id is None:
        return string
    return '\x1b[%sm%s\x1b[0m' % (id, string)


class Printer:
    """Writes entries in po-file syntax to an output stream."""

    def __init__(self, out):
        """'out' is any object with a write method accepting text."""
        self.out = out

    def w(self, string):
        """Writes string to the output exactly as given; nothing (newline
        or separating space) is added."""
        self.out.write(string)

    def write_entry(self, entry):
        """Writes the comments, the msgctxt/msgid/msgstr blocks and a
        terminating empty line for entry."""
        self.write_comments(entry)
        if entry.hascontext:
            self.write_block('msgctxt', entry.msgctxt)
        self.write_block('msgid', entry.msgid)
        if entry.hasplurals:
            self.write_block('msgid_plural', entry.msgid_plural)
            for i, msgstr in enumerate(entry.msgstrs):
                self.write_block('msgstr[%d]' % i, msgstr)
        else:
            self.write_block('msgstr', entry.msgstr)
        self.write_terminator()

    def write_comments(self, entry):
        """Writes all comment lines of entry."""
        for comment in entry.comments:
            self.write_comment(comment)

    def write_comment(self, comment):
        """Writes one comment line (which includes its own newline)."""
        self.w(comment)

    def write_block(self, identifier, string):
        """Writes a line of the form: identifier "string"."""
        self.w('%s "%s"\n' % (identifier, string))

    def write_terminator(self):
        """Writes the empty line which separates entries."""
        self.w('\n')


# Color names mapped to the codes which colorize() embeds in its escapes
col = {'blue': '0;34',
       'light red': '1;31',
       'light purple': '1;35',
       'brown': '0;33',
       'purple': '0;35',
       'yellow': '1;33',
       'dark gray': '1;30',
       'light cyan': '1;36',
       'black': '0;30',
       'light green': '1;32',
       'cyan': '0;36',
       'green': '0;32',
       'light blue': '1;34',
       'light gray': '0;37',
       'white': '1;37',
       'red': '0;31',
       None: None}


class Scheme:
    """A set of color codes for the different parts of a printed entry."""

    def __init__(self, type, msg, comment_id, comment, notice):
        """Every argument is a key of 'col' (a color name or None)."""
        self.type = col[type]
        self.msg = col[msg]
        self.comment_id = col[comment_id]
        self.comment = col[comment]
        self.notice = col[notice]


schemes = {'greenish': Scheme('green', 'light blue', 'light cyan', 'cyan',
                              'yellow'),
           'simple': Scheme(None, 'red', None, 'blue', 'green')}


class PrettyPrinter(Printer):
    """A Printer which colors its output according to a Scheme."""

    def __init__(self, out, scheme):
        Printer.__init__(self, out)
        self.scheme = scheme

    def write_comment(self, comment):
        """Writes a comment, coloring its first two characters with the
        comment_id color and the rest with the comment color, or with the
        notice color for a flag comment mentioning 'fuzzy'."""
        scheme = self.scheme
        c1, c2 = scheme.comment_id, scheme.comment
        if comment.startswith('#, ') and 'fuzzy' in comment:
            c2 = scheme.notice
        self.w(colorize(comment[:2], c1) + colorize(comment[2:], c2))

    def write_block(self, identifier, string):
        """Writes a colored line of the form: identifier "string".

        The identifier and the quotation marks use the type color, or the
        notice color when the string is empty.  The string itself uses the
        msg color, or the notice color for a msgctxt block."""
        scheme = self.scheme
        secondary = scheme.type
        if string == '':
            secondary = scheme.notice
        # The identifier must be tested before it is colorized below
        if identifier == 'msgctxt':
            primary = scheme.notice
        else:
            primary = scheme.msg
        identifier = colorize(identifier, secondary)
        string = colorize(string, primary)
        quote = colorize('"', secondary)
        self.w('%s %s%s%s\n' % (identifier, quote, string, quote))


def print_pofile(pofile, printer, include_obsolete):
    """Writes every entry of pofile with printer.  If include_obsolete is
    true, the comments of the obsolete entries are written afterwards."""
    for entry in pofile.entries:
        printer.write_entry(entry)
    if include_obsolete:
        for entry in pofile.obsoletes:
            printer.write_comments(entry)
            printer.write_terminator()


def build_parser():
    """Returns the OptionParser for the command line interface."""
    defval = '[default: %default]'
    descr = 'Parse gettext po-files and print/reformat/grep their entries.'
    usage = '%prog [options] [file]'
    parser = OptionParser(description=descr, version=version, usage=usage)
    parser.add_option('-p', '--pipe', action='store_true',
                      help='read from standard input')
    parser.add_option('--stats', action='store_true',
                      help='print stats')

    catopts = parser.add_option_group('Entrywise printing')
    catopts.add_option('-C', '--color', action='store_true',
                       help='print fancy colors')
    catopts.add_option('-S', '--color-scheme', metavar='',
                       help='color scheme to use - one of: %s %s'
                       % (', '.join(sorted(schemes)), defval),
                       dest='scheme', default='simple')
    catopts.add_option('-o', '--exclude-obsolete', action='store_true',
                       dest='exclude_obsolete',
                       help='exclude obsolete entries from output')
    catopts.add_option('-x', '--xml', action='store_true',
                       help='parse xml in strings')

    grepopts = parser.add_option_group('Stringwise printing')
    grepopts.add_option('-i', '--msgid', action='store_true',
                        help='print msgids, one per line')
    grepopts.add_option('-s', '--msgstr', action='store_true',
                        help='print msgstrs, one per line')
    grepopts.add_option('-n', '--line-numbers', action='store_true',
                        help='print line numbers')
    grepopts.add_option('--msgid-contains', metavar='',
                        dest='msgid_pattern',
                        help='print only entries whose msgids contain string')
    grepopts.add_option('--msgstr-contains', metavar='',
                        dest='msgstr_pattern',
                        help='print only entries whose msgstrs contain string')
    grepopts.add_option('-I', '--ignore-case', action='store_true',
                        help='ignore case distinctions when matching patterns')
    return parser


def get_pofiles(opts, args):
    """Yields one PoFile for each file name in args, followed by one read
    from standard input if opts.pipe is set.  Prints a message and exits
    if there is nothing to read.  Files are decoded as UTF-8."""
    if not args and not opts.pipe:
        print('No file to read from, either designate a file or stdin')
        sys.exit(0)
    for filename in args:
        infile = codecs.open(filename, 'r', 'utf8')
        try:
            lines = infile.readlines()
        finally:
            # Do not leave the file open while the caller processes it
            infile.close()
        yield PoFile(lines)
    if opts.pipe:
        yield PoFile([_to_unicode(line) for line in sys.stdin.readlines()])


def msg_summary(pofile, opts, out):
    """Writes the msgids and/or msgstrs of pofile to out, one per line.

    Which strings are written is selected by opts.msgid and opts.msgstr.
    Only strings containing opts.msgid_pattern / opts.msgstr_pattern (when
    given) are included; opts.ignore_case makes the match case
    insensitive.  With opts.line_numbers the first line written for an
    entry is prefixed by the line number of its msgid, and further lines
    are indented by the same width."""
    if opts.line_numbers:
        fmt = u'%5d: '
        space = u' ' * len(fmt % 0)

        def line_number_format(entry):
            yield fmt % entry.linenumber
            while True:
                yield space
    else:
        def line_number_format(entry):
            while True:
                yield u''

    match_msgid = (opts.msgid_pattern is not None)
    match_msgstr = (opts.msgstr_pattern is not None)

    # Patterns from the command line may be byte strings
    if match_msgid:
        msgid_pattern = _to_unicode(opts.msgid_pattern)
    if match_msgstr:
        msgstr_pattern = _to_unicode(opts.msgstr_pattern)

    if opts.ignore_case:
        def match(pattern, string):
            return pattern.lower() in string.lower()
    else:
        def match(pattern, string):
            return pattern in string

    for entry, msgid, msgstr in pofile.iter():
        if match_msgid and not match(msgid_pattern, msgid):
            continue
        if match_msgstr and not match(msgstr_pattern, msgstr):
            continue

        lnf = line_number_format(entry)
        if opts.msgid:
            out.write(next(lnf) + msgid + u'\n')
        if opts.msgstr:
            out.write(next(lnf) + msgstr + u'\n')
        if opts.msgid and opts.msgstr:
            out.write(u'\n')  # Add spacing if both are printed


class XMLParser(xml.sax.handler.ContentHandler):
    """Checks that msgstrs are well-formed xml whenever their msgids are."""

    def __init__(self, entries, printer):
        """'entries' is an EntrySet; 'printer' is used to write the entries
        which fail the check."""
        xml.sax.handler.ContentHandler.__init__(self)
        self.entries = entries
        self.printer = printer

    def filter(self, string):
        """Returns string as a UTF-8 encoded xml document, or an empty byte
        string if it contains no '<' at all.  Escaped quotation marks are
        unescaped."""
        if '<' not in string:
            return b''
        # NOTE(review): this join used to surround the text with two empty
        # strings, which looks like wrapper tags that were lost at some
        # point.  A single root element is required for text mixed with
        # markup to parse at all, so one is supplied here.
        document = u''.join([u'<root>', string.replace(u'\\"', u'"'),
                             u'</root>'])
        return document.encode('utf8')

    def check(self):
        """Prints a complaint, followed by the entry, for every msgstr that
        is not well-formed xml although its msgid is.  A msgid with markup
        that does not parse is printed with the parser message and
        skipped; strings without any '<' in the msgid are skipped
        silently."""
        for (entry, msgid, msgstr) in self.entries.iter():
            xml_msgid = self.filter(msgid)
            try:
                xml.sax.parseString(xml_msgid, self)
            except xml.sax.SAXParseException as msg:
                if xml_msgid:
                    print(msg)
                    # Print text rather than encoded bytes
                    print(xml_msgid.decode('utf8'))
                continue  # msgid is bad xml, don't bother

            # The msgid is okay - this means we must complain if msgstr isn't
            try:
                xml.sax.parseString(self.filter(msgstr), self)
            except xml.sax.SAXParseException as msg:
                print('--- At line %d: %s' % (entry.linenumber, msg))
                self.printer.write_entry(entry)


def get_printer(out, opts):
    """Returns a PrettyPrinter using the scheme named opts.scheme if
    opts.color is set, otherwise a plain Printer."""
    if opts.color:
        return PrettyPrinter(out, schemes[opts.scheme])
    return Printer(out)


def main():
    """Command line entry point: parses the options and prints the
    requested view (entries, strings, xml check and/or statistics) of
    every po-file."""
    parser = build_parser()
    opts, args = parser.parse_args()

    pofiles = get_pofiles(opts, args)

    sys.stdout = _utf8_stdout()
    out = sys.stdout

    entrywise_mode = not (opts.msgid or opts.msgstr or opts.stats or opts.xml)

    for pofile in pofiles:
        if entrywise_mode:
            printer = get_printer(out, opts)
            print_pofile(pofile, printer, not opts.exclude_obsolete)
        elif opts.msgid or opts.msgstr:
            msg_summary(pofile, opts, out)
        elif opts.xml:
            printer = get_printer(out, opts)
            xmlparser = XMLParser(pofile, printer)
            xmlparser.check()

        if opts.stats:
            stats = pofile.stats()
            # Percentages are 0 for a catalog without any entries
            upct = _percentage(stats.untranslated, stats.total)
            fpct = _percentage(stats.fuzzy, stats.total)
            tpct = _percentage(stats.translated, stats.total)
            print('')
            print('Statistics')
            print('----------')
            print('')
            print('Project: %s' % (pofile.name,))
            print('Last translator: %s' % (pofile.lasttranslator,))
            print('Language team: %s'
                  % (pofile.headerproperties.get('Language-Team'),))
            print('')
            print('Untranslated %6d [%3.f%%]' % (stats.untranslated, upct))
            print('Fuzzy %6d [%3.f%%]' % (stats.fuzzy, fpct))
            print('Translated %6d [%3.f%%]' % (stats.translated, tpct))
            print('Total %6d' % stats.total)
            print('')
            print('Plural entries %6d' % stats.pluralentries)
            print('Total msgid chars %6d' % stats.msgid_chars)
            print('Total msgstr chars%6d' % stats.msgstr_chars)
            print('')


if __name__ == '__main__':
    main()