#!/usr/bin/python
# -*- coding: utf-8 -*-

import sys, locale

#locale.setlocale(locale.LC_ALL, ("en_DK", None))
#Setting the locale as in the line above won't work, so until it does we'll
#misuse UPPER and LOWER as defined below instead of the locale-dependent
#alphabets in the python string API.
#Normally these things should be accessed from string.uppercase etc.
#NOTE(review): if these literals are byte strings (as under Python 2 with the
#utf-8 source encoding declared at the top of the file), the three trailing
#letters presumably occupy more than one byte each - confirm that
#single-character lookups behave as intended.
UPPER = 'ABCDEFGHIJKLMNOPQRSTUVWXYZÆØÅ'
LOWER = 'abcdefghijklmnopqrstuvwxyzæøå'

#Help text shown to the user by main() when the script is started without a
#file name or with too many arguments.
#Adjacent string literals inside the parentheses are concatenated by the
#compiler, so no backslash line continuations are needed.
DOCUMENTATION = (
    '\n'
    'USAGE: poabc.py fileName.po\n'
    '\n'
    'poabc - PO-file Automatic Blunder Corrector for v0.3 beta\n'
    '\n'
    'This utility parses a .po translation file, writing suspected errors to\n'
    'standard output.  Errors are suspected whenever inconsistencies between\n'
    'msgid and msgstr are detected in terms of case or type of the leading\n'
    'characters, number or type of trailing non-alphabetic characters (such as\n'
    'punctuation) or number of hotkey assignments (underscores).\n'
    '\n'
    'Written by Ask Hjorth Larsen <asklarsen@gmail.com>\n'
    'Thanks to Kenneth Nielsen for miscellaneous programming and testing\n'
    'Mar 22, 2007.\n'
)

"""
-----------------------
Version history
-----------------------
v0.3
Never complains about the msgid "translator-credits"
Does not complain about quotation mark conversions from ' to \"
Prints total warning count
Improved output format
Prints summary of warnings and string counts

v0.2
Prints line numbers of warnings
Warns about untranslated strings
Supports singular/plural entry syntax
Warns about unsupported syntax
Improved robustness in general

v0.1
Compare case, leading/trailing characters, whitespace
"""

#Stand-in for the locale-dependent string.isalpha()
def isalpha(char):
    """Returns True if char is found in the UPPER or LOWER alphabet."""
    if isupper(char):
        return True
    return islower(char)

#Stand-in for the locale-dependent string.isupper()
def isupper(char):
    """Returns True if char occurs in the module-level UPPER alphabet."""
    return char in UPPER

#Stand-in for the locale-dependent string.islower()
def islower(char):
    """Returns True if char occurs in the module-level LOWER alphabet."""
    return char in LOWER

"""
Compares the first character in each of the two specified strings and
returns False if an error is suspected, otherwise True.

An error is suspected unless one of the following is true:
    1) both strings start with an uppercase letter
    2) both strings start with a lowercase letter
    3) both strings start with the same non-alphabetic character
"""
def compareCase(msgid, msgstr):
    """
    Checks that msgid and msgstr begin in a compatible way.

    Returns True when no error is suspected: both strings are empty, both
    start with an uppercase letter, both start with a lowercase letter, or
    both start with the same character. Returns False otherwise.
    """
    #An empty string is only consistent with another empty string
    if '' in (msgid, msgstr):
        return msgid == msgstr

    idFirst, strFirst = msgid[0], msgstr[0]

    #Unless both are letters, the leading characters should be identical
    if not (isalpha(idFirst) and isalpha(strFirst)):
        return idFirst == strFirst

    #Two letters: they must agree in case
    return ((isupper(idFirst) and isupper(strFirst))
            or (islower(idFirst) and islower(strFirst)))

"""
Returns the run of lines, beginning at the specified index, that start with '#'
or contain only whitespace, together with the index of the line after that run
"""
def readComments(lines, index):
    """
    Collects the comment block that begins at the specified index.

    A line belongs to the block if it starts with '#' or consists only of
    whitespace. Returns a tuple (comments, index) holding the collected
    lines and the index of the first line after the block.
    """
    lineCount = len(lines)
    end = index
    while end < lineCount:
        line = lines[end]
        if not (line.startswith('#') or line.isspace()):
            break
        end += 1
    return lines[index:end], end

def _stripQuotes(text):
    """
    Returns text without surrounding whitespace (including the line ending)
    and without the single double quote enclosing it on either side.
    """
    text = text.strip()
    if text.startswith('"'):
        text = text[1:]
    #Only the final quote is removed, so an escaped quote (\") just before
    #it stays part of the string
    if text.endswith('"'):
        text = text[:-1]
    return text


def readString(lines, index, qualifier):
    """
    Reads the quoted string introduced by qualifier at lines[index].

    The string may continue over any number of following lines that start
    with a double quote; all pieces are concatenated. The qualifier, the
    enclosing quotes and the line endings are not part of the result.

    Parameters:
        lines     -- list of lines of the .po file
        index     -- index of the line expected to start with qualifier
        qualifier -- line prefix including its trailing space, e.g. 'msgid '

    Returns a tuple (string, index) where index refers to the first line
    after the string. If index is past the end of lines, or the line does
    not start with qualifier, ('', index) is returned with index unchanged.
    """
    maxIndex = len(lines)

    #Nothing to read at end of file or on a line with another qualifier
    if index >= maxIndex or not lines[index].startswith(qualifier):
        return ('', index)

    #Quotes and line ending are stripped explicitly rather than by slicing
    #off a fixed number of characters, so that a last line without newline
    #or a line ending in CRLF is read correctly as well
    relevantLines = [_stripQuotes(lines[index][len(qualifier):])]
    index += 1

    while (index < maxIndex) and lines[index].startswith('"'):
        relevantLines.append(_stripQuotes(lines[index]))
        index += 1

    return (''.join(relevantLines), index)

def readPluralmsgid(lines, index):
    """Reads the string declared by 'msgid_plural ' at lines[index]."""
    qualifier = 'msgid_plural '
    return readString(lines, index, qualifier)

def readmsgid(lines, index):
    """Reads the string declared by 'msgid ' at lines[index]."""
    qualifier = 'msgid '
    return readString(lines, index, qualifier)

def readmsgstr(lines, index):
    """Reads the string declared by 'msgstr ' at lines[index]."""
    qualifier = 'msgstr '
    return readString(lines, index, qualifier)

def skipWhiteSpace(lines, index):
    """
    Returns the index of the first line at or after the specified index
    which does not consist of whitespace only. If no such line exists, the
    index just past the whitespace lines is returned.
    """
    lineCount = len(lines)
    while index < lineCount:
        if not lines[index].isspace():
            break
        index += 1
    return index

"""
These variables are used by the readEntry function in a spaghetti-like way.
They are meant to keep track of plural forms, ensuring that they are returned
sequentially (which would be difficult to do using only one index variable).
The variables are modified on subsequent readEntry() calls
"""
#The msgid_plural string remembered between readEntry() calls; None when no
#plural entry is being processed
pluralmsgid = None
#True while readEntry() is in the middle of returning the forms of a plural
#entry
pluralFormEntry = False
#Number n of the next 'msgstr[n] ' form that readEntry() will look for
pluralFormCount = 0

"""
Takes a list of strings and a line index as parameters.
Returns a quadruple containing the list of comments, msgid, msgstr found in the
strings, along with the line index after the msgstr terminates.

Subsequent calls of this method using the returned index will return
sequential msgid-msgstr pairs.

The returned strings have newlines and quotation marks removed except those
explicitly declared in the string, and do not contain the "msgid" or "msgstr"
declarations.

Plural forms of msgstr will be returned one after another on subsequent calls,
together with the plural msgid. The singular msgstr will be returned with
the singular msgid.
"""
def readEntry(lines, index):
    """
    Reads one msgid/msgstr pair from lines, starting at the specified index.

    Returns a tuple (comments, msgid, msgstr, index). For a pair that was
    read, index refers to the line after the pair and after any whitespace
    lines following it. The first three elements are all None in three
    cases: the end of lines was reached, a run of plural forms has just
    ended (index is then returned without reading a string), or neither
    comments nor strings could be read at the starting index (one line is
    then skipped and a message is printed).

    Plural entries are spread over several calls by means of the module-level
    variables pluralmsgid, pluralFormEntry and pluralFormCount: the first
    call returns the singular msgid with msgstr[0], each following call
    returns the plural msgid with the next msgstr[n].
    """

    global pluralmsgid, pluralFormEntry, pluralFormCount
    
    #Remembered to detect below whether anything at all was consumed
    startIndex = index
    (comments,index) = readComments(lines, index)
    
    if index >= len(lines):
        #Just return None if at end of file, parsing will stop
        return (None, None,None,index)

    if pluralFormEntry:
        #Plural forms!
        #Relevant msgid stored globally from earlier invocation
        msgid = pluralmsgid
    else:
        #As long as we're not working with plural forms, we want a new msgid
        (msgid, index) = readmsgid(lines, index)

    #Check for plural forms. Indexing is okay since file cannot end here
    #NOTE(review): lines[index] is not bounds-checked; if the input ends
    #right after a msgid this would presumably raise IndexError - confirm
    if lines[index].startswith('msgid_plural'):
        #Remember that we are now working with plural forms
        pluralFormEntry = True
        #The current msgid is the singular form
        #(singularmsgid is not read again within this function)
        singularmsgid = msgid
        #Read the plural version of msgid
        (plurString, index) = readString(lines, index, 'msgid_plural ')
        pluralmsgid = plurString #Store for subsequent invocations

        #Read the singular translation and compare to singular msgid
        (msgstr, index) = readString(lines, index, 'msgstr[0] ')
        #The next invocation will look for msgstr[1]
        pluralFormCount = 1
        #Now msgid and msgstr refer to singular forms, and we're done
        
    elif pluralFormEntry: #We are already working with plural forms
        #Make sure there are more plural forms
        if lines[index].startswith('msgstr['):
            msgid = pluralmsgid #stored from earlier
            #Only the form numbered pluralFormCount is accepted here; for any
            #other number readString leaves index unchanged
            (msgstr, index) = readString(lines, index, 'msgstr['+str(pluralFormCount)+'] ')
            pluralFormCount += 1
        else:
            #There are no more plural forms, so reset the variables
            pluralFormEntry = False
            pluralFormCount = 0
            pluralmsgid = None
            #Just return, the method will be invoked again and move on.
            return (None, None, None, index)
            
    else: #normal procedure - find msgstr
        (msgstr, index) = readmsgstr(lines, index)

    if startIndex == index:
        #No sensible strings were found, but we have to move on.
        #Skip to next line
        print '---Unsupported syntax, skipping line',index,'---'
        index += 1
        return (None, None, None, index)
    #Position the index at the start of the next entry
    index = skipWhiteSpace(lines, index)
    return (comments, msgid, msgstr, index)

"""
Compares the trailing characters of the two specified strings. Returns False
if an error is suspected, otherwise True.

An error is suspected unless all trailing non-alphabetic characters are
identical.
"""
def compareTrailingChars(msgid, msgstr):
    """
    Checks that msgid and msgstr end in a compatible way.

    The strings are walked backwards in parallel, at most as far as the
    shorter one reaches. Returns False as soon as two differing characters
    are met which are not both letters. Returns True once a letter has
    been met without such a mismatch, or when the walk ends.
    """
    sharedLength = min(len(msgid), len(msgstr))
    for offset in range(1, sharedLength + 1):
        idChar = msgid[-offset]
        strChar = msgstr[-offset]
        idAlpha = isalpha(idChar)
        strAlpha = isalpha(strChar)

        #Differing characters are acceptable only if both are letters
        if idChar != strChar and not (idAlpha and strAlpha):
            return False
        #The trailing non-alphabetic part ends at the first letter
        if idAlpha or strAlpha:
            return True
    return True

"""
Compares the hotkey designations of the two specified strings.
Returns a triple consisting of the two input strings with any underscores
removed, and a boolean which is False if an error is suspected, otherwise True.

An error is suspected if and only if differing numbers of underscores occur
in the specified strings.
"""
def checkHotkeys(msgid, msgstr):
    """
    Compares the number of hotkey markers (underscores) in the two strings.

    Returns a triple: msgid without underscores, msgstr without underscores,
    and a boolean which is True if both strings contained the same number
    of underscores.
    """
    sameCount = msgid.count('_') == msgstr.count('_')
    idStripped, strStripped = [text.replace('_', '')
                               for text in (msgid, msgstr)]
    return idStripped, strStripped, sameCount

"""
Takes a msgid string, then replaces single quotation marks with double
quotation marks such that the trailing/leading character analysis will
not mark different use of quotation marks as an error, unless the msgstr
uses single quotation marks.
"""
def hackQuotationMarkConversion(msgid):
    """
    Returns msgid with every single quotation mark replaced by a backslash
    followed by a double quotation mark.
    """
    escapedQuote = '\\"'
    return escapedQuote.join(msgid.split("'"))

def makeErrMsg(msg, index):
    """
    Returns the warning header for msg. The line number shown is the
    specified index minus one.
    """
    pieces = ['=== Line ', str(index - 1), ' : ', msg, ' ===']
    return ''.join(pieces)
    
"""
Parses the given list of strings for suspected errors. The strings are assumed
to be in po-file format, each string being one line, and each string being
terminated by a newline character.

Suspected errors will be written to standard output.
"""
def _percentage(part, total):
    """
    Returns part as a whole-number percentage of total, rounded down.
    Returns 0 if total is 0, so a file without strings can be summarized.
    """
    if total == 0:
        return 0
    #Explicit floor division gives an integer on every Python version
    return 100 * part // total


def parse(lines):
    """
    Checks all entries found in lines and prints suspected errors, followed
    by a summary, to standard output.

    Parameters:
        lines -- list of the lines of a .po file, each terminated by a
                 newline character

    The first entry is read but not checked or counted. For every other
    entry a warning is printed if the msgid is empty, the msgstr is empty,
    the entry is marked fuzzy, or the leading characters, trailing
    characters or number of hotkey underscores of msgid and msgstr disagree.
    The msgid 'translator-credits' is counted but never checked.

    All output uses print with one parenthesized string argument, which
    prints the same text whether print is a statement or a function.
    """
    maxIndex = len(lines)
    stringCount = 0
    #We're skipping the first one in a moment
    #However the official list of .po-files seems to do this as well, so we'll
    #just ignore the entire issue and set stringCount = 0
    warningCount = 0
    fuzzy = 0
    untranslated = 0

    #REMEMBER: make sure msgid_plural is not counted to conform with
    #l10n.gnome.org

    #The first entry contains metadata and should not be counted
    (comments, msgid, msgstr, index) = readEntry(lines, 0)

    #Now parse all the other entries until no more exist
    while index < maxIndex:
        (comments, msgid, msgstr, index) = readEntry(lines, index)

        if msgid is None or msgstr is None:
            continue #end of file or deliberately unsupported syntax

        stringCount += 1

        if msgid == '':
            print(makeErrMsg('Empty string for translation?', index))
            print('')
            warningCount += 1
            continue
        if msgstr == '':
            print(makeErrMsg('Untranslated string', index))
            print(msgid)
            print('')
            untranslated += 1
            warningCount += 1
            continue
        #Special string which we should ignore
        if msgid == 'translator-credits':
            continue

        #Make sure no warnings are issued just because of bad quotation mark use
        msgidConvQuotes = hackQuotationMarkConversion(msgid)

        #Check hotkey assignments, then remove hotkey chars to
        #make the strings parseable by the other functions
        (msgidNoKey, msgstrNoKey, hotkey) = checkHotkeys(msgidConvQuotes, msgstr)

        #Booleans indicating possible errors
        case = compareCase(msgidNoKey, msgstrNoKey)
        punc = compareTrailingChars(msgidNoKey, msgstrNoKey)

        #Print the current string if errors are found
        printStrings = False

        #Check whether string is fuzzy
        for comment in comments:
            if comment.startswith('#,') and 'fuzzy' in comment:
                fuzzy += 1
                print(makeErrMsg('Fuzzy string', index))
                printStrings = True

        if not case:
            printStrings = True
            print(makeErrMsg('Leading character type or case mismatch', index))
        if not punc:
            printStrings = True
            #msgid and msgstr are known to be non-empty at this point
            if (msgid[-1].isspace() ^ msgstr[-1].isspace()):
                print(makeErrMsg('Trailing whitespace inconsistency', index))
            else:
                print(makeErrMsg('Trailing characters or punctuation mismatch', index))
        if not hotkey:
            printStrings = True
            print(makeErrMsg('Hotkey assignment inconsistency', index))

        if printStrings:
            print('msgid "' + msgid + '"')
            print('msgstr "' + msgstr + '"')
            print('')
            warningCount += 1

    translated = stringCount - fuzzy - untranslated

    print('================ Summary ================')
    print('Total string count: ' + str(stringCount))
    #_percentage guards against division by zero when nothing was counted
    print('Fully translated string count: ' + str(translated)
          + ' (' + str(_percentage(translated, stringCount)) + '%)')
    if fuzzy > 0:
        print('Fuzzy string count: ' + str(fuzzy)
              + ' (' + str(_percentage(fuzzy, stringCount)) + '%)')
    if untranslated > 0:
        print('Untranslated string count: ' + str(untranslated)
              + ' (' + str(_percentage(untranslated, stringCount)) + '%)')
    print('Total warning count: ' + str(warningCount))
    print('=========================================')

        
"""
Parses the .po-file given by the first command-line parameter for syntax
errors, or prints help if parameters are malformed or omitted
"""
def main():
    """
    Command-line entry point.

    With exactly one argument, the file of that name is read and checked by
    parse(). With no arguments the help text is printed; with more than one
    argument a complaint is printed, followed by the help text.
    """
    argCount = len(sys.argv)
    if argCount == 2:
        poFile = open(sys.argv[1])
        #try/finally makes sure the file is closed even if reading fails,
        #and works on every Python version
        try:
            lines = poFile.readlines()
        finally:
            poFile.close()
        parse(lines)
    elif argCount == 1:
        print(DOCUMENTATION)
    elif argCount > 2:
        print('Received too many arguments')
        print(DOCUMENTATION)

#Run the checker only when this file is executed as a script, not when it is
#imported as a module
if __name__ == '__main__':
    main()