#   abbreviations.py
#      by Hugo Labrande
#   Version: 8-Jul-2021

# This script finds very good abbreviations for your Inform game, which helps when space is tight.
# It finds up to 20% more savings than Inform's "-u" switch ; this is about 4k on a 128k story file.
# As a bonus, it's just as fast as that switch when you look at abbreviations of length between 2 and 9.
# Use the Inform 6.36 version of the compiler for even more savings!

# Other programs doing this exist: https://intfiction.org/t/highly-optimized-abbreviations-computed-efficiently/48753
# Thank you to Henrik Asman and Matthew Russotto for the discussion and their programs. You'll find them at https://github.com/heasm66/ZAbbrevMaker and https://gitlab.com/russotto/zilabbrs

# Abbreviation strings still aren't an exact science!
#   Due to the fact that the number of 5-bit units has to be a multiple of 3 (and as such gets padded), we can only give estimates


# Input: gametext.txt file containing all the text in your game
#    If the flag -f is specified, the input should be the one of Inform 6.35's -r flag with $TRANSCRIPT_FORMAT=1
#    If no flag is specified, it should just be a game text, for instance Inform with -r (default format, $TRANSCRIPT_FORMAT=0), or
#                cat *.zap | grep -o '".*"' | sed 's/"\(.*\)"/\1/g' >gametext.txt
#         (don't forget to exclude the dictionary words though)
#         You then have to tweak how many lines at the beginning and the end you want to skip (see line 70ish of this program)
# Output: If no flag is specified, the abbreviations in Inform's format (use Inform 6.35 with MAX_ABBREVS = 96 and MAX_DYNAMIC_STRINGS = 0)
#         If you specify the -z flag, a file "mygame_freq.zap" will be created in the correct format, ready to use with ZILF/ZILCH.



# Please tweak these constants by hand; I should at some point get around to making these command-line arguments...
#
# File name
GAMETEXT_FILE_NAME = "gametext.txt"
# Verbosity level:
#    0 = nothing is printed except the abbreviations;
#    1 = the script updates you on its progress;
#    2 = the script tells you in excruciating detail what steps it's doing (how many candidates, what the score is, if they change after re-evaluating, etc)
VERBOSITY_LEVEL = 2
# This is used to discard abbreviations that (due to length and frequency) won't save many units
# A high value will speed up this algorithm but you might run out of candidates
MIN_SCORE = 15
#
# This indicates how many abbreviations should be conmputed
# Anything bigger than 64 but smaller than 96 is possible with Inform 6.35, using "MAX_ABBREVS=96; MAX_DYNAMIC_STRINGS=0;"
NUMBER_ABBR = 96
#
# Set a minimum and maximum length (in number of ascii/unicode characters) for abbreviations
# One-char strings can be 4 units long (for instance ";"), so you could in theory save 2 units per occurence; however at the date of writing, Inform refuses to abbreviate strings made of one ASCII characters.
MIN_LEN = 2
MAX_LEN = 60




# Deal with command-line arguments
#   -z = ZILF/ZILCH output
#   -f = new gametext format (Inform 6.35 and up with flag -r and $TRANSCRIPT_FORMAT=1)

ZAP_OUTPUT = 0
NEW_GAMETEXT_FORMAT = 0
import sys, getopt

# TODO allow the specification of the name of the file, the lines, etc - I'm being lazy for now
argv = sys.argv[1:]
try:
    opts, args = getopt.getopt(argv,"zf")
except getopt.GetoptError:
    print ('Usage: python3 abbreviations.py [-z] [-f]')
    sys.exit(2)
for opt, arg in opts:
    if opt == '-z':
        ZAP_OUTPUT = 1
    if opt == '-f':
        NEW_GAMETEXT_FORMAT = 1

# If you opted for the old gametext format, you might want to cut the first few lines (header and abbreviations) and the last few lines (dictionary words) for best results
FIRST_FEW_LINES = 0
LAST_FEW_LINES = 0
if (NEW_GAMETEXT_FORMAT == 0):
    # With I6, skip "6+n" lines, where n is the number of abbreviations you declare
    FIRST_FEW_LINES = 70  
    LAST_FEW_LINES = 259




## Processing starts here

# Helper function (weight of a zchar)
def zchar_weight(c):
    if (ord(c) == 32):
        return 1 ## space = char 0
    elif (ord(c) >= 97 and ord(c) <= 122):
        return 1 ## A0 alphabet
    elif (ord(c) >= 65 and ord(c) <= 90):
        return 2 ## A1 alphabet
    elif (c in "^0123456789.,!?_#'~/\-:()"):
        return 2 ## A2 alphabet
    else:
        return 4


f = open(GAMETEXT_FILE_NAME, "r", encoding='ISO-8859-1')
lines = f.readlines()
# filter out some stuff
if (NEW_GAMETEXT_FORMAT == 1):
    lines2 = []
    for i in range(0, len(lines)):
        cha = lines[i][0]
        # take only if not a comment, an abbreviation, a dictionary word, an attribute name, or a trace message
        if (cha not in "IADSX"):
            # remove the first three characters, the "tag"
            lines2 = lines2 + [lines[i][3:len(lines[i])]]
    lines = lines2
else:
    lines = lines[FIRST_FEW_LINES:len(lines)-LAST_FEW_LINES]

# keep an updated version of the abbreviated text
wholetext = ""
for i in range(0,len(lines)):
    wholetext += lines[i] + "\n"
L = len(wholetext)
wholetext_weight = 0
for i in range(0, len(wholetext)):
    wholetext_weight += zchar_weight(wholetext[i])

l = []
        

for n in range(MIN_LEN,MAX_LEN+1):
    dic = {}
    # each step takes around 1 second on my computer
    if (VERBOSITY_LEVEL > 0):
        print("   Counting frequencies... ("+str(n)+"/"+str(MAX_LEN)+")")
    for li in lines:
        for i in range(0, len(li)-n):
            s = li[i:i+n]
            if (s in dic.keys()):
                dic[s] = dic[s] + 1
            else:
                dic[s] = 1
    
    # First score: naive savings
    
    ## If you want to use the same formula as inform :
    ##     2*((abbrev_freqs[i]-1)*abbrev_quality[i])/3 with abbrev_quality = len -2
    ## A better one is actually counting the units.

    for p in dic.items():
        i = 0
        units = 0
        wd = p[0]
        while (i < len(wd)):
            letter = wd[i]
            if (ord(letter) == 32):
                units += 1 ## space = char 0
            elif (ord(letter) >= 97 and ord(letter) <= 122):
                units += 1 ## A0 alphabet
            elif (ord(letter) >= 65 and ord(letter) <= 90):
                units += 2 ## A1 alphabet
            elif (letter in "^0123456789.,!?_#'~/\-:()"):
                units += 2 ## A2 alphabet
            else:
                if (letter == '@'):
                    ## most likely an accented character like @:e : skip the next 2 letters
                    i+=2
                units += 4 
            i += 1
        ## number of occurences (-1 since you have to write the abbr somewhere) * units saved (units/2) = total units saved
        ## we compute the number of units saved
        ## score = int ((p[1]-1)* (units-2)/3 * 2)
        score = (p[1]-1)* (units-2)
        ## Only add things that save enough units (to speed up the search)
        ## Add something to say when we last updated the score
        if (score > MIN_SCORE):
            l += [[p[0], score, units, 0 ]]

# find the first abbreviation

abbr = []
if (VERBOSITY_LEVEL > 0):
    print("Determining abbreviation "+str(len(abbr)+1)+"...")
l = sorted(l, key=lambda x: x[1])

if ZAP_OUTPUT == 1:
    g = open("mygame_freq.zap", "w")




# given the abbreviation table, what is the optimal way to apply abbreviations for the text?
# this is given in R.A. Wagner , “Common phrases and minimum-space text storage”, Commun. ACM, 16 (3) (1973), pp. 148-152
# this gives us better estimates than just replacing in a text and seeing, because of overlap
def wagner_optimal_parse(text, abblist):
    # dynamic programming algorithm
    # f(n) = "least space needed to store the abbreviated form of the characters n...len(text)"
    N = len(text)
    f = [0 for i in range(0, N+1)]
    # you can compute f(n) from all the f(n+k); your goal is f(1)
    f[N] = 0
    I=N-1
    while(I >= 0):
        # compute f(I) = min( f(I+len(ab))+2, f(I+1)+1) for any matching ab
        val = f[I+1]+zchar_weight(text[I])
        for j in range(0, len(abblist)):
            myabb = abblist[j]
            if (I+len(myabb) <= len(text) and text[I:I+len(myabb)] == myabb):
                # the abbreviation j could be used here, but is it a good idea
                val2 = f[I+len(myabb)]+2
                if (val2 < val):
                    val = val2
        f[I] = val
        I = I-1
    #return the minimal text size with optimal parsing
    return f[0]
    

steps = 0
old_savings = 0
old_textsize = wholetext_weight

while (len(abbr) < NUMBER_ABBR and len(l) > 0):
    steps += 1
    # determine the leaders
    leading_score = l[len(l)-1][1]
    n=1
    while( l[len(l)-1-n][1] == leading_score ):
        n += 1
    if (VERBOSITY_LEVEL == 2):
        print("Found "+str(n)+" leaders with score "+str(leading_score))
    # see if they all have updated score
    flag = 1
    for i in range(1, n+1):
        if (l[len(l)-i][3] != len(abbr)):
            l[len(l)-i][3] = len(abbr)
            # compute how small the text representation would be with that abbreviation added to the set
            # store the difference "without this abbreviation minus with this abbreviation", both computed with optimal parse
            l[len(l)-i][1] = old_textsize-wagner_optimal_parse(wholetext, abbr+[l[len(l)-i][0]])
            flag = 0
    if ( flag == 0 ):
        l = sorted(l, key=lambda x: x[1])
        if (VERBOSITY_LEVEL == 2):
            print("The leaders weren't what we thought; let's sort again...")
    else:
        # at this point, we have a few candidates with equal actual score
        # Matthew Russotto's idea for a tiebreaker: take the high frequency
        #    one (meaning the one with the most weight)
        max = 1
        for i in range(2, n+1):
            if (l[len(l)-max][2] < l[len(l)-i][2]):
                max = i
        # we found our abbreviation
        winner = l[len(l)-max]
        if (VERBOSITY_LEVEL > 0):
            print("Found string : '"+str(winner[0])+"' (units saved: "+str(winner[1])+" units)")
        # the abbreviated text size is decreased by the savings
        old_textsize = old_textsize - winner[1]
        if ZAP_OUTPUT == 1:
            #g.write("        .FSTR FSTR?"+str(len(abbr)+1)+",\""+str(winner[0])+"\"        ; "+str(actual_freq)+"x, saved "+str((actual_freq-1)*(winner[2]-2))+"\n")
            g.write("        .FSTR FSTR?"+str(len(abbr)+1)+",\""+str(winner[0])+"\"        ; "+"\n")
        # the revised score is still better than the next in line's
        abbr = abbr + [str(winner[0])]
        if (VERBOSITY_LEVEL > 0):
            print("Determining abbreviation "+str(len(abbr)+1)+"...")
        # update the array
        lcopy = []
        for i in range(0, len(l)):
            # only add to the copy the strings not containing the abbreviated string
            if (str(winner[0]) not in str(l[i][0])):
                lcopy += [l[i]]
        l = lcopy
        if (VERBOSITY_LEVEL == 2):
            print ("Array updated; now has "+str(len(l))+" entries")
        # no need to sort; the order of the score hasn't changed
        
        
if ZAP_OUTPUT == 1:
    endoffile="WORDS::\n        FSTR?1\n        FSTR?2\n        FSTR?3\n        FSTR?4\n        FSTR?5\n        FSTR?6\n        FSTR?7\n        FSTR?8\n        FSTR?9\n        FSTR?10\n        FSTR?11\n        FSTR?12\n        FSTR?13\n        FSTR?14\n        FSTR?15\n        FSTR?16\n        FSTR?17\n        FSTR?18\n        FSTR?19\n        FSTR?20\n        FSTR?21\n        FSTR?22\n        FSTR?23\n        FSTR?24\n        FSTR?25\n        FSTR?26\n        FSTR?27\n        FSTR?28\n        FSTR?29\n        FSTR?30\n        FSTR?31\n        FSTR?32\n        FSTR?33\n        FSTR?34\n        FSTR?35\n        FSTR?36\n        FSTR?37\n        FSTR?38\n        FSTR?39\n        FSTR?40\n        FSTR?41\n        FSTR?42\n        FSTR?43\n        FSTR?44\n        FSTR?45\n        FSTR?46\n        FSTR?47\n        FSTR?48\n        FSTR?49\n        FSTR?50\n        FSTR?51\n        FSTR?52\n        FSTR?53\n        FSTR?54\n        FSTR?55\n        FSTR?56\n        FSTR?57\n        FSTR?58\n        FSTR?59\n        FSTR?60\n        FSTR?61\n        FSTR?62\n        FSTR?63\n        FSTR?64\n        FSTR?65\n        FSTR?66\n        FSTR?67\n        FSTR?68\n        FSTR?69\n        FSTR?70\n        FSTR?71\n        FSTR?72\n        FSTR?73\n        FSTR?74\n        FSTR?75\n        FSTR?76\n        FSTR?77\n        FSTR?78\n        FSTR?79\n        FSTR?80\n        FSTR?81\n        FSTR?82\n        FSTR?83\n        FSTR?84\n        FSTR?85\n        FSTR?86\n        FSTR?87\n        FSTR?88\n        FSTR?89\n        FSTR?90\n        FSTR?91\n        FSTR?92\n        FSTR?93\n        FSTR?94\n        FSTR?95\n        FSTR?96\n\n        .ENDI"
    g.write(endoffile)
    g.close()

final_savings = wholetext_weight - old_textsize
if (VERBOSITY_LEVEL > 0):
    print("Found "+str(NUMBER_ABBR)+" abbreviations in "+str(steps)+" steps; they saved "+str(final_savings)+" units (~"+str(2*int(final_savings/3))+" bytes)")

s = "Abbreviate "
for i in range(0,NUMBER_ABBR):
    s = s + '"' + abbr[i] +'" '
s += ";"
print(s.encode('ISO-8859-1').decode('UTF-8'))

f.close()