# abbreviations.py # by Hugo Labrande # Version: 8-Jul-2021 # This script finds very good abbreviations for your Inform game, which helps when space is tight. # It finds up to 20% more savings than Inform's "-u" switch ; this is about 4k on a 128k story file. # As a bonus, it's just as fast as that switch when you look at abbreviations of length between 2 and 9. # Use the Inform 6.36 version of the compiler for even more savings! # Other programs doing this exist: https://intfiction.org/t/highly-optimized-abbreviations-computed-efficiently/48753 # Thank you to Henrik Asman and Matthew Russotto for the discussion and their programs. You'll find them at https://github.com/heasm66/ZAbbrevMaker and https://gitlab.com/russotto/zilabbrs # Abbreviation strings still aren't an exact science! # Due to the fact that the number of 5-bit units has to be a multiple of 3 (and as such gets padded), we can only give estimates # Input: gametext.txt file containing all the text in your game # If the flag -f is specified, the input should be the one of Inform 6.35's -r flag with $TRANSCRIPT_FORMAT=1 # If no flag is specified, it should just be a game text, for instance Inform with -r (default format, $TRANSCRIPT_FORMAT=0), or # cat *.zap | grep -o '".*"' | sed 's/"\(.*\)"/\1/g' >gametext.txt # (don't forget to exclude the dictionary words though) # You then have to tweak how many lines at the beginning and the end you want to skip (see line 70ish of this program) # Output: If no flag is specified, the abbreviations in Inform's format (use Inform 6.35 with MAX_ABBREVS = 96 and MAX_DYNAMIC_STRINGS = 0) # If you specify the -z flag, a file "mygame_freq.zap" will be created in the correct format, ready to use with ZILF/ZILCH. # Please tweak these constants by hand; I should at some point get around to making these command-line arguments... # # File name GAMETEXT_FILE_NAME = "gametext.txt" # Verbosity level: # 0 = nothing is printed except the abbreviations; # 1 = the script updates you on its progress; # 2 = the script tells you in excruciating detail what steps it's doing (how many candidates, what the score is, if they change after re-evaluating, etc) VERBOSITY_LEVEL = 2 # This is used to discard abbreviations that (due to length and frequency) won't save many units # A high value will speed up this algorithm but you might run out of candidates MIN_SCORE = 15 # # This indicates how many abbreviations should be conmputed # Anything bigger than 64 but smaller than 96 is possible with Inform 6.35, using "MAX_ABBREVS=96; MAX_DYNAMIC_STRINGS=0;" NUMBER_ABBR = 96 # # Set a minimum and maximum length (in number of ascii/unicode characters) for abbreviations # One-char strings can be 4 units long (for instance ";"), so you could in theory save 2 units per occurence; however at the date of writing, Inform refuses to abbreviate strings made of one ASCII characters. MIN_LEN = 2 MAX_LEN = 60 # Deal with command-line arguments # -z = ZILF/ZILCH output # -f = new gametext format (Inform 6.35 and up with flag -r and $TRANSCRIPT_FORMAT=1) ZAP_OUTPUT = 0 NEW_GAMETEXT_FORMAT = 0 import sys, getopt # TODO allow the specification of the name of the file, the lines, etc - I'm being lazy for now argv = sys.argv[1:] try: opts, args = getopt.getopt(argv,"zf") except getopt.GetoptError: print ('Usage: python3 abbreviations.py [-z] [-f]') sys.exit(2) for opt, arg in opts: if opt == '-z': ZAP_OUTPUT = 1 if opt == '-f': NEW_GAMETEXT_FORMAT = 1 # If you opted for the old gametext format, you might want to cut the first few lines (header and abbreviations) and the last few lines (dictionary words) for best results FIRST_FEW_LINES = 0 LAST_FEW_LINES = 0 if (NEW_GAMETEXT_FORMAT == 0): # With I6, skip "6+n" lines, where n is the number of abbreviations you declare FIRST_FEW_LINES = 70 LAST_FEW_LINES = 259 ## Processing starts here # Helper function (weight of a zchar) def zchar_weight(c): if (ord(c) == 32): return 1 ## space = char 0 elif (ord(c) >= 97 and ord(c) <= 122): return 1 ## A0 alphabet elif (ord(c) >= 65 and ord(c) <= 90): return 2 ## A1 alphabet elif (c in "^0123456789.,!?_#'~/\-:()"): return 2 ## A2 alphabet else: return 4 f = open(GAMETEXT_FILE_NAME, "r", encoding='ISO-8859-1') lines = f.readlines() # filter out some stuff if (NEW_GAMETEXT_FORMAT == 1): lines2 = [] for i in range(0, len(lines)): cha = lines[i][0] # take only if not a comment, an abbreviation, a dictionary word, an attribute name, or a trace message if (cha not in "IADSX"): # remove the first three characters, the "tag" lines2 = lines2 + [lines[i][3:len(lines[i])]] lines = lines2 else: lines = lines[FIRST_FEW_LINES:len(lines)-LAST_FEW_LINES] # keep an updated version of the abbreviated text wholetext = "" for i in range(0,len(lines)): wholetext += lines[i] + "\n" L = len(wholetext) wholetext_weight = 0 for i in range(0, len(wholetext)): wholetext_weight += zchar_weight(wholetext[i]) l = [] for n in range(MIN_LEN,MAX_LEN+1): dic = {} # each step takes around 1 second on my computer if (VERBOSITY_LEVEL > 0): print(" Counting frequencies... ("+str(n)+"/"+str(MAX_LEN)+")") for li in lines: for i in range(0, len(li)-n): s = li[i:i+n] if (s in dic.keys()): dic[s] = dic[s] + 1 else: dic[s] = 1 # First score: naive savings ## If you want to use the same formula as inform : ## 2*((abbrev_freqs[i]-1)*abbrev_quality[i])/3 with abbrev_quality = len -2 ## A better one is actually counting the units. for p in dic.items(): i = 0 units = 0 wd = p[0] while (i < len(wd)): letter = wd[i] if (ord(letter) == 32): units += 1 ## space = char 0 elif (ord(letter) >= 97 and ord(letter) <= 122): units += 1 ## A0 alphabet elif (ord(letter) >= 65 and ord(letter) <= 90): units += 2 ## A1 alphabet elif (letter in "^0123456789.,!?_#'~/\-:()"): units += 2 ## A2 alphabet else: if (letter == '@'): ## most likely an accented character like @:e : skip the next 2 letters i+=2 units += 4 i += 1 ## number of occurences (-1 since you have to write the abbr somewhere) * units saved (units/2) = total units saved ## we compute the number of units saved ## score = int ((p[1]-1)* (units-2)/3 * 2) score = (p[1]-1)* (units-2) ## Only add things that save enough units (to speed up the search) ## Add something to say when we last updated the score if (score > MIN_SCORE): l += [[p[0], score, units, 0 ]] # find the first abbreviation abbr = [] if (VERBOSITY_LEVEL > 0): print("Determining abbreviation "+str(len(abbr)+1)+"...") l = sorted(l, key=lambda x: x[1]) if ZAP_OUTPUT == 1: g = open("mygame_freq.zap", "w") # given the abbreviation table, what is the optimal way to apply abbreviations for the text? # this is given in R.A. Wagner , “Common phrases and minimum-space text storage”, Commun. ACM, 16 (3) (1973), pp. 148-152 # this gives us better estimates than just replacing in a text and seeing, because of overlap def wagner_optimal_parse(text, abblist): # dynamic programming algorithm # f(n) = "least space needed to store the abbreviated form of the characters n...len(text)" N = len(text) f = [0 for i in range(0, N+1)] # you can compute f(n) from all the f(n+k); your goal is f(1) f[N] = 0 I=N-1 while(I >= 0): # compute f(I) = min( f(I+len(ab))+2, f(I+1)+1) for any matching ab val = f[I+1]+zchar_weight(text[I]) for j in range(0, len(abblist)): myabb = abblist[j] if (I+len(myabb) <= len(text) and text[I:I+len(myabb)] == myabb): # the abbreviation j could be used here, but is it a good idea val2 = f[I+len(myabb)]+2 if (val2 < val): val = val2 f[I] = val I = I-1 #return the minimal text size with optimal parsing return f[0] steps = 0 old_savings = 0 old_textsize = wholetext_weight while (len(abbr) < NUMBER_ABBR and len(l) > 0): steps += 1 # determine the leaders leading_score = l[len(l)-1][1] n=1 while( l[len(l)-1-n][1] == leading_score ): n += 1 if (VERBOSITY_LEVEL == 2): print("Found "+str(n)+" leaders with score "+str(leading_score)) # see if they all have updated score flag = 1 for i in range(1, n+1): if (l[len(l)-i][3] != len(abbr)): l[len(l)-i][3] = len(abbr) # compute how small the text representation would be with that abbreviation added to the set # store the difference "without this abbreviation minus with this abbreviation", both computed with optimal parse l[len(l)-i][1] = old_textsize-wagner_optimal_parse(wholetext, abbr+[l[len(l)-i][0]]) flag = 0 if ( flag == 0 ): l = sorted(l, key=lambda x: x[1]) if (VERBOSITY_LEVEL == 2): print("The leaders weren't what we thought; let's sort again...") else: # at this point, we have a few candidates with equal actual score # Matthew Russotto's idea for a tiebreaker: take the high frequency # one (meaning the one with the most weight) max = 1 for i in range(2, n+1): if (l[len(l)-max][2] < l[len(l)-i][2]): max = i # we found our abbreviation winner = l[len(l)-max] if (VERBOSITY_LEVEL > 0): print("Found string : '"+str(winner[0])+"' (units saved: "+str(winner[1])+" units)") # the abbreviated text size is decreased by the savings old_textsize = old_textsize - winner[1] if ZAP_OUTPUT == 1: #g.write(" .FSTR FSTR?"+str(len(abbr)+1)+",\""+str(winner[0])+"\" ; "+str(actual_freq)+"x, saved "+str((actual_freq-1)*(winner[2]-2))+"\n") g.write(" .FSTR FSTR?"+str(len(abbr)+1)+",\""+str(winner[0])+"\" ; "+"\n") # the revised score is still better than the next in line's abbr = abbr + [str(winner[0])] if (VERBOSITY_LEVEL > 0): print("Determining abbreviation "+str(len(abbr)+1)+"...") # update the array lcopy = [] for i in range(0, len(l)): # only add to the copy the strings not containing the abbreviated string if (str(winner[0]) not in str(l[i][0])): lcopy += [l[i]] l = lcopy if (VERBOSITY_LEVEL == 2): print ("Array updated; now has "+str(len(l))+" entries") # no need to sort; the order of the score hasn't changed if ZAP_OUTPUT == 1: endoffile="WORDS::\n FSTR?1\n FSTR?2\n FSTR?3\n FSTR?4\n FSTR?5\n FSTR?6\n FSTR?7\n FSTR?8\n FSTR?9\n FSTR?10\n FSTR?11\n FSTR?12\n FSTR?13\n FSTR?14\n FSTR?15\n FSTR?16\n FSTR?17\n FSTR?18\n FSTR?19\n FSTR?20\n FSTR?21\n FSTR?22\n FSTR?23\n FSTR?24\n FSTR?25\n FSTR?26\n FSTR?27\n FSTR?28\n FSTR?29\n FSTR?30\n FSTR?31\n FSTR?32\n FSTR?33\n FSTR?34\n FSTR?35\n FSTR?36\n FSTR?37\n FSTR?38\n FSTR?39\n FSTR?40\n FSTR?41\n FSTR?42\n FSTR?43\n FSTR?44\n FSTR?45\n FSTR?46\n FSTR?47\n FSTR?48\n FSTR?49\n FSTR?50\n FSTR?51\n FSTR?52\n FSTR?53\n FSTR?54\n FSTR?55\n FSTR?56\n FSTR?57\n FSTR?58\n FSTR?59\n FSTR?60\n FSTR?61\n FSTR?62\n FSTR?63\n FSTR?64\n FSTR?65\n FSTR?66\n FSTR?67\n FSTR?68\n FSTR?69\n FSTR?70\n FSTR?71\n FSTR?72\n FSTR?73\n FSTR?74\n FSTR?75\n FSTR?76\n FSTR?77\n FSTR?78\n FSTR?79\n FSTR?80\n FSTR?81\n FSTR?82\n FSTR?83\n FSTR?84\n FSTR?85\n FSTR?86\n FSTR?87\n FSTR?88\n FSTR?89\n FSTR?90\n FSTR?91\n FSTR?92\n FSTR?93\n FSTR?94\n FSTR?95\n FSTR?96\n\n .ENDI" g.write(endoffile) g.close() final_savings = wholetext_weight - old_textsize if (VERBOSITY_LEVEL > 0): print("Found "+str(NUMBER_ABBR)+" abbreviations in "+str(steps)+" steps; they saved "+str(final_savings)+" units (~"+str(2*int(final_savings/3))+" bytes)") s = "Abbreviate " for i in range(0,NUMBER_ABBR): s = s + '"' + abbr[i] +'" ' s += ";" print(s.encode('ISO-8859-1').decode('UTF-8')) f.close()