src/share/zoneinfo/zishrink.awk - annotate

Return to zishrink.awk CVS log
Up to [local] / src / share / zoneinfo
Annotation of src/share/zoneinfo/zishrink.awk, Revision 1.2

1.1       millert     1: # Convert tzdata source into a smaller version of itself.
                      2:
                      3: # Contributed by Paul Eggert.  This file is in the public domain.
                      4:
                      5: # This is not a general-purpose converter; it is designed for current tzdata.
                      6: # 'zic' should treat this script's output as if it were identical to
                      7: # this script's input.
                      8:
                      9: # Record a hash N for the new name NAME, checking for collisions.
                     10:
                     11: function record_hash(n, name)
                     12: {
                     13:   if (used_hashes[n]) {
                     14:     printf "# ! collision: %s %s\n", used_hashes[n], name
                     15:     exit 1
                     16:   }
                     17:   used_hashes[n] = name
                     18: }
                     19:
                     20: # Return a shortened rule name representing NAME,
                     21: # and record this relationship to the hash table.
                     22:
                     23: function gen_rule_name(name, \
                     24:                       n)
                     25: {
                     26:   # Use a simple mnemonic: the first two letters.
                     27:   n = substr(name, 1, 2)
                     28:   record_hash(n, name)
                     29:   # printf "# %s = %s\n", n, name
                     30:   return n
                     31: }
                     32:
                     33: function prehash_rule_names( \
                     34:                            name)
                     35: {
                     36:   # Rule names are not part of the tzdb API, so substitute shorter
                     37:   # ones.  Shortening them consistently from one release to the next
                     38:   # simplifies comparison of the output.  That being said, the
                     39:   # 1-letter names below are not standardized in any way, and can
                     40:   # change arbitrarily from one release to the next, as the main goal
                     41:   # here is compression not comparison.
                     42:
                     43:   # Abbreviating these rules names to one letter saved the most space
                     44:   # circa 2018e.
                     45:   rule["Arg"] = "A"
                     46:   rule["Brazil"] = "B"
                     47:   rule["Canada"] = "C"
                     48:   rule["Denmark"] = "D"
                     49:   rule["EU"] = "E"
                     50:   rule["France"] = "F"
                     51:   rule["GB-Eire"] = "G"
                     52:   rule["Halifax"] = "H"
                     53:   rule["Italy"] = "I"
                     54:   rule["Jordan"] = "J"
                     55:   rule["Egypt"] = "K" # "Kemet" in ancient Egyptian
                     56:   rule["Libya"] = "L"
                     57:   rule["Morocco"] = "M"
                     58:   rule["Neth"] = "N"
                     59:   rule["Poland"] = "O" # arbitrary
                     60:   rule["Palestine"] = "P"
                     61:   rule["Cuba"] = "Q" # Its start sounds like "Q".
                     62:   rule["Russia"] = "R"
                     63:   rule["Syria"] = "S"
                     64:   rule["Turkey"] = "T"
                     65:   rule["Uruguay"] = "U"
                     66:   rule["Vincennes"] = "V"
                     67:   rule["Winn"] = "W"
                     68:   rule["Mongol"] = "X" # arbitrary
                     69:   rule["NT_YK"] = "Y"
                     70:   rule["Zion"] = "Z"
                     71:   rule["Austria"] = "a"
                     72:   rule["Belgium"] = "b"
                     73:   rule["C-Eur"] = "c"
                     74:   rule["Algeria"] = "d" # country code DZ
                     75:   rule["E-Eur"] = "e"
                     76:   rule["Taiwan"] = "f" # Formosa
                     77:   rule["Greece"] = "g"
                     78:   rule["Hungary"] = "h"
                     79:   rule["Iran"] = "i"
                     80:   rule["StJohns"] = "j"
                     81:   rule["Chatham"] = "k" # arbitrary
                     82:   rule["Lebanon"] = "l"
                     83:   rule["Mexico"] = "m"
                     84:   rule["Tunisia"] = "n" # country code TN
                     85:   rule["Moncton"] = "o" # arbitrary
                     86:   rule["Port"] = "p"
                     87:   rule["Albania"] = "q" # arbitrary
                     88:   rule["Regina"] = "r"
                     89:   rule["Spain"] = "s"
                     90:   rule["Toronto"] = "t"
                     91:   rule["US"] = "u"
                     92:   rule["Louisville"] = "v" # ville
                     93:   rule["Iceland"] = "w" # arbitrary
                     94:   rule["Chile"] = "x" # arbitrary
                     95:   rule["Para"] = "y" # country code PY
                     96:   rule["Romania"] = "z" # arbitrary
                     97:   rule["Macau"] = "_" # arbitrary
                     98:
                     99:   # Use ISO 3166 alpha-2 country codes for remaining names that are countries.
                    100:   # This is more systematic, and avoids collisions (e.g., Malta and Moldova).
                    101:   rule["Armenia"] = "AM"
                    102:   rule["Aus"] = "AU"
                    103:   rule["Azer"] = "AZ"
                    104:   rule["Barb"] = "BB"
                    105:   rule["Dhaka"] = "BD"
                    106:   rule["Bulg"] = "BG"
                    107:   rule["Bahamas"] = "BS"
                    108:   rule["Belize"] = "BZ"
                    109:   rule["Swiss"] = "CH"
                    110:   rule["Cook"] = "CK"
                    111:   rule["PRC"] = "CN"
                    112:   rule["Cyprus"] = "CY"
                    113:   rule["Czech"] = "CZ"
                    114:   rule["Germany"] = "DE"
                    115:   rule["DR"] = "DO"
                    116:   rule["Ecuador"] = "EC"
                    117:   rule["Finland"] = "FI"
                    118:   rule["Fiji"] = "FJ"
                    119:   rule["Falk"] = "FK"
                    120:   rule["Ghana"] = "GH"
                    121:   rule["Guat"] = "GT"
                    122:   rule["Hond"] = "HN"
                    123:   rule["Haiti"] = "HT"
                    124:   rule["Eire"] = "IE"
                    125:   rule["Iraq"] = "IQ"
                    126:   rule["Japan"] = "JP"
                    127:   rule["Kyrgyz"] = "KG"
                    128:   rule["ROK"] = "KR"
                    129:   rule["Latvia"] = "LV"
                    130:   rule["Lux"] = "LX"
                    131:   rule["Moldova"] = "MD"
                    132:   rule["Malta"] = "MT"
                    133:   rule["Mauritius"] = "MU"
                    134:   rule["Namibia"] = "NA"
                    135:   rule["Nic"] = "NI"
                    136:   rule["Norway"] = "NO"
                    137:   rule["Peru"] = "PE"
                    138:   rule["Phil"] = "PH"
                    139:   rule["Pakistan"] = "PK"
                    140:   rule["Sudan"] = "SD"
                    141:   rule["Salv"] = "SV"
                    142:   rule["Tonga"] = "TO"
                    143:   rule["Vanuatu"] = "VU"
                    144:
                    145:   # Avoid collisions.
                    146:   rule["Detroit"] = "Dt" # De = Denver
                    147:
                    148:   for (name in rule) {
                    149:     record_hash(rule[name], name)
                    150:   }
                    151: }
                    152:
                    153: function make_line(n, field, \
                    154:                   f, r)
                    155: {
                    156:   r = field[1]
                    157:   for (f = 2; f <= n; f++)
                    158:     r = r " " field[f]
                    159:   return r
                    160: }
                    161:
                    162: # Process the input line LINE and save it for later output.
                    163:
                    164: function process_input_line(line, \
1.2     ! millert   165:                            f, field, end, n, outline, r, \
1.1       millert   166:                            linkline, ruleline, zoneline)
                    167: {
                    168:   # Remove comments, normalize spaces, and append a space to each line.
                    169:   sub(/#.*/, "", line)
                    170:   line = line " "
                    171:   gsub(/[\t ]+/, " ", line)
                    172:
                    173:   # Abbreviate keywords and determine line type.
                    174:   linkline = sub(/^Link /, "L ", line)
                    175:   ruleline = sub(/^Rule /, "R ", line)
                    176:   zoneline = sub(/^Zone /, "Z ", line)
                    177:
                    178:   # Replace FooAsia rules with the same rules without "Asia", as they
                    179:   # are duplicates.
                    180:   if (match(line, /[^ ]Asia /)) {
                    181:     if (ruleline) return
                    182:     line = substr(line, 1, RSTART) substr(line, RSTART + 5)
                    183:   }
                    184:
                    185:   # Abbreviate times.
                    186:   while (match(line, /[: ]0+[0-9]/))
                    187:     line = substr(line, 1, RSTART) substr(line, RSTART + RLENGTH - 1)
                    188:   while (match(line, /:0[^:]/))
                    189:     line = substr(line, 1, RSTART - 1) substr(line, RSTART + 2)
                    190:
                    191:   # Abbreviate weekday names.
                    192:   while (match(line, / (last)?(Mon|Wed|Fri)[ <>]/)) {
                    193:     end = RSTART + RLENGTH
                    194:     line = substr(line, 1, end - 4) substr(line, end - 1)
                    195:   }
                    196:   while (match(line, / (last)?(Sun|Tue|Thu|Sat)[ <>]/)) {
                    197:     end = RSTART + RLENGTH
                    198:     line = substr(line, 1, end - 3) substr(line, end - 1)
                    199:   }
                    200:
                    201:   # Abbreviate "max", "min", "only" and month names.
1.2     ! millert   202:   # Although "max" and "min" can both be abbreviated to just "m",
        !           203:   # the longer forms "ma" and "mi" are needed with zic 2023d and earlier.
        !           204:   gsub(/ max /, dataform == "vanguard" ? " m " : " ma ", line)
        !           205:   gsub(/ min /, dataform == "vanguard" ? " m " : " mi ", line)
1.1       millert   206:   gsub(/ only /, " o ", line)
                    207:   gsub(/ Jan /, " Ja ", line)
                    208:   gsub(/ Feb /, " F ", line)
                    209:   gsub(/ Apr /, " Ap ", line)
                    210:   gsub(/ Aug /, " Au ", line)
                    211:   gsub(/ Sep /, " S ", line)
                    212:   gsub(/ Oct /, " O ", line)
                    213:   gsub(/ Nov /, " N ", line)
                    214:   gsub(/ Dec /, " D ", line)
                    215:
                    216:   # Strip leading and trailing space.
                    217:   sub(/^ /, "", line)
                    218:   sub(/ $/, "", line)
                    219:
                    220:   # Remove unnecessary trailing zero fields.
                    221:   sub(/ 0+$/, "", line)
                    222:
                    223:   # Remove unnecessary trailing days-of-month "1".
                    224:   if (match(line, /[A-Za-z] 1$/))
                    225:     line = substr(line, 1, RSTART)
                    226:
                    227:   # Remove unnecessary trailing " Ja" (for January).
                    228:   sub(/ Ja$/, "", line)
                    229:
                    230:   n = split(line, field)
                    231:
                    232:   # Record which rule names are used, and generate their abbreviations.
                    233:   f = zoneline ? 4 : linkline || ruleline ? 0 : 2
                    234:   r = field[f]
                    235:   if (r ~ /^[^-+0-9]/) {
                    236:     rule_used[r] = 1
                    237:   }
                    238:
                    239:   if (zoneline)
                    240:     zonename = startdef = field[2]
                    241:   else if (linkline)
                    242:     zonename = startdef = field[3]
                    243:   else if (ruleline)
                    244:     zonename = ""
                    245:
1.2     ! millert   246:   # Save the information for later output.
        !           247:   outline = make_line(n, field)
        !           248:   if (ruleline)
        !           249:     rule_output_line[nrule_out++] = outline
        !           250:   else if (linkline) {
        !           251:     # In vanguard format with Gawk, links are output sorted by destination.
        !           252:     if (dataform == "vanguard" && PROCINFO["version"])
        !           253:       linkdef[zonename] = field[2]
        !           254:     else
        !           255:       link_output_line[nlink_out++] = outline
        !           256:   }else
        !           257:     zonedef[zonename] = (zoneline ? "" : zonedef[zonename] "\n") outline
1.1       millert   258: }
                    259:
                    260: function omit_unused_rules( \
                    261:                           i, field)
                    262: {
1.2     ! millert   263:   for (i = 0; i < nrule_out; i++) {
        !           264:     split(rule_output_line[i], field)
        !           265:     if (!rule_used[field[2]])
        !           266:       rule_output_line[i] = ""
1.1       millert   267:   }
                    268: }
                    269:
                    270: function abbreviate_rule_names( \
1.2     ! millert   271:                               abbr, f, field, i, n, newdef, newline, r, \
        !           272:                               zoneline, zonelines, zonename)
1.1       millert   273: {
1.2     ! millert   274:   for (i = 0; i < nrule_out; i++) {
        !           275:     n = split(rule_output_line[i], field)
1.1       millert   276:     if (n) {
1.2     ! millert   277:       r = field[2]
1.1       millert   278:       if (r ~ /^[^-+0-9]/) {
                    279:        abbr = rule[r]
                    280:        if (!abbr) {
                    281:          rule[r] = abbr = gen_rule_name(r)
                    282:        }
1.2     ! millert   283:        field[2] = abbr
        !           284:        rule_output_line[i] = make_line(n, field)
1.1       millert   285:       }
                    286:     }
                    287:   }
1.2     ! millert   288:   for (zonename in zonedef) {
        !           289:     zonelines = split(zonedef[zonename], zoneline, /\n/)
        !           290:     newdef = ""
        !           291:     for (i = 1; i <= zonelines; i++) {
        !           292:       newline = zoneline[i]
        !           293:       n = split(newline, field)
        !           294:       f = i == 1 ? 4 : 2
        !           295:       r = rule[field[f]]
        !           296:       if (r) {
        !           297:        field[f] = r
        !           298:        newline = make_line(n, field)
        !           299:       }
        !           300:       newdef = (newdef ? newdef "\n" : "") newline
        !           301:     }
        !           302:     zonedef[zonename] = newdef
        !           303:   }
1.1       millert   304: }
                    305:
                    306: function output_saved_lines( \
1.2     ! millert   307:                            i, zonename)
1.1       millert   308: {
1.2     ! millert   309:   for (i = 0; i < nrule_out; i++)
        !           310:     if (rule_output_line[i])
        !           311:       print rule_output_line[i]
        !           312:
        !           313:   # When using gawk, output zones sorted by name.
        !           314:   # This makes the output a bit more compressible.
        !           315:   PROCINFO["sorted_in"] = "@ind_str_asc"
        !           316:   for (zonename in zonedef)
        !           317:     print zonedef[zonename]
        !           318:
        !           319:   if (nlink_out)
        !           320:     for (i = 0; i < nlink_out; i++)
        !           321:       print link_output_line[i]
        !           322:   else {
        !           323:     # When using gawk, output links sorted by destination.
        !           324:     # This also helps compressibility a bit.
        !           325:     PROCINFO["sorted_in"] = "@val_type_asc"
        !           326:     for (zonename in linkdef)
        !           327:       printf "L %s %s\n", linkdef[zonename], zonename
        !           328:   }
1.1       millert   329: }
                    330:
                    331: BEGIN {
                    332:   # Files that the output normally depends on.
                    333:   default_dep["africa"] = 1
                    334:   default_dep["antarctica"] = 1
                    335:   default_dep["asia"] = 1
                    336:   default_dep["australasia"] = 1
                    337:   default_dep["backward"] = 1
                    338:   default_dep["etcetera"] = 1
                    339:   default_dep["europe"] = 1
                    340:   default_dep["factory"] = 1
                    341:   default_dep["northamerica"] = 1
                    342:   default_dep["southamerica"] = 1
                    343:   default_dep["ziguard.awk"] = 1
                    344:   default_dep["zishrink.awk"] = 1
                    345:
                    346:   # Output a version string from 'version' and related configuration variables
                    347:   # supported by tzdb's Makefile.  If you change the makefile or any other files
                    348:   # that affect the output of this script, you should append '-SOMETHING'
                    349:   # to the contents of 'version', where SOMETHING identifies what was changed.
                    350:
                    351:   ndeps = split(deps, dep)
                    352:   ddeps = ""
                    353:   for (i = 1; i <= ndeps; i++) {
                    354:     if (default_dep[dep[i]]) {
                    355:       default_dep[dep[i]]++
                    356:     } else {
                    357:       ddeps = ddeps " " dep[i]
                    358:     }
                    359:   }
                    360:   for (d in default_dep) {
                    361:     if (default_dep[d] == 1) {
                    362:       ddeps = ddeps " !" d
                    363:     }
                    364:   }
                    365:   print "# version", version
                    366:   if (dataform != "main") {
                    367:     print "# dataform", dataform
                    368:   }
                    369:   if (redo != "posix_right") {
                    370:     print "# redo " redo
                    371:   }
                    372:   if (ddeps) {
                    373:     print "# ddeps" ddeps
                    374:   }
                    375:   print "# This zic input file is in the public domain."
                    376:
                    377:   prehash_rule_names()
                    378: }
                    379:
                    380: /^[\t ]*[^#\t ]/ {
                    381:   process_input_line($0)
                    382: }
                    383:
                    384: END {
                    385:   omit_unused_rules()
                    386:   abbreviate_rule_names()
                    387:   output_saved_lines()
                    388: }