[BACK]Return to charset.c CVS log [TXT][DIR] Up to [local] / src / usr.bin / less

Annotation of src/usr.bin/less/charset.c, Revision 1.6

1.1       etheisen    1: /*
1.5       millert     2:  * Copyright (C) 1984-2002  Mark Nudelman
1.1       etheisen    3:  *
1.5       millert     4:  * You may distribute under the terms of either the GNU General Public
                      5:  * License or the Less License, as specified in the README file.
1.1       etheisen    6:  *
1.5       millert     7:  * For more information about less, or for information on how to
                      8:  * contact the author, see the README file.
1.1       etheisen    9:  */
                     10:
                     11:
                     12: /*
                     13:  * Functions to define the character set
                     14:  * and do things specific to the character set.
                     15:  */
                     16:
                     17: #include "less.h"
                     18: #if HAVE_LOCALE
                     19: #include <locale.h>
                     20: #include <ctype.h>
                     21: #endif
                     22:
/*
 * Flag for the "utf-8" charset: the "utf-8" entry of charsets[] points
 * its p_flag at this variable, and icharset() stores 1 through p_flag
 * when that entry is selected.  It starts out as 0.
 */
1.5       millert    23: public int utf_mode = 0;
                     24:
1.6     ! millert    25: #if !SMALL
1.1       etheisen   26: /*
                     27:  * Predefined character sets,
                     28:  * selected by the LESSCHARSET environment variable.
                     29:  */
                     30: struct charset {
                     31:        char *name;
1.5       millert    32:        int *p_flag;
1.1       etheisen   33:        char *desc;
                     34: } charsets[] = {
1.5       millert    35:        { "ascii",      NULL,       "8bcccbcc18b95.b" },
                     36:        { "dos",        NULL,       "8bcccbcc12bc5b223.b" },
                     37:        { "ebcdic",     NULL,       "5bc6bcc7bcc41b.9b7.9b5.b..8b6.10b6.b9.7b9.8b8.17b3.3b9.7b9.8b8.6b10.b.b.b." },
                     38:        { "IBM-1047",   NULL,       "4cbcbc3b9cbccbccbb4c6bcc5b3cbbc4bc4bccbc191.b" },
                     39:        { "iso8859",    NULL,       "8bcccbcc18b95.33b." },
                     40:        { "koi8-r",     NULL,       "8bcccbcc18b95.b128." },
                     41:        { "next",       NULL,       "8bcccbcc18b95.bb125.bb" },
                     42:        { "utf-8",      &utf_mode,  "8bcccbcc18b." },
                     43:        { NULL, NULL, NULL }
                     44: };
                     45:
                     46: struct cs_alias {
                     47:        char *name;
                     48:        char *oname;
                     49: } cs_aliases[] = {
                     50:        { "latin1",     "iso8859" },
                     51:        { "latin9",     "iso8859" },
                     52:        { NULL, NULL }
1.1       etheisen   53: };
                     54:
                     55: #define        IS_BINARY_CHAR  01
                     56: #define        IS_CONTROL_CHAR 02
                     57:
                     58: static char chardef[256];
                     59: static char *binfmt = NULL;
                     60: public int binattr = AT_STANDOUT;
                     61:
                     62:
                     63: /*
                     64:  * Define a charset, given a description string.
                     65:  * The string consists of 256 letters,
                     66:  * one for each character in the charset.
                     67:  * If the string is shorter than 256 letters, missing letters
                     68:  * are taken to be identical to the last one.
                     69:  * A decimal number followed by a letter is taken to be a
                     70:  * repetition of the letter.
                     71:  *
                     72:  * Each letter is one of:
                     73:  *     . normal character
                     74:  *     b binary character
                     75:  *     c control character
                     76:  */
                     77:        static void
                     78: ichardef(s)
                     79:        char *s;
                     80: {
1.5       millert    81:        register char *cp;
                     82:        register int n;
                     83:        register char v;
1.1       etheisen   84:
                     85:        n = 0;
                     86:        v = 0;
                     87:        cp = chardef;
                     88:        while (*s != '\0')
                     89:        {
                     90:                switch (*s++)
                     91:                {
                     92:                case '.':
                     93:                        v = 0;
                     94:                        break;
                     95:                case 'c':
                     96:                        v = IS_CONTROL_CHAR;
                     97:                        break;
                     98:                case 'b':
                     99:                        v = IS_BINARY_CHAR|IS_CONTROL_CHAR;
                    100:                        break;
                    101:
                    102:                case '0': case '1': case '2': case '3': case '4':
                    103:                case '5': case '6': case '7': case '8': case '9':
                    104:                        n = (10 * n) + (s[-1] - '0');
                    105:                        continue;
                    106:
                    107:                default:
                    108:                        error("invalid chardef", NULL_PARG);
                    109:                        quit(QUIT_ERROR);
                    110:                        /*NOTREACHED*/
                    111:                }
                    112:
                    113:                do
                    114:                {
                    115:                        if (cp >= chardef + sizeof(chardef))
                    116:                        {
                    117:                                error("chardef longer than 256", NULL_PARG);
                    118:                                quit(QUIT_ERROR);
                    119:                                /*NOTREACHED*/
                    120:                        }
                    121:                        *cp++ = v;
                    122:                } while (--n > 0);
                    123:                n = 0;
                    124:        }
                    125:
                    126:        while (cp < chardef + sizeof(chardef))
                    127:                *cp++ = v;
                    128: }
                    129:
                    130: /*
                    131:  * Define a charset, given a charset name.
                    132:  * The valid charset names are listed in the "charsets" array.
                    133:  */
                    134:        static int
                    135: icharset(name)
1.5       millert   136:        register char *name;
1.1       etheisen  137: {
1.5       millert   138:        register struct charset *p;
                    139:        register struct cs_alias *a;
1.1       etheisen  140:
                    141:        if (name == NULL || *name == '\0')
                    142:                return (0);
                    143:
1.5       millert   144:        /* First see if the name is an alias. */
                    145:        for (a = cs_aliases;  a->name != NULL;  a++)
                    146:        {
                    147:                if (strcmp(name, a->name) == 0)
                    148:                {
                    149:                        name = a->oname;
                    150:                        break;
                    151:                }
                    152:        }
                    153:
1.1       etheisen  154:        for (p = charsets;  p->name != NULL;  p++)
                    155:        {
                    156:                if (strcmp(name, p->name) == 0)
                    157:                {
                    158:                        ichardef(p->desc);
1.5       millert   159:                        if (p->p_flag != NULL)
                    160:                                *(p->p_flag) = 1;
1.1       etheisen  161:                        return (1);
                    162:                }
                    163:        }
                    164:
                    165:        error("invalid charset name", NULL_PARG);
                    166:        quit(QUIT_ERROR);
                    167:        /*NOTREACHED*/
1.5       millert   168:        return (0);
1.1       etheisen  169: }
                    170:
                    171: #if HAVE_LOCALE
                    172: /*
                    173:  * Define a charset, given a locale name.
                    174:  */
                    175:        static void
                    176: ilocale()
                    177: {
1.5       millert   178:        register int c;
1.1       etheisen  179:
1.5       millert   180:        setlocale(LC_ALL, "");
                    181:        for (c = 0;  c < (int) sizeof(chardef);  c++)
1.1       etheisen  182:        {
                    183:                if (isprint(c))
                    184:                        chardef[c] = 0;
                    185:                else if (iscntrl(c))
                    186:                        chardef[c] = IS_CONTROL_CHAR;
                    187:                else
                    188:                        chardef[c] = IS_BINARY_CHAR|IS_CONTROL_CHAR;
                    189:        }
                    190: }
                    191: #endif
                    192:
                    193: /*
                    194:  * Define the printing format for control chars.
                    195:  */
                    196:        public void
                    197: setbinfmt(s)
                    198:        char *s;
                    199: {
                    200:        if (s == NULL || *s == '\0')
                    201:                s = "*s<%X>";
                    202:        /*
                    203:         * Select the attributes if it starts with "*".
                    204:         */
                    205:        if (*s == '*')
                    206:        {
                    207:                switch (s[1])
                    208:                {
                    209:                case 'd':  binattr = AT_BOLD;      break;
                    210:                case 'k':  binattr = AT_BLINK;     break;
                    211:                case 's':  binattr = AT_STANDOUT;  break;
                    212:                case 'u':  binattr = AT_UNDERLINE; break;
                    213:                default:   binattr = AT_NORMAL;    break;
                    214:                }
                    215:                s += 2;
                    216:        }
                    217:        binfmt = s;
                    218: }
                    219:
                    220: /*
                    221:  * Initialize charset data structures.
                    222:  */
                    223:        public void
                    224: init_charset()
                    225: {
1.5       millert   226:        register char *s;
1.1       etheisen  227:
1.5       millert   228:        s = lgetenv("LESSBINFMT");
1.1       etheisen  229:        setbinfmt(s);
                    230:
                    231:        /*
                    232:         * See if environment variable LESSCHARSET is defined.
                    233:         */
1.5       millert   234:        s = lgetenv("LESSCHARSET");
1.1       etheisen  235:        if (icharset(s))
                    236:                return;
                    237:        /*
                    238:         * LESSCHARSET is not defined: try LESSCHARDEF.
                    239:         */
1.5       millert   240:        s = lgetenv("LESSCHARDEF");
1.1       etheisen  241:        if (s != NULL && *s != '\0')
                    242:        {
                    243:                ichardef(s);
                    244:                return;
                    245:        }
1.5       millert   246:
                    247: #if HAVE_STRSTR
                    248:        /*
                    249:         * Check whether LC_ALL, LC_CTYPE or LANG look like UTF-8 is used.
                    250:         */
                    251:        if ((s = lgetenv("LC_ALL")) != NULL ||
                    252:            (s = lgetenv("LC_CTYPE")) != NULL ||
                    253:            (s = lgetenv("LANG")) != NULL)
                    254:        {
                    255:                if (strstr(s, "UTF-8") != NULL || strstr(s, "utf-8") != NULL)
                    256:                        if (icharset("utf-8"))
                    257:                                return;
                    258:        }
                    259: #endif
                    260:
1.1       etheisen  261: #if HAVE_LOCALE
                    262:        /*
                    263:         * Use setlocale.
                    264:         */
                    265:        ilocale();
                    266: #else
1.5       millert   267: #if MSDOS_COMPILER
1.1       etheisen  268:        /*
1.5       millert   269:         * Default to "dos".
1.1       etheisen  270:         */
1.5       millert   271:        (void) icharset("dos");
                    272: #else
                    273:        /*
                    274:         * Default to "latin1".
                    275:         */
                    276:        (void) icharset("latin1");
                    277: #endif
1.1       etheisen  278: #endif
                    279: }
                    280:
                    281: /*
                    282:  * Is a given character a "binary" character?
                    283:  */
                    284:        public int
                    285: binary_char(c)
1.5       millert   286:        unsigned char c;
1.1       etheisen  287: {
                    288:        c &= 0377;
                    289:        return (chardef[c] & IS_BINARY_CHAR);
                    290: }
                    291:
                    292: /*
                    293:  * Is a given character a "control" character?
                    294:  */
                    295:        public int
                    296: control_char(c)
                    297:        int c;
                    298: {
                    299:        c &= 0377;
                    300:        return (chardef[c] & IS_CONTROL_CHAR);
                    301: }
                    302:
                    303: /*
                    304:  * Return the printable form of a character.
                    305:  * For example, in the "ascii" charset '\3' is printed as "^C".
                    306:  */
                    307:        public char *
                    308: prchar(c)
                    309:        int c;
                    310: {
                    311:        static char buf[8];
                    312:
                    313:        c &= 0377;
                    314:        if (!control_char(c))
1.5       millert   315:                snprintf(buf, sizeof(buf), "%c", c);
1.1       etheisen  316:        else if (c == ESC)
1.5       millert   317:                snprintf(buf, sizeof(buf), "ESC");
                    318: #if IS_EBCDIC_HOST
                    319:        else if (!binary_char(c) && c < 64)
                    320:                snprintf(buf, sizeof(buf), "^%c",
                    321:                /*
                    322:                 * This array roughly inverts CONTROL() #defined in less.h,
                    323:                 * and should be kept in sync with CONTROL() and IBM-1047.
                    324:                 */
                    325:                "@ABC.I.?...KLMNO"
                    326:                "PQRS.JH.XY.."
                    327:                "\\]^_"
                    328:                "......W[.....EFG"
                    329:                "..V....D....TU.Z"[c]);
                    330: #else
                    331:        else if (c < 128 && !control_char(c ^ 0100))
                    332:                snprintf(buf, sizeof(buf), "^%c", c ^ 0100);
                    333: #endif
1.1       etheisen  334:        else
1.5       millert   335:                snprintf(buf, sizeof(buf), binfmt, c);
1.1       etheisen  336:        return (buf);
                    337: }
1.6     ! millert   338:
        !           339: #else /* SMALL */
        !           340:
        !           341: public int binattr = AT_STANDOUT;
        !           342:
        !           343:        public void
        !           344: init_charset()
        !           345: {
        !           346:        return;
        !           347: }
        !           348:
        !           349: /*
        !           350:  * Is a given character a "binary" character?
        !           351:  */
        !           352:        public int
        !           353: binary_char(c)
        !           354:        unsigned char c;
        !           355: {
        !           356:        return (!isprint(c));
        !           357: }
        !           358:
        !           359: /*
        !           360:  * Is a given character a "control" character?
        !           361:  */
        !           362:        public int
        !           363: control_char(c)
        !           364:        int c;
        !           365: {
        !           366:        return (iscntrl(c));
        !           367: }
        !           368:
        !           369: /*
        !           370:  * Return the printable form of a character.
        !           371:  * For example, in the "ascii" charset '\3' is printed as "^C".
        !           372:  */
        !           373:        public char *
        !           374: prchar(c)
        !           375:        int c;
        !           376: {
        !           377:        static char buf[8];
        !           378:
        !           379:        c &= 0377;
        !           380:        if (!iscntrl(c))
        !           381:                snprintf(buf, sizeof(buf), "%c", c);
        !           382:        else if (c == ESC)
        !           383:                snprintf(buf, sizeof(buf), "ESC");
        !           384:        else if (c < 128 && !iscntrl(c ^ 0100))
        !           385:                snprintf(buf, sizeof(buf), "^%c", c ^ 0100);
        !           386:        else
        !           387:                snprintf(buf, sizeof(buf), "*s<%X>", c);
        !           388:        return (buf);
        !           389: }
        !           390: #endif /* SMALL */