Annotation of src/usr.bin/less/charset.c, Revision 1.5
1.1 etheisen 1: /*
1.5 ! millert 2: * Copyright (C) 1984-2002 Mark Nudelman
1.1 etheisen 3: *
1.5 ! millert 4: * You may distribute under the terms of either the GNU General Public
! 5: * License or the Less License, as specified in the README file.
1.1 etheisen 6: *
1.5 ! millert 7: * For more information about less, or for information on how to
! 8: * contact the author, see the README file.
1.1 etheisen 9: */
10:
11:
12: /*
13: * Functions to define the character set
14: * and do things specific to the character set.
15: */
16:
17: #include "less.h"
18: #if HAVE_LOCALE
19: #include <locale.h>
20: #include <ctype.h>
21: #endif
22:
1.5 ! millert 23: public int utf_mode = 0;
! 24:
1.1 etheisen 25: /*
26: * Predefined character sets,
27: * selected by the LESSCHARSET environment variable.
28: */
29: struct charset {
30: char *name;
1.5 ! millert 31: int *p_flag;
1.1 etheisen 32: char *desc;
33: } charsets[] = {
1.5 ! millert 34: { "ascii", NULL, "8bcccbcc18b95.b" },
! 35: { "dos", NULL, "8bcccbcc12bc5b223.b" },
! 36: { "ebcdic", NULL, "5bc6bcc7bcc41b.9b7.9b5.b..8b6.10b6.b9.7b9.8b8.17b3.3b9.7b9.8b8.6b10.b.b.b." },
! 37: { "IBM-1047", NULL, "4cbcbc3b9cbccbccbb4c6bcc5b3cbbc4bc4bccbc191.b" },
! 38: { "iso8859", NULL, "8bcccbcc18b95.33b." },
! 39: { "koi8-r", NULL, "8bcccbcc18b95.b128." },
! 40: { "next", NULL, "8bcccbcc18b95.bb125.bb" },
! 41: { "utf-8", &utf_mode, "8bcccbcc18b." },
! 42: { NULL, NULL, NULL }
! 43: };
! 44:
/*
 * Alternate names for predefined character sets.
 *
 *	name	the alias
 *	oname	the name of the charsets[] entry it stands for
 *
 * The table is terminated by an entry whose name is NULL.
 */
struct cs_alias {
	char *name;
	char *oname;
};

struct cs_alias cs_aliases[] = {
	{ "latin1",	"iso8859" },
	{ "latin9",	"iso8859" },
	{ NULL,		NULL }
};
53:
54: #define IS_BINARY_CHAR 01
55: #define IS_CONTROL_CHAR 02
56:
57: static char chardef[256];
58: static char *binfmt = NULL;
59: public int binattr = AT_STANDOUT;
60:
61:
62: /*
63: * Define a charset, given a description string.
64: * The string consists of 256 letters,
65: * one for each character in the charset.
66: * If the string is shorter than 256 letters, missing letters
67: * are taken to be identical to the last one.
68: * A decimal number followed by a letter is taken to be a
69: * repetition of the letter.
70: *
71: * Each letter is one of:
72: * . normal character
73: * b binary character
74: * c control character
75: */
76: static void
77: ichardef(s)
78: char *s;
79: {
1.5 ! millert 80: register char *cp;
! 81: register int n;
! 82: register char v;
1.1 etheisen 83:
84: n = 0;
85: v = 0;
86: cp = chardef;
87: while (*s != '\0')
88: {
89: switch (*s++)
90: {
91: case '.':
92: v = 0;
93: break;
94: case 'c':
95: v = IS_CONTROL_CHAR;
96: break;
97: case 'b':
98: v = IS_BINARY_CHAR|IS_CONTROL_CHAR;
99: break;
100:
101: case '0': case '1': case '2': case '3': case '4':
102: case '5': case '6': case '7': case '8': case '9':
103: n = (10 * n) + (s[-1] - '0');
104: continue;
105:
106: default:
107: error("invalid chardef", NULL_PARG);
108: quit(QUIT_ERROR);
109: /*NOTREACHED*/
110: }
111:
112: do
113: {
114: if (cp >= chardef + sizeof(chardef))
115: {
116: error("chardef longer than 256", NULL_PARG);
117: quit(QUIT_ERROR);
118: /*NOTREACHED*/
119: }
120: *cp++ = v;
121: } while (--n > 0);
122: n = 0;
123: }
124:
125: while (cp < chardef + sizeof(chardef))
126: *cp++ = v;
127: }
128:
129: /*
130: * Define a charset, given a charset name.
131: * The valid charset names are listed in the "charsets" array.
132: */
133: static int
134: icharset(name)
1.5 ! millert 135: register char *name;
1.1 etheisen 136: {
1.5 ! millert 137: register struct charset *p;
! 138: register struct cs_alias *a;
1.1 etheisen 139:
140: if (name == NULL || *name == '\0')
141: return (0);
142:
1.5 ! millert 143: /* First see if the name is an alias. */
! 144: for (a = cs_aliases; a->name != NULL; a++)
! 145: {
! 146: if (strcmp(name, a->name) == 0)
! 147: {
! 148: name = a->oname;
! 149: break;
! 150: }
! 151: }
! 152:
1.1 etheisen 153: for (p = charsets; p->name != NULL; p++)
154: {
155: if (strcmp(name, p->name) == 0)
156: {
157: ichardef(p->desc);
1.5 ! millert 158: if (p->p_flag != NULL)
! 159: *(p->p_flag) = 1;
1.1 etheisen 160: return (1);
161: }
162: }
163:
164: error("invalid charset name", NULL_PARG);
165: quit(QUIT_ERROR);
166: /*NOTREACHED*/
1.5 ! millert 167: return (0);
1.1 etheisen 168: }
169:
170: #if HAVE_LOCALE
171: /*
172: * Define a charset, given a locale name.
173: */
174: static void
175: ilocale()
176: {
1.5 ! millert 177: register int c;
1.1 etheisen 178:
1.5 ! millert 179: setlocale(LC_ALL, "");
! 180: for (c = 0; c < (int) sizeof(chardef); c++)
1.1 etheisen 181: {
182: if (isprint(c))
183: chardef[c] = 0;
184: else if (iscntrl(c))
185: chardef[c] = IS_CONTROL_CHAR;
186: else
187: chardef[c] = IS_BINARY_CHAR|IS_CONTROL_CHAR;
188: }
189: }
190: #endif
191:
192: /*
193: * Define the printing format for control chars.
194: */
195: public void
196: setbinfmt(s)
197: char *s;
198: {
199: if (s == NULL || *s == '\0')
200: s = "*s<%X>";
201: /*
202: * Select the attributes if it starts with "*".
203: */
204: if (*s == '*')
205: {
206: switch (s[1])
207: {
208: case 'd': binattr = AT_BOLD; break;
209: case 'k': binattr = AT_BLINK; break;
210: case 's': binattr = AT_STANDOUT; break;
211: case 'u': binattr = AT_UNDERLINE; break;
212: default: binattr = AT_NORMAL; break;
213: }
214: s += 2;
215: }
216: binfmt = s;
217: }
218:
219: /*
220: * Initialize charset data structures.
221: */
222: public void
223: init_charset()
224: {
1.5 ! millert 225: register char *s;
1.1 etheisen 226:
1.5 ! millert 227: s = lgetenv("LESSBINFMT");
1.1 etheisen 228: setbinfmt(s);
229:
230: /*
231: * See if environment variable LESSCHARSET is defined.
232: */
1.5 ! millert 233: s = lgetenv("LESSCHARSET");
1.1 etheisen 234: if (icharset(s))
235: return;
236: /*
237: * LESSCHARSET is not defined: try LESSCHARDEF.
238: */
1.5 ! millert 239: s = lgetenv("LESSCHARDEF");
1.1 etheisen 240: if (s != NULL && *s != '\0')
241: {
242: ichardef(s);
243: return;
244: }
1.5 ! millert 245:
! 246: #if HAVE_STRSTR
! 247: /*
! 248: * Check whether LC_ALL, LC_CTYPE or LANG look like UTF-8 is used.
! 249: */
! 250: if ((s = lgetenv("LC_ALL")) != NULL ||
! 251: (s = lgetenv("LC_CTYPE")) != NULL ||
! 252: (s = lgetenv("LANG")) != NULL)
! 253: {
! 254: if (strstr(s, "UTF-8") != NULL || strstr(s, "utf-8") != NULL)
! 255: if (icharset("utf-8"))
! 256: return;
! 257: }
! 258: #endif
! 259:
1.1 etheisen 260: #if HAVE_LOCALE
261: /*
262: * Use setlocale.
263: */
264: ilocale();
265: #else
1.5 ! millert 266: #if MSDOS_COMPILER
1.1 etheisen 267: /*
1.5 ! millert 268: * Default to "dos".
1.1 etheisen 269: */
1.5 ! millert 270: (void) icharset("dos");
! 271: #else
! 272: /*
! 273: * Default to "latin1".
! 274: */
! 275: (void) icharset("latin1");
! 276: #endif
1.1 etheisen 277: #endif
278: }
279:
280: /*
281: * Is a given character a "binary" character?
282: */
283: public int
284: binary_char(c)
1.5 ! millert 285: unsigned char c;
1.1 etheisen 286: {
287: c &= 0377;
288: return (chardef[c] & IS_BINARY_CHAR);
289: }
290:
291: /*
292: * Is a given character a "control" character?
293: */
294: public int
295: control_char(c)
296: int c;
297: {
298: c &= 0377;
299: return (chardef[c] & IS_CONTROL_CHAR);
300: }
301:
302: /*
303: * Return the printable form of a character.
304: * For example, in the "ascii" charset '\3' is printed as "^C".
305: */
306: public char *
307: prchar(c)
308: int c;
309: {
310: static char buf[8];
311:
312: c &= 0377;
313: if (!control_char(c))
1.5 ! millert 314: snprintf(buf, sizeof(buf), "%c", c);
1.1 etheisen 315: else if (c == ESC)
1.5 ! millert 316: snprintf(buf, sizeof(buf), "ESC");
! 317: #if IS_EBCDIC_HOST
! 318: else if (!binary_char(c) && c < 64)
! 319: snprintf(buf, sizeof(buf), "^%c",
! 320: /*
! 321: * This array roughly inverts CONTROL() #defined in less.h,
! 322: * and should be kept in sync with CONTROL() and IBM-1047.
! 323: */
! 324: "@ABC.I.?...KLMNO"
! 325: "PQRS.JH.XY.."
! 326: "\\]^_"
! 327: "......W[.....EFG"
! 328: "..V....D....TU.Z"[c]);
! 329: #else
! 330: else if (c < 128 && !control_char(c ^ 0100))
! 331: snprintf(buf, sizeof(buf), "^%c", c ^ 0100);
! 332: #endif
1.1 etheisen 333: else
1.5 ! millert 334: snprintf(buf, sizeof(buf), binfmt, c);
1.1 etheisen 335: return (buf);
336: }