Annotation of src/usr.bin/less/charset.c, Revision 1.6
1.1 etheisen 1: /*
1.5 millert 2: * Copyright (C) 1984-2002 Mark Nudelman
1.1 etheisen 3: *
1.5 millert 4: * You may distribute under the terms of either the GNU General Public
5: * License or the Less License, as specified in the README file.
1.1 etheisen 6: *
1.5 millert 7: * For more information about less, or for information on how to
8: * contact the author, see the README file.
1.1 etheisen 9: */
10:
11:
12: /*
13: * Functions to define the character set
14: * and do things specific to the character set.
15: */
16:
17: #include "less.h"
18: #if HAVE_LOCALE
19: #include <locale.h>
20: #include <ctype.h>
21: #endif
22:
1.5 millert 23: public int utf_mode = 0;
24:
1.6 ! millert 25: #if !SMALL
1.1 etheisen 26: /*
27: * Predefined character sets,
28: * selected by the LESSCHARSET environment variable.
29: */
30: struct charset {
31: char *name;
1.5 millert 32: int *p_flag;
1.1 etheisen 33: char *desc;
34: } charsets[] = {
1.5 millert 35: { "ascii", NULL, "8bcccbcc18b95.b" },
36: { "dos", NULL, "8bcccbcc12bc5b223.b" },
37: { "ebcdic", NULL, "5bc6bcc7bcc41b.9b7.9b5.b..8b6.10b6.b9.7b9.8b8.17b3.3b9.7b9.8b8.6b10.b.b.b." },
38: { "IBM-1047", NULL, "4cbcbc3b9cbccbccbb4c6bcc5b3cbbc4bc4bccbc191.b" },
39: { "iso8859", NULL, "8bcccbcc18b95.33b." },
40: { "koi8-r", NULL, "8bcccbcc18b95.b128." },
41: { "next", NULL, "8bcccbcc18b95.bb125.bb" },
42: { "utf-8", &utf_mode, "8bcccbcc18b." },
43: { NULL, NULL, NULL }
44: };
45:
/*
 * Alternate charset names.  Each entry maps an alias ("name") onto the
 * name of an entry in the charsets table ("oname").
 * An all-NULL entry terminates the table.
 */
struct cs_alias {
	char *name;
	char *oname;
} cs_aliases[] = {
	{ "latin1",	"iso8859" },
	{ "latin9",	"iso8859" },
	{ NULL,		NULL }
};
54:
55: #define IS_BINARY_CHAR 01
56: #define IS_CONTROL_CHAR 02
57:
58: static char chardef[256];
59: static char *binfmt = NULL;
60: public int binattr = AT_STANDOUT;
61:
62:
63: /*
64: * Define a charset, given a description string.
65: * The string consists of 256 letters,
66: * one for each character in the charset.
67: * If the string is shorter than 256 letters, missing letters
68: * are taken to be identical to the last one.
69: * A decimal number followed by a letter is taken to be a
70: * repetition of the letter.
71: *
72: * Each letter is one of:
73: * . normal character
74: * b binary character
75: * c control character
76: */
77: static void
78: ichardef(s)
79: char *s;
80: {
1.5 millert 81: register char *cp;
82: register int n;
83: register char v;
1.1 etheisen 84:
85: n = 0;
86: v = 0;
87: cp = chardef;
88: while (*s != '\0')
89: {
90: switch (*s++)
91: {
92: case '.':
93: v = 0;
94: break;
95: case 'c':
96: v = IS_CONTROL_CHAR;
97: break;
98: case 'b':
99: v = IS_BINARY_CHAR|IS_CONTROL_CHAR;
100: break;
101:
102: case '0': case '1': case '2': case '3': case '4':
103: case '5': case '6': case '7': case '8': case '9':
104: n = (10 * n) + (s[-1] - '0');
105: continue;
106:
107: default:
108: error("invalid chardef", NULL_PARG);
109: quit(QUIT_ERROR);
110: /*NOTREACHED*/
111: }
112:
113: do
114: {
115: if (cp >= chardef + sizeof(chardef))
116: {
117: error("chardef longer than 256", NULL_PARG);
118: quit(QUIT_ERROR);
119: /*NOTREACHED*/
120: }
121: *cp++ = v;
122: } while (--n > 0);
123: n = 0;
124: }
125:
126: while (cp < chardef + sizeof(chardef))
127: *cp++ = v;
128: }
129:
130: /*
131: * Define a charset, given a charset name.
132: * The valid charset names are listed in the "charsets" array.
133: */
134: static int
135: icharset(name)
1.5 millert 136: register char *name;
1.1 etheisen 137: {
1.5 millert 138: register struct charset *p;
139: register struct cs_alias *a;
1.1 etheisen 140:
141: if (name == NULL || *name == '\0')
142: return (0);
143:
1.5 millert 144: /* First see if the name is an alias. */
145: for (a = cs_aliases; a->name != NULL; a++)
146: {
147: if (strcmp(name, a->name) == 0)
148: {
149: name = a->oname;
150: break;
151: }
152: }
153:
1.1 etheisen 154: for (p = charsets; p->name != NULL; p++)
155: {
156: if (strcmp(name, p->name) == 0)
157: {
158: ichardef(p->desc);
1.5 millert 159: if (p->p_flag != NULL)
160: *(p->p_flag) = 1;
1.1 etheisen 161: return (1);
162: }
163: }
164:
165: error("invalid charset name", NULL_PARG);
166: quit(QUIT_ERROR);
167: /*NOTREACHED*/
1.5 millert 168: return (0);
1.1 etheisen 169: }
170:
171: #if HAVE_LOCALE
172: /*
173: * Define a charset, given a locale name.
174: */
175: static void
176: ilocale()
177: {
1.5 millert 178: register int c;
1.1 etheisen 179:
1.5 millert 180: setlocale(LC_ALL, "");
181: for (c = 0; c < (int) sizeof(chardef); c++)
1.1 etheisen 182: {
183: if (isprint(c))
184: chardef[c] = 0;
185: else if (iscntrl(c))
186: chardef[c] = IS_CONTROL_CHAR;
187: else
188: chardef[c] = IS_BINARY_CHAR|IS_CONTROL_CHAR;
189: }
190: }
191: #endif
192:
193: /*
194: * Define the printing format for control chars.
195: */
196: public void
197: setbinfmt(s)
198: char *s;
199: {
200: if (s == NULL || *s == '\0')
201: s = "*s<%X>";
202: /*
203: * Select the attributes if it starts with "*".
204: */
205: if (*s == '*')
206: {
207: switch (s[1])
208: {
209: case 'd': binattr = AT_BOLD; break;
210: case 'k': binattr = AT_BLINK; break;
211: case 's': binattr = AT_STANDOUT; break;
212: case 'u': binattr = AT_UNDERLINE; break;
213: default: binattr = AT_NORMAL; break;
214: }
215: s += 2;
216: }
217: binfmt = s;
218: }
219:
220: /*
221: * Initialize charset data structures.
222: */
223: public void
224: init_charset()
225: {
1.5 millert 226: register char *s;
1.1 etheisen 227:
1.5 millert 228: s = lgetenv("LESSBINFMT");
1.1 etheisen 229: setbinfmt(s);
230:
231: /*
232: * See if environment variable LESSCHARSET is defined.
233: */
1.5 millert 234: s = lgetenv("LESSCHARSET");
1.1 etheisen 235: if (icharset(s))
236: return;
237: /*
238: * LESSCHARSET is not defined: try LESSCHARDEF.
239: */
1.5 millert 240: s = lgetenv("LESSCHARDEF");
1.1 etheisen 241: if (s != NULL && *s != '\0')
242: {
243: ichardef(s);
244: return;
245: }
1.5 millert 246:
247: #if HAVE_STRSTR
248: /*
249: * Check whether LC_ALL, LC_CTYPE or LANG look like UTF-8 is used.
250: */
251: if ((s = lgetenv("LC_ALL")) != NULL ||
252: (s = lgetenv("LC_CTYPE")) != NULL ||
253: (s = lgetenv("LANG")) != NULL)
254: {
255: if (strstr(s, "UTF-8") != NULL || strstr(s, "utf-8") != NULL)
256: if (icharset("utf-8"))
257: return;
258: }
259: #endif
260:
1.1 etheisen 261: #if HAVE_LOCALE
262: /*
263: * Use setlocale.
264: */
265: ilocale();
266: #else
1.5 millert 267: #if MSDOS_COMPILER
1.1 etheisen 268: /*
1.5 millert 269: * Default to "dos".
1.1 etheisen 270: */
1.5 millert 271: (void) icharset("dos");
272: #else
273: /*
274: * Default to "latin1".
275: */
276: (void) icharset("latin1");
277: #endif
1.1 etheisen 278: #endif
279: }
280:
281: /*
282: * Is a given character a "binary" character?
283: */
284: public int
285: binary_char(c)
1.5 millert 286: unsigned char c;
1.1 etheisen 287: {
288: c &= 0377;
289: return (chardef[c] & IS_BINARY_CHAR);
290: }
291:
292: /*
293: * Is a given character a "control" character?
294: */
295: public int
296: control_char(c)
297: int c;
298: {
299: c &= 0377;
300: return (chardef[c] & IS_CONTROL_CHAR);
301: }
302:
303: /*
304: * Return the printable form of a character.
305: * For example, in the "ascii" charset '\3' is printed as "^C".
306: */
307: public char *
308: prchar(c)
309: int c;
310: {
311: static char buf[8];
312:
313: c &= 0377;
314: if (!control_char(c))
1.5 millert 315: snprintf(buf, sizeof(buf), "%c", c);
1.1 etheisen 316: else if (c == ESC)
1.5 millert 317: snprintf(buf, sizeof(buf), "ESC");
318: #if IS_EBCDIC_HOST
319: else if (!binary_char(c) && c < 64)
320: snprintf(buf, sizeof(buf), "^%c",
321: /*
322: * This array roughly inverts CONTROL() #defined in less.h,
323: * and should be kept in sync with CONTROL() and IBM-1047.
324: */
325: "@ABC.I.?...KLMNO"
326: "PQRS.JH.XY.."
327: "\\]^_"
328: "......W[.....EFG"
329: "..V....D....TU.Z"[c]);
330: #else
331: else if (c < 128 && !control_char(c ^ 0100))
332: snprintf(buf, sizeof(buf), "^%c", c ^ 0100);
333: #endif
1.1 etheisen 334: else
1.5 millert 335: snprintf(buf, sizeof(buf), binfmt, c);
1.1 etheisen 336: return (buf);
337: }
1.6 ! millert 338:
! 339: #else /* SMALL */
! 340:
! 341: public int binattr = AT_STANDOUT;
! 342:
! 343: public void
! 344: init_charset()
! 345: {
! 346: return;
! 347: }
! 348:
! 349: /*
! 350: * Is a given character a "binary" character?
! 351: */
! 352: public int
! 353: binary_char(c)
! 354: unsigned char c;
! 355: {
! 356: return (!isprint(c));
! 357: }
! 358:
! 359: /*
! 360: * Is a given character a "control" character?
! 361: */
! 362: public int
! 363: control_char(c)
! 364: int c;
! 365: {
! 366: return (iscntrl(c));
! 367: }
! 368:
! 369: /*
! 370: * Return the printable form of a character.
! 371: * For example, in the "ascii" charset '\3' is printed as "^C".
! 372: */
! 373: public char *
! 374: prchar(c)
! 375: int c;
! 376: {
! 377: static char buf[8];
! 378:
! 379: c &= 0377;
! 380: if (!iscntrl(c))
! 381: snprintf(buf, sizeof(buf), "%c", c);
! 382: else if (c == ESC)
! 383: snprintf(buf, sizeof(buf), "ESC");
! 384: else if (c < 128 && !iscntrl(c ^ 0100))
! 385: snprintf(buf, sizeof(buf), "^%c", c ^ 0100);
! 386: else
! 387: snprintf(buf, sizeof(buf), "*s<%X>", c);
! 388: return (buf);
! 389: }
! 390: #endif /* SMALL */