Annotation of src/usr.bin/less/charset.c, Revision 1.7
1.1 etheisen 1: /*
1.5 millert 2: * Copyright (C) 1984-2002 Mark Nudelman
1.1 etheisen 3: *
1.5 millert 4: * You may distribute under the terms of either the GNU General Public
5: * License or the Less License, as specified in the README file.
1.1 etheisen 6: *
1.5 millert 7: * For more information about less, or for information on how to
8: * contact the author, see the README file.
1.1 etheisen 9: */
10:
11:
12: /*
13: * Functions to define the character set
14: * and do things specific to the character set.
15: */
16:
17: #include "less.h"
18: #if HAVE_LOCALE
19: #include <locale.h>
20: #include <ctype.h>
21: #endif
22:
1.5 millert 23: public int utf_mode = 0;
24:
1.6 millert 25: #if !SMALL
1.1 etheisen 26: /*
27: * Predefined character sets,
28: * selected by the LESSCHARSET environment variable.
29: */
30: struct charset {
31: char *name;
1.5 millert 32: int *p_flag;
1.1 etheisen 33: char *desc;
34: } charsets[] = {
1.5 millert 35: { "ascii", NULL, "8bcccbcc18b95.b" },
36: { "dos", NULL, "8bcccbcc12bc5b223.b" },
37: { "ebcdic", NULL, "5bc6bcc7bcc41b.9b7.9b5.b..8b6.10b6.b9.7b9.8b8.17b3.3b9.7b9.8b8.6b10.b.b.b." },
38: { "IBM-1047", NULL, "4cbcbc3b9cbccbccbb4c6bcc5b3cbbc4bc4bccbc191.b" },
39: { "iso8859", NULL, "8bcccbcc18b95.33b." },
40: { "koi8-r", NULL, "8bcccbcc18b95.b128." },
41: { "next", NULL, "8bcccbcc18b95.bb125.bb" },
42: { "utf-8", &utf_mode, "8bcccbcc18b." },
43: { NULL, NULL, NULL }
44: };
45:
/*
 * Alternate names for entries of the charsets[] table.
 * icharset() replaces a matching alias by its canonical name
 * before searching charsets[].
 */
struct cs_alias {
	char *name;	/* alias accepted from the user */
	char *oname;	/* canonical charset name it stands for */
};

struct cs_alias cs_aliases[] = {
	{ "latin1",	"iso8859" },
	{ "latin9",	"iso8859" },
	{ NULL,		NULL }	/* end-of-table sentinel */
};
54:
55: #define IS_BINARY_CHAR 01
56: #define IS_CONTROL_CHAR 02
57:
58: static char chardef[256];
59: static char *binfmt = NULL;
60: public int binattr = AT_STANDOUT;
61:
62:
63: /*
64: * Define a charset, given a description string.
65: * The string consists of 256 letters,
66: * one for each character in the charset.
67: * If the string is shorter than 256 letters, missing letters
68: * are taken to be identical to the last one.
69: * A decimal number followed by a letter is taken to be a
70: * repetition of the letter.
71: *
72: * Each letter is one of:
73: * . normal character
74: * b binary character
75: * c control character
76: */
77: static void
78: ichardef(s)
79: char *s;
80: {
1.5 millert 81: register char *cp;
82: register int n;
83: register char v;
1.1 etheisen 84:
85: n = 0;
86: v = 0;
87: cp = chardef;
88: while (*s != '\0')
89: {
90: switch (*s++)
91: {
92: case '.':
93: v = 0;
94: break;
95: case 'c':
96: v = IS_CONTROL_CHAR;
97: break;
98: case 'b':
99: v = IS_BINARY_CHAR|IS_CONTROL_CHAR;
100: break;
101:
102: case '0': case '1': case '2': case '3': case '4':
103: case '5': case '6': case '7': case '8': case '9':
104: n = (10 * n) + (s[-1] - '0');
105: continue;
106:
107: default:
108: error("invalid chardef", NULL_PARG);
109: quit(QUIT_ERROR);
110: /*NOTREACHED*/
111: }
112:
113: do
114: {
115: if (cp >= chardef + sizeof(chardef))
116: {
117: error("chardef longer than 256", NULL_PARG);
118: quit(QUIT_ERROR);
119: /*NOTREACHED*/
120: }
121: *cp++ = v;
122: } while (--n > 0);
123: n = 0;
124: }
125:
126: while (cp < chardef + sizeof(chardef))
127: *cp++ = v;
128: }
129:
130: /*
131: * Define a charset, given a charset name.
132: * The valid charset names are listed in the "charsets" array.
133: */
134: static int
135: icharset(name)
1.5 millert 136: register char *name;
1.1 etheisen 137: {
1.5 millert 138: register struct charset *p;
139: register struct cs_alias *a;
1.1 etheisen 140:
141: if (name == NULL || *name == '\0')
142: return (0);
143:
1.5 millert 144: /* First see if the name is an alias. */
145: for (a = cs_aliases; a->name != NULL; a++)
146: {
147: if (strcmp(name, a->name) == 0)
148: {
149: name = a->oname;
150: break;
151: }
152: }
153:
1.1 etheisen 154: for (p = charsets; p->name != NULL; p++)
155: {
156: if (strcmp(name, p->name) == 0)
157: {
158: ichardef(p->desc);
1.5 millert 159: if (p->p_flag != NULL)
160: *(p->p_flag) = 1;
1.1 etheisen 161: return (1);
162: }
163: }
164:
165: error("invalid charset name", NULL_PARG);
166: quit(QUIT_ERROR);
167: /*NOTREACHED*/
1.5 millert 168: return (0);
1.1 etheisen 169: }
170:
171: #if HAVE_LOCALE
172: /*
173: * Define a charset, given a locale name.
174: */
175: static void
176: ilocale()
177: {
1.5 millert 178: register int c;
1.1 etheisen 179:
1.5 millert 180: setlocale(LC_ALL, "");
181: for (c = 0; c < (int) sizeof(chardef); c++)
1.1 etheisen 182: {
183: if (isprint(c))
184: chardef[c] = 0;
185: else if (iscntrl(c))
186: chardef[c] = IS_CONTROL_CHAR;
187: else
188: chardef[c] = IS_BINARY_CHAR|IS_CONTROL_CHAR;
189: }
190: }
191: #endif
192:
193: /*
194: * Define the printing format for control chars.
195: */
196: public void
197: setbinfmt(s)
198: char *s;
199: {
200: if (s == NULL || *s == '\0')
201: s = "*s<%X>";
202: /*
203: * Select the attributes if it starts with "*".
204: */
205: if (*s == '*')
206: {
207: switch (s[1])
208: {
209: case 'd': binattr = AT_BOLD; break;
210: case 'k': binattr = AT_BLINK; break;
211: case 's': binattr = AT_STANDOUT; break;
212: case 'u': binattr = AT_UNDERLINE; break;
213: default: binattr = AT_NORMAL; break;
214: }
215: s += 2;
216: }
217: binfmt = s;
218: }
219:
220: /*
221: * Initialize charset data structures.
222: */
223: public void
224: init_charset()
225: {
1.5 millert 226: register char *s;
1.1 etheisen 227:
1.5 millert 228: s = lgetenv("LESSBINFMT");
1.1 etheisen 229: setbinfmt(s);
230:
231: /*
232: * See if environment variable LESSCHARSET is defined.
233: */
1.5 millert 234: s = lgetenv("LESSCHARSET");
1.1 etheisen 235: if (icharset(s))
236: return;
237: /*
238: * LESSCHARSET is not defined: try LESSCHARDEF.
239: */
1.5 millert 240: s = lgetenv("LESSCHARDEF");
1.1 etheisen 241: if (s != NULL && *s != '\0')
242: {
243: ichardef(s);
244: return;
245: }
1.5 millert 246:
247: #if HAVE_STRSTR
248: /*
249: * Check whether LC_ALL, LC_CTYPE or LANG look like UTF-8 is used.
250: */
251: if ((s = lgetenv("LC_ALL")) != NULL ||
252: (s = lgetenv("LC_CTYPE")) != NULL ||
253: (s = lgetenv("LANG")) != NULL)
254: {
255: if (strstr(s, "UTF-8") != NULL || strstr(s, "utf-8") != NULL)
256: if (icharset("utf-8"))
257: return;
258: }
259: #endif
260:
1.1 etheisen 261: #if HAVE_LOCALE
262: /*
263: * Use setlocale.
264: */
265: ilocale();
266: #else
1.5 millert 267: #if MSDOS_COMPILER
1.1 etheisen 268: /*
1.5 millert 269: * Default to "dos".
1.1 etheisen 270: */
1.5 millert 271: (void) icharset("dos");
272: #else
273: /*
274: * Default to "latin1".
275: */
276: (void) icharset("latin1");
277: #endif
1.1 etheisen 278: #endif
279: }
280:
281: /*
282: * Is a given character a "binary" character?
283: */
284: public int
285: binary_char(c)
1.5 millert 286: unsigned char c;
1.1 etheisen 287: {
288: c &= 0377;
289: return (chardef[c] & IS_BINARY_CHAR);
290: }
291:
292: /*
293: * Is a given character a "control" character?
294: */
295: public int
296: control_char(c)
297: int c;
298: {
299: c &= 0377;
300: return (chardef[c] & IS_CONTROL_CHAR);
301: }
302:
303: /*
304: * Return the printable form of a character.
305: * For example, in the "ascii" charset '\3' is printed as "^C".
306: */
307: public char *
308: prchar(c)
309: int c;
310: {
311: static char buf[8];
312:
313: c &= 0377;
314: if (!control_char(c))
1.5 millert 315: snprintf(buf, sizeof(buf), "%c", c);
1.1 etheisen 316: else if (c == ESC)
1.5 millert 317: snprintf(buf, sizeof(buf), "ESC");
318: #if IS_EBCDIC_HOST
319: else if (!binary_char(c) && c < 64)
320: snprintf(buf, sizeof(buf), "^%c",
321: /*
322: * This array roughly inverts CONTROL() #defined in less.h,
323: * and should be kept in sync with CONTROL() and IBM-1047.
324: */
325: "@ABC.I.?...KLMNO"
326: "PQRS.JH.XY.."
327: "\\]^_"
328: "......W[.....EFG"
329: "..V....D....TU.Z"[c]);
330: #else
331: else if (c < 128 && !control_char(c ^ 0100))
332: snprintf(buf, sizeof(buf), "^%c", c ^ 0100);
333: #endif
1.1 etheisen 334: else
1.5 millert 335: snprintf(buf, sizeof(buf), binfmt, c);
1.1 etheisen 336: return (buf);
337: }
1.6 millert 338:
339: #else /* SMALL */
340:
341: public int binattr = AT_STANDOUT;
342:
343: public void
344: init_charset()
345: {
346: return;
347: }
348:
349: /*
350: * Is a given character a "binary" character?
351: */
352: public int
353: binary_char(c)
354: unsigned char c;
355: {
1.7 ! millert 356: return (!isprint(c) && !isspace(c));
1.6 millert 357: }
358:
359: /*
360: * Is a given character a "control" character?
361: */
362: public int
363: control_char(c)
364: int c;
365: {
366: return (iscntrl(c));
367: }
368:
369: /*
370: * Return the printable form of a character.
371: * For example, in the "ascii" charset '\3' is printed as "^C".
372: */
373: public char *
374: prchar(c)
375: int c;
376: {
377: static char buf[8];
378:
379: c &= 0377;
380: if (!iscntrl(c))
381: snprintf(buf, sizeof(buf), "%c", c);
382: else if (c == ESC)
383: snprintf(buf, sizeof(buf), "ESC");
384: else if (c < 128 && !iscntrl(c ^ 0100))
385: snprintf(buf, sizeof(buf), "^%c", c ^ 0100);
386: else
387: snprintf(buf, sizeof(buf), "*s<%X>", c);
388: return (buf);
389: }
390: #endif /* SMALL */