Annotation of src/usr.bin/file/ascmagic.c, Revision 1.8
1.8 ! tedu 1: /* $OpenBSD$ */
1.1 deraadt 2: /*
1.6 ian 3: * Copyright (c) Ian F. Darwin 1986-1995.
4: * Software written by Ian F. Darwin and others;
5: * maintained 1995-present by Christos Zoulas and others.
6: *
7: * Redistribution and use in source and binary forms, with or without
8: * modification, are permitted provided that the following conditions
9: * are met:
10: * 1. Redistributions of source code must retain the above copyright
11: * notice immediately at the beginning of the file, without modification,
12: * this list of conditions, and the following disclaimer.
13: * 2. Redistributions in binary form must reproduce the above copyright
14: * notice, this list of conditions and the following disclaimer in the
15: * documentation and/or other materials provided with the distribution.
16: *
17: * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
18: * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
19: * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
20: * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE FOR
21: * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
22: * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
23: * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
24: * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
25: * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
26: * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
27: * SUCH DAMAGE.
1.1 deraadt 28: */
1.8 ! tedu 29: /*
! 30: * ASCII magic -- file types that we know based on keywords
! 31: * that can appear anywhere in the file.
! 32: *
! 33: * Extensively modified by Eric Fischer <enf@pobox.com> in July, 2000,
! 34: * to handle character codes other than ASCII on a unified basis.
! 35: *
! 36: * Joerg Wunsch <joerg@freebsd.org> wrote the original support for 8-bit
! 37: * international characters, now subsumed into this file.
! 38: */
1.1 deraadt 39:
1.8 ! tedu 40: #include "file.h"
! 41: #include "magic.h"
1.1 deraadt 42: #include <stdio.h>
43: #include <string.h>
1.8 ! tedu 44: #include <memory.h>
1.1 deraadt 45: #include <ctype.h>
46: #include <stdlib.h>
1.8 ! tedu 47: #ifdef HAVE_UNISTD_H
1.1 deraadt 48: #include <unistd.h>
1.8 ! tedu 49: #endif
1.1 deraadt 50: #include "names.h"
51:
52: #ifndef lint
1.8 ! tedu 53: FILE_RCSID("@(#)$Id: ascmagic.c,v 1.40 2003/11/20 00:25:39 christos Exp $")
1.1 deraadt 54: #endif /* lint */
55:
1.8 ! tedu 56: typedef unsigned long unichar;
! 57:
! 58: #define MAXLINELEN 300 /* longest sane line length */
! 59: #define ISSPC(x) ((x) == ' ' || (x) == '\t' || (x) == '\r' || (x) == '\n' \
! 60: || (x) == 0x85 || (x) == '\f')
! 61:
! 62: private int looks_ascii(const unsigned char *, size_t, unichar *, size_t *);
! 63: private int looks_utf8(const unsigned char *, size_t, unichar *, size_t *);
! 64: private int looks_unicode(const unsigned char *, size_t, unichar *, size_t *);
! 65: private int looks_latin1(const unsigned char *, size_t, unichar *, size_t *);
! 66: private int looks_extended(const unsigned char *, size_t, unichar *, size_t *);
! 67: private void from_ebcdic(const unsigned char *, size_t, unsigned char *);
! 68: private int ascmatch(const unsigned char *, const unichar *, size_t);
! 69:
1.1 deraadt 70:
1.8 ! tedu 71: protected int
! 72: file_ascmagic(struct magic_set *ms, const unsigned char *buf, size_t nbytes)
! 73: {
! 74: size_t i;
! 75: unsigned char nbuf[HOWMANY+1]; /* one extra for terminating '\0' */
! 76: unichar ubuf[HOWMANY+1]; /* one extra for terminating '\0' */
! 77: size_t ulen;
1.5 mpech 78: struct names *p;
1.1 deraadt 79:
1.8 ! tedu 80: const char *code = NULL;
! 81: const char *code_mime = NULL;
! 82: const char *type = NULL;
! 83: const char *subtype = NULL;
! 84: const char *subtype_mime = NULL;
! 85:
! 86: int has_escapes = 0;
! 87: int has_backspace = 0;
! 88:
! 89: int n_crlf = 0;
! 90: int n_lf = 0;
! 91: int n_cr = 0;
! 92: int n_nel = 0;
! 93:
! 94: int last_line_end = -1;
! 95: int has_long_lines = 0;
! 96:
1.1 deraadt 97: /*
1.8 ! tedu 98: * Undo the NUL-termination kindly provided by process()
! 99: * but leave at least one byte to look at
1.1 deraadt 100: */
1.8 ! tedu 101:
! 102: while (nbytes > 1 && buf[nbytes - 1] == '\0')
! 103: nbytes--;
! 104:
! 105: /* nbuf and ubuf relies on this */
! 106: if (nbytes > HOWMANY)
! 107: nbytes = HOWMANY;
! 108:
! 109: /*
! 110: * Then try to determine whether it's any character code we can
! 111: * identify. Each of these tests, if it succeeds, will leave
! 112: * the text converted into one-unichar-per-character Unicode in
! 113: * ubuf, and the number of characters converted in ulen.
! 114: */
! 115: if (looks_ascii(buf, nbytes, ubuf, &ulen)) {
! 116: code = "ASCII";
! 117: code_mime = "us-ascii";
! 118: type = "text";
! 119: } else if (looks_utf8(buf, nbytes, ubuf, &ulen)) {
! 120: code = "UTF-8 Unicode";
! 121: code_mime = "utf-8";
! 122: type = "text";
! 123: } else if ((i = looks_unicode(buf, nbytes, ubuf, &ulen)) != 0) {
! 124: if (i == 1)
! 125: code = "Little-endian UTF-16 Unicode";
! 126: else
! 127: code = "Big-endian UTF-16 Unicode";
! 128:
! 129: type = "character data";
! 130: code_mime = "utf-16"; /* is this defined? */
! 131: } else if (looks_latin1(buf, nbytes, ubuf, &ulen)) {
! 132: code = "ISO-8859";
! 133: type = "text";
! 134: code_mime = "iso-8859-1";
! 135: } else if (looks_extended(buf, nbytes, ubuf, &ulen)) {
! 136: code = "Non-ISO extended-ASCII";
! 137: type = "text";
! 138: code_mime = "unknown";
! 139: } else {
! 140: from_ebcdic(buf, nbytes, nbuf);
! 141:
! 142: if (looks_ascii(nbuf, nbytes, ubuf, &ulen)) {
! 143: code = "EBCDIC";
! 144: type = "character data";
! 145: code_mime = "ebcdic";
! 146: } else if (looks_latin1(nbuf, nbytes, ubuf, &ulen)) {
! 147: code = "International EBCDIC";
! 148: type = "character data";
! 149: code_mime = "ebcdic";
! 150: } else {
! 151: return 0; /* doesn't look like text at all */
! 152: }
1.1 deraadt 153: }
154:
155: /*
156: * for troff, look for . + letter + letter or .\";
157: * this must be done to disambiguate tar archives' ./file
158: * and other trash from real troff input.
1.8 ! tedu 159: *
! 160: * I believe Plan 9 troff allows non-ASCII characters in the names
! 161: * of macros, so this test might possibly fail on such a file.
1.1 deraadt 162: */
1.8 ! tedu 163: if (*ubuf == '.') {
! 164: unichar *tp = ubuf + 1;
1.1 deraadt 165:
1.8 ! tedu 166: while (ISSPC(*tp))
1.1 deraadt 167: ++tp; /* skip leading whitespace */
1.8 ! tedu 168: if ((tp[0] == '\\' && tp[1] == '\"') ||
! 169: (isascii((unsigned char)tp[0]) &&
! 170: isalnum((unsigned char)tp[0]) &&
! 171: isascii((unsigned char)tp[1]) &&
! 172: isalnum((unsigned char)tp[1]) &&
! 173: ISSPC(tp[2]))) {
! 174: subtype_mime = "text/troff";
! 175: subtype = "troff or preprocessor input";
! 176: goto subtype_identified;
! 177: }
! 178: }
! 179:
! 180: if ((*buf == 'c' || *buf == 'C') && ISSPC(buf[1])) {
! 181: subtype_mime = "text/fortran";
! 182: subtype = "fortran program";
! 183: goto subtype_identified;
! 184: }
! 185:
! 186: /* look for tokens from names.h - this is expensive! */
! 187:
! 188: i = 0;
! 189: while (i < ulen) {
! 190: size_t end;
! 191:
! 192: /*
! 193: * skip past any leading space
! 194: */
! 195: while (i < ulen && ISSPC(ubuf[i]))
! 196: i++;
! 197: if (i >= ulen)
! 198: break;
! 199:
! 200: /*
! 201: * find the next whitespace
! 202: */
! 203: for (end = i + 1; end < nbytes; end++)
! 204: if (ISSPC(ubuf[end]))
! 205: break;
! 206:
! 207: /*
! 208: * compare the word thus isolated against the token list
! 209: */
! 210: for (p = names; p < names + NNAMES; p++) {
! 211: if (ascmatch((const unsigned char *)p->name, ubuf + i,
! 212: end - i)) {
! 213: subtype = types[p->type].human;
! 214: subtype_mime = types[p->type].mime;
! 215: goto subtype_identified;
! 216: }
! 217: }
! 218:
! 219: i = end;
! 220: }
! 221:
! 222: subtype_identified:
! 223:
! 224: /*
! 225: * Now try to discover other details about the file.
! 226: */
! 227: for (i = 0; i < ulen; i++) {
! 228: if (i > last_line_end + MAXLINELEN)
! 229: has_long_lines = 1;
! 230:
! 231: if (ubuf[i] == '\033')
! 232: has_escapes = 1;
! 233: if (ubuf[i] == '\b')
! 234: has_backspace = 1;
! 235:
! 236: if (ubuf[i] == '\r' && (i + 1 < ulen && ubuf[i + 1] == '\n')) {
! 237: n_crlf++;
! 238: last_line_end = i;
! 239: }
! 240: if (ubuf[i] == '\r' && (i + 1 >= ulen || ubuf[i + 1] != '\n')) {
! 241: n_cr++;
! 242: last_line_end = i;
! 243: }
! 244: if (ubuf[i] == '\n' && ((int)i - 1 < 0 || ubuf[i - 1] != '\r')){
! 245: n_lf++;
! 246: last_line_end = i;
! 247: }
! 248: if (ubuf[i] == 0x85) { /* X3.64/ECMA-43 "next line" character */
! 249: n_nel++;
! 250: last_line_end = i;
1.1 deraadt 251: }
252: }
1.8 ! tedu 253:
! 254: if ((ms->flags & MAGIC_MIME)) {
! 255: if (subtype_mime) {
! 256: if (file_printf(ms, subtype_mime) == -1)
! 257: return -1;
! 258: } else {
! 259: if (file_printf(ms, "text/plain") == -1)
! 260: return -1;
! 261: }
! 262:
! 263: if (code_mime) {
! 264: if (file_printf(ms, "; charset=") == -1)
! 265: return -1;
! 266: if (file_printf(ms, code_mime) == -1)
! 267: return -1;
! 268: }
! 269: } else {
! 270: if (file_printf(ms, code) == -1)
! 271: return -1;
! 272:
! 273: if (subtype) {
! 274: if (file_printf(ms, " ") == -1)
! 275: return -1;
! 276: if (file_printf(ms, subtype) == -1)
! 277: return -1;
! 278: }
! 279:
! 280: if (file_printf(ms, " ") == -1)
! 281: return -1;
! 282: if (file_printf(ms, type) == -1)
! 283: return -1;
! 284:
! 285: if (has_long_lines)
! 286: if (file_printf(ms, ", with very long lines") == -1)
! 287: return -1;
! 288:
! 289: /*
! 290: * Only report line terminators if we find one other than LF,
! 291: * or if we find none at all.
! 292: */
! 293: if ((n_crlf == 0 && n_cr == 0 && n_nel == 0 && n_lf == 0) ||
! 294: (n_crlf != 0 || n_cr != 0 || n_nel != 0)) {
! 295: if (file_printf(ms, ", with") == -1)
! 296: return -1;
! 297:
! 298: if (n_crlf == 0 && n_cr == 0 && n_nel == 0 && n_lf == 0) {
! 299: if (file_printf(ms, " no") == -1)
! 300: return -1;
! 301: } else {
! 302: if (n_crlf) {
! 303: if (file_printf(ms, " CRLF") == -1)
! 304: return -1;
! 305: if (n_cr || n_lf || n_nel)
! 306: if (file_printf(ms, ",") == -1)
! 307: return -1;
! 308: }
! 309: if (n_cr) {
! 310: if (file_printf(ms, " CR") == -1)
! 311: return -1;
! 312: if (n_lf || n_nel)
! 313: if (file_printf(ms, ",") == -1)
! 314: return -1;
! 315: }
! 316: if (n_lf) {
! 317: if (file_printf(ms, " LF") == -1)
! 318: return -1;
! 319: if (n_nel)
! 320: if (file_printf(ms, ",") == -1)
! 321: return -1;
! 322: }
! 323: if (n_nel)
! 324: if (file_printf(ms, " NEL") == -1)
! 325: return -1;
! 326: }
! 327:
! 328: if (file_printf(ms, " line terminators") == -1)
! 329: return -1;
! 330: }
! 331:
! 332: if (has_escapes)
! 333: if (file_printf(ms, ", with escape sequences") == -1)
! 334: return -1;
! 335: if (has_backspace)
! 336: if (file_printf(ms, ", with overstriking") == -1)
! 337: return -1;
! 338: }
! 339:
! 340: return 1;
! 341: }
! 342:
! 343: private int
! 344: ascmatch(const unsigned char *s, const unichar *us, size_t ulen)
! 345: {
! 346: size_t i;
! 347:
! 348: for (i = 0; i < ulen; i++) {
! 349: if (s[i] != us[i])
! 350: return 0;
! 351: }
! 352:
! 353: if (s[i])
! 354: return 0;
! 355: else
1.1 deraadt 356: return 1;
1.8 ! tedu 357: }
! 358:
! 359: /*
! 360: * This table reflects a particular philosophy about what constitutes
! 361: * "text," and there is room for disagreement about it.
! 362: *
! 363: * Version 3.31 of the file command considered a file to be ASCII if
! 364: * each of its characters was approved by either the isascii() or
! 365: * isalpha() function. On most systems, this would mean that any
! 366: * file consisting only of characters in the range 0x00 ... 0x7F
! 367: * would be called ASCII text, but many systems might reasonably
! 368: * consider some characters outside this range to be alphabetic,
! 369: * so the file command would call such characters ASCII. It might
! 370: * have been more accurate to call this "considered textual on the
! 371: * local system" than "ASCII."
! 372: *
! 373: * It considered a file to be "International language text" if each
! 374: * of its characters was either an ASCII printing character (according
! 375: * to the real ASCII standard, not the above test), a character in
! 376: * the range 0x80 ... 0xFF, or one of the following control characters:
! 377: * backspace, tab, line feed, vertical tab, form feed, carriage return,
! 378: * escape. No attempt was made to determine the language in which files
! 379: * of this type were written.
! 380: *
! 381: *
! 382: * The table below considers a file to be ASCII if all of its characters
! 383: * are either ASCII printing characters (again, according to the X3.4
! 384: * standard, not isascii()) or any of the following controls: bell,
! 385: * backspace, tab, line feed, form feed, carriage return, esc, nextline.
! 386: *
! 387: * I include bell because some programs (particularly shell scripts)
! 388: * use it literally, even though it is rare in normal text. I exclude
! 389: * vertical tab because it never seems to be used in real text. I also
! 390: * include, with hesitation, the X3.64/ECMA-43 control nextline (0x85),
! 391: * because that's what the dd EBCDIC->ASCII table maps the EBCDIC newline
! 392: * character to. It might be more appropriate to include it in the 8859
! 393: * set instead of the ASCII set, but it's got to be included in *something*
! 394: * we recognize or EBCDIC files aren't going to be considered textual.
! 395: * Some old Unix source files use SO/SI (^N/^O) to shift between Greek
! 396: * and Latin characters, so these should possibly be allowed. But they
! 397: * make a real mess on VT100-style displays if they're not paired properly,
! 398: * so we are probably better off not calling them text.
! 399: *
! 400: * A file is considered to be ISO-8859 text if its characters are all
! 401: * either ASCII, according to the above definition, or printing characters
! 402: * from the ISO-8859 8-bit extension, characters 0xA0 ... 0xFF.
! 403: *
! 404: * Finally, a file is considered to be international text from some other
! 405: * character code if its characters are all either ISO-8859 (according to
! 406: * the above definition) or characters in the range 0x80 ... 0x9F, which
! 407: * ISO-8859 considers to be control characters but the IBM PC and Macintosh
! 408: * consider to be printing characters.
! 409: */
! 410:
! 411: #define F 0 /* character never appears in text */
! 412: #define T 1 /* character appears in plain ASCII text */
! 413: #define I 2 /* character appears in ISO-8859 text */
! 414: #define X 3 /* character appears in non-ISO extended ASCII (Mac, IBM PC) */
! 415:
! 416: private char text_chars[256] = {
! 417: /* BEL BS HT LF FF CR */
! 418: F, F, F, F, F, F, F, T, T, T, T, F, T, T, F, F, /* 0x0X */
! 419: /* ESC */
! 420: F, F, F, F, F, F, F, F, F, F, F, T, F, F, F, F, /* 0x1X */
! 421: T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, /* 0x2X */
! 422: T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, /* 0x3X */
! 423: T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, /* 0x4X */
! 424: T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, /* 0x5X */
! 425: T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, /* 0x6X */
! 426: T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, F, /* 0x7X */
! 427: /* NEL */
! 428: X, X, X, X, X, T, X, X, X, X, X, X, X, X, X, X, /* 0x8X */
! 429: X, X, X, X, X, X, X, X, X, X, X, X, X, X, X, X, /* 0x9X */
! 430: I, I, I, I, I, I, I, I, I, I, I, I, I, I, I, I, /* 0xaX */
! 431: I, I, I, I, I, I, I, I, I, I, I, I, I, I, I, I, /* 0xbX */
! 432: I, I, I, I, I, I, I, I, I, I, I, I, I, I, I, I, /* 0xcX */
! 433: I, I, I, I, I, I, I, I, I, I, I, I, I, I, I, I, /* 0xdX */
! 434: I, I, I, I, I, I, I, I, I, I, I, I, I, I, I, I, /* 0xeX */
! 435: I, I, I, I, I, I, I, I, I, I, I, I, I, I, I, I /* 0xfX */
! 436: };
! 437:
! 438: private int
! 439: looks_ascii(const unsigned char *buf, size_t nbytes, unichar *ubuf,
! 440: size_t *ulen)
! 441: {
! 442: int i;
! 443:
! 444: *ulen = 0;
! 445:
! 446: for (i = 0; i < nbytes; i++) {
! 447: int t = text_chars[buf[i]];
! 448:
! 449: if (t != T)
! 450: return 0;
! 451:
! 452: ubuf[(*ulen)++] = buf[i];
1.1 deraadt 453: }
454:
1.8 ! tedu 455: return 1;
! 456: }
! 457:
! 458: private int
! 459: looks_latin1(const unsigned char *buf, size_t nbytes, unichar *ubuf, size_t *ulen)
! 460: {
! 461: int i;
! 462:
! 463: *ulen = 0;
1.3 millert 464:
465: for (i = 0; i < nbytes; i++) {
1.8 ! tedu 466: int t = text_chars[buf[i]];
! 467:
! 468: if (t != T && t != I)
! 469: return 0;
! 470:
! 471: ubuf[(*ulen)++] = buf[i];
1.3 millert 472: }
473:
1.8 ! tedu 474: return 1;
! 475: }
! 476:
! 477: private int
! 478: looks_extended(const unsigned char *buf, size_t nbytes, unichar *ubuf,
! 479: size_t *ulen)
! 480: {
! 481: int i;
! 482:
! 483: *ulen = 0;
! 484:
! 485: for (i = 0; i < nbytes; i++) {
! 486: int t = text_chars[buf[i]];
! 487:
! 488: if (t != T && t != I && t != X)
! 489: return 0;
! 490:
! 491: ubuf[(*ulen)++] = buf[i];
! 492: }
! 493:
! 494: return 1;
! 495: }
! 496:
! 497: private int
! 498: looks_utf8(const unsigned char *buf, size_t nbytes, unichar *ubuf, size_t *ulen)
! 499: {
! 500: int i, n;
! 501: unichar c;
! 502: int gotone = 0;
! 503:
! 504: *ulen = 0;
! 505:
! 506: for (i = 0; i < nbytes; i++) {
! 507: if ((buf[i] & 0x80) == 0) { /* 0xxxxxxx is plain ASCII */
! 508: /*
! 509: * Even if the whole file is valid UTF-8 sequences,
! 510: * still reject it if it uses weird control characters.
! 511: */
! 512:
! 513: if (text_chars[buf[i]] != T)
! 514: return 0;
! 515:
! 516: ubuf[(*ulen)++] = buf[i];
! 517: } else if ((buf[i] & 0x40) == 0) { /* 10xxxxxx never 1st byte */
! 518: return 0;
! 519: } else { /* 11xxxxxx begins UTF-8 */
! 520: int following;
! 521:
! 522: if ((buf[i] & 0x20) == 0) { /* 110xxxxx */
! 523: c = buf[i] & 0x1f;
! 524: following = 1;
! 525: } else if ((buf[i] & 0x10) == 0) { /* 1110xxxx */
! 526: c = buf[i] & 0x0f;
! 527: following = 2;
! 528: } else if ((buf[i] & 0x08) == 0) { /* 11110xxx */
! 529: c = buf[i] & 0x07;
! 530: following = 3;
! 531: } else if ((buf[i] & 0x04) == 0) { /* 111110xx */
! 532: c = buf[i] & 0x03;
! 533: following = 4;
! 534: } else if ((buf[i] & 0x02) == 0) { /* 1111110x */
! 535: c = buf[i] & 0x01;
! 536: following = 5;
! 537: } else
! 538: return 0;
! 539:
! 540: for (n = 0; n < following; n++) {
! 541: i++;
! 542: if (i >= nbytes)
! 543: goto done;
! 544:
! 545: if ((buf[i] & 0x80) == 0 || (buf[i] & 0x40))
! 546: return 0;
! 547:
! 548: c = (c << 6) + (buf[i] & 0x3f);
1.1 deraadt 549: }
1.8 ! tedu 550:
! 551: ubuf[(*ulen)++] = c;
! 552: gotone = 1;
1.1 deraadt 553: }
554: }
1.8 ! tedu 555: done:
! 556: return gotone; /* don't claim it's UTF-8 if it's all 7-bit */
! 557: }
! 558:
! 559: private int
! 560: looks_unicode(const unsigned char *buf, size_t nbytes, unichar *ubuf,
! 561: size_t *ulen)
! 562: {
! 563: int bigend;
! 564: int i;
1.1 deraadt 565:
1.8 ! tedu 566: if (nbytes < 2)
! 567: return 0;
! 568:
! 569: if (buf[0] == 0xff && buf[1] == 0xfe)
! 570: bigend = 0;
! 571: else if (buf[0] == 0xfe && buf[1] == 0xff)
! 572: bigend = 1;
! 573: else
! 574: return 0;
! 575:
! 576: *ulen = 0;
! 577:
! 578: for (i = 2; i + 1 < nbytes; i += 2) {
! 579: /* XXX fix to properly handle chars > 65536 */
! 580:
! 581: if (bigend)
! 582: ubuf[(*ulen)++] = buf[i + 1] + 256 * buf[i];
! 583: else
! 584: ubuf[(*ulen)++] = buf[i] + 256 * buf[i + 1];
! 585:
! 586: if (ubuf[*ulen - 1] == 0xfffe)
! 587: return 0;
! 588: if (ubuf[*ulen - 1] < 128 &&
! 589: text_chars[(size_t)ubuf[*ulen - 1]] != T)
! 590: return 0;
1.1 deraadt 591: }
1.8 ! tedu 592:
! 593: return 1 + bigend;
1.1 deraadt 594: }
595:
/*
 * The single-letter classification codes are used only by the
 * text_chars table and the looks_*() routines that precede this point.
 */
#undef F
#undef T
#undef I
#undef X
! 600:
! 601: /*
! 602: * This table maps each EBCDIC character to an (8-bit extended) ASCII
! 603: * character, as specified in the rationale for the dd(1) command in
! 604: * draft 11.2 (September, 1991) of the POSIX P1003.2 standard.
! 605: *
! 606: * Unfortunately it does not seem to correspond exactly to any of the
! 607: * five variants of EBCDIC documented in IBM's _Enterprise Systems
! 608: * Architecture/390: Principles of Operation_, SA22-7201-06, Seventh
! 609: * Edition, July, 1999, pp. I-1 - I-4.
! 610: *
! 611: * Fortunately, though, all versions of EBCDIC, including this one, agree
! 612: * on most of the printing characters that also appear in (7-bit) ASCII.
! 613: * Of these, only '|', '!', '~', '^', '[', and ']' are in question at all.
! 614: *
! 615: * Fortunately too, there is general agreement that codes 0x00 through
! 616: * 0x3F represent control characters, 0x41 a nonbreaking space, and the
! 617: * remainder printing characters.
! 618: *
! 619: * This is sufficient to allow us to identify EBCDIC text and to distinguish
! 620: * between old-style and internationalized examples of text.
! 621: */
! 622:
! 623: private unsigned char ebcdic_to_ascii[] = {
! 624: 0, 1, 2, 3, 156, 9, 134, 127, 151, 141, 142, 11, 12, 13, 14, 15,
! 625: 16, 17, 18, 19, 157, 133, 8, 135, 24, 25, 146, 143, 28, 29, 30, 31,
! 626: 128, 129, 130, 131, 132, 10, 23, 27, 136, 137, 138, 139, 140, 5, 6, 7,
! 627: 144, 145, 22, 147, 148, 149, 150, 4, 152, 153, 154, 155, 20, 21, 158, 26,
! 628: ' ', 160, 161, 162, 163, 164, 165, 166, 167, 168, 213, '.', '<', '(', '+', '|',
! 629: '&', 169, 170, 171, 172, 173, 174, 175, 176, 177, '!', '$', '*', ')', ';', '~',
! 630: '-', '/', 178, 179, 180, 181, 182, 183, 184, 185, 203, ',', '%', '_', '>', '?',
! 631: 186, 187, 188, 189, 190, 191, 192, 193, 194, '`', ':', '#', '@', '\'','=', '"',
! 632: 195, 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 196, 197, 198, 199, 200, 201,
! 633: 202, 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', '^', 204, 205, 206, 207, 208,
! 634: 209, 229, 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', 210, 211, 212, '[', 214, 215,
! 635: 216, 217, 218, 219, 220, 221, 222, 223, 224, 225, 226, 227, 228, ']', 230, 231,
! 636: '{', 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 232, 233, 234, 235, 236, 237,
! 637: '}', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 238, 239, 240, 241, 242, 243,
! 638: '\\',159, 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', 244, 245, 246, 247, 248, 249,
! 639: '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', 250, 251, 252, 253, 254, 255
! 640: };
! 641:
#ifdef notdef
/*
 * The following EBCDIC-to-ASCII table may relate more closely to reality,
 * or at least to modern reality.  It comes from
 *
 *   http://ftp.s390.ibm.com/products/oe/bpxqp9.html
 *
 * and maps the characters of EBCDIC code page 1047 (the code used for
 * Unix-derived software on IBM's 390 systems) to the corresponding
 * characters from ISO 8859-1.
 *
 * If this table is used instead of the above one, some of the special
 * cases for the NEL character can be taken out of the code.
 *
 * Declared const with an explicit size of 256, like the table that is
 * actually compiled in.
 */

private const unsigned char ebcdic_1047_to_8859[256] = {
	0x00,0x01,0x02,0x03,0x9C,0x09,0x86,0x7F,0x97,0x8D,0x8E,0x0B,0x0C,0x0D,0x0E,0x0F,
	0x10,0x11,0x12,0x13,0x9D,0x0A,0x08,0x87,0x18,0x19,0x92,0x8F,0x1C,0x1D,0x1E,0x1F,
	0x80,0x81,0x82,0x83,0x84,0x85,0x17,0x1B,0x88,0x89,0x8A,0x8B,0x8C,0x05,0x06,0x07,
	0x90,0x91,0x16,0x93,0x94,0x95,0x96,0x04,0x98,0x99,0x9A,0x9B,0x14,0x15,0x9E,0x1A,
	0x20,0xA0,0xE2,0xE4,0xE0,0xE1,0xE3,0xE5,0xE7,0xF1,0xA2,0x2E,0x3C,0x28,0x2B,0x7C,
	0x26,0xE9,0xEA,0xEB,0xE8,0xED,0xEE,0xEF,0xEC,0xDF,0x21,0x24,0x2A,0x29,0x3B,0x5E,
	0x2D,0x2F,0xC2,0xC4,0xC0,0xC1,0xC3,0xC5,0xC7,0xD1,0xA6,0x2C,0x25,0x5F,0x3E,0x3F,
	0xF8,0xC9,0xCA,0xCB,0xC8,0xCD,0xCE,0xCF,0xCC,0x60,0x3A,0x23,0x40,0x27,0x3D,0x22,
	0xD8,0x61,0x62,0x63,0x64,0x65,0x66,0x67,0x68,0x69,0xAB,0xBB,0xF0,0xFD,0xFE,0xB1,
	0xB0,0x6A,0x6B,0x6C,0x6D,0x6E,0x6F,0x70,0x71,0x72,0xAA,0xBA,0xE6,0xB8,0xC6,0xA4,
	0xB5,0x7E,0x73,0x74,0x75,0x76,0x77,0x78,0x79,0x7A,0xA1,0xBF,0xD0,0x5B,0xDE,0xAE,
	0xAC,0xA3,0xA5,0xB7,0xA9,0xA7,0xB6,0xBC,0xBD,0xBE,0xDD,0xA8,0xAF,0x5D,0xB4,0xD7,
	0x7B,0x41,0x42,0x43,0x44,0x45,0x46,0x47,0x48,0x49,0xAD,0xF4,0xF6,0xF2,0xF3,0xF5,
	0x7D,0x4A,0x4B,0x4C,0x4D,0x4E,0x4F,0x50,0x51,0x52,0xB9,0xFB,0xFC,0xF9,0xFA,0xFF,
	0x5C,0xF7,0x53,0x54,0x55,0x56,0x57,0x58,0x59,0x5A,0xB2,0xD4,0xD6,0xD2,0xD3,0xD5,
	0x30,0x31,0x32,0x33,0x34,0x35,0x36,0x37,0x38,0x39,0xB3,0xDB,0xDC,0xD9,0xDA,0x9F
};
#endif
! 676:
! 677: /*
! 678: * Copy buf[0 ... nbytes-1] into out[], translating EBCDIC to ASCII.
! 679: */
! 680: private void
! 681: from_ebcdic(const unsigned char *buf, size_t nbytes, unsigned char *out)
! 682: {
! 683: int i;
! 684:
! 685: for (i = 0; i < nbytes; i++) {
! 686: out[i] = ebcdic_to_ascii[buf[i]];
! 687: }
! 688: }