=================================================================== RCS file: /cvsrepo/anoncvs/cvs/src/usr.bin/file/Attic/ascmagic.c,v retrieving revision 1.9 retrieving revision 1.10 diff -c -r1.9 -r1.10 *** src/usr.bin/file/Attic/ascmagic.c 2008/05/08 01:40:56 1.9 --- src/usr.bin/file/Attic/ascmagic.c 2009/04/24 18:54:34 1.10 *************** *** 1,4 **** ! /* $OpenBSD: ascmagic.c,v 1.9 2008/05/08 01:40:56 chl Exp $ */ /* * Copyright (c) Ian F. Darwin 1986-1995. * Software written by Ian F. Darwin and others; --- 1,4 ---- ! /* $OpenBSD: ascmagic.c,v 1.10 2009/04/24 18:54:34 chl Exp $ */ /* * Copyright (c) Ian F. Darwin 1986-1995. * Software written by Ian F. Darwin and others; *************** *** 50,82 **** #include "names.h" #ifndef lint ! FILE_RCSID("@(#)$Id: ascmagic.c,v 1.9 2008/05/08 01:40:56 chl Exp $") #endif /* lint */ - typedef unsigned long unichar; - #define MAXLINELEN 300 /* longest sane line length */ #define ISSPC(x) ((x) == ' ' || (x) == '\t' || (x) == '\r' || (x) == '\n' \ || (x) == 0x85 || (x) == '\f') private int looks_ascii(const unsigned char *, size_t, unichar *, size_t *); ! private int looks_utf8(const unsigned char *, size_t, unichar *, size_t *); ! private int looks_unicode(const unsigned char *, size_t, unichar *, size_t *); private int looks_latin1(const unsigned char *, size_t, unichar *, size_t *); private int looks_extended(const unsigned char *, size_t, unichar *, size_t *); private void from_ebcdic(const unsigned char *, size_t, unsigned char *); private int ascmatch(const unsigned char *, const unichar *, size_t); protected int file_ascmagic(struct magic_set *ms, const unsigned char *buf, size_t nbytes) { size_t i; ! unsigned char *nbuf = NULL; unichar *ubuf = NULL; ! size_t ulen; ! struct names *p; int rv = -1; const char *code = NULL; const char *code_mime = NULL; --- 50,84 ---- #include "names.h" #ifndef lint ! FILE_RCSID("@(#)$Id: ascmagic.c,v 1.10 2009/04/24 18:54:34 chl Exp $") #endif /* lint */ #define MAXLINELEN 300 /* longest sane line length */ #define ISSPC(x) ((x) == ' ' || (x) == '\t' || (x) == '\r' || (x) == '\n' \ || (x) == 0x85 || (x) == '\f') private int looks_ascii(const unsigned char *, size_t, unichar *, size_t *); ! private int looks_utf8_with_BOM(const unsigned char *, size_t, unichar *, ! size_t *); ! protected int file_looks_utf8(const unsigned char *, size_t, unichar *, size_t *); ! private int looks_ucs16(const unsigned char *, size_t, unichar *, size_t *); private int looks_latin1(const unsigned char *, size_t, unichar *, size_t *); private int looks_extended(const unsigned char *, size_t, unichar *, size_t *); private void from_ebcdic(const unsigned char *, size_t, unsigned char *); private int ascmatch(const unsigned char *, const unichar *, size_t); + private unsigned char *encode_utf8(unsigned char *, size_t, unichar *, size_t); protected int file_ascmagic(struct magic_set *ms, const unsigned char *buf, size_t nbytes) { size_t i; ! unsigned char *nbuf = NULL, *utf8_buf = NULL, *utf8_end; unichar *ubuf = NULL; ! size_t ulen, mlen; ! const struct names *p; int rv = -1; + int mime = ms->flags & MAGIC_MIME; const char *code = NULL; const char *code_mime = NULL; *************** *** 118,128 **** code = "ASCII"; code_mime = "us-ascii"; type = "text"; ! } else if (looks_utf8(buf, nbytes, ubuf, &ulen)) { code = "UTF-8 Unicode"; code_mime = "utf-8"; type = "text"; ! } else if ((i = looks_unicode(buf, nbytes, ubuf, &ulen)) != 0) { if (i == 1) code = "Little-endian UTF-16 Unicode"; else --- 120,134 ---- code = "ASCII"; code_mime = "us-ascii"; type = "text"; ! } else if (looks_utf8_with_BOM(buf, nbytes, ubuf, &ulen) > 0) { ! code = "UTF-8 Unicode (with BOM)"; ! code_mime = "utf-8"; ! type = "text"; ! } else if (file_looks_utf8(buf, nbytes, ubuf, &ulen) > 1) { code = "UTF-8 Unicode"; code_mime = "utf-8"; type = "text"; ! } else if ((i = looks_ucs16(buf, nbytes, ubuf, &ulen)) != 0) { if (i == 1) code = "Little-endian UTF-16 Unicode"; else *************** *** 160,199 **** goto done; } ! /* ! * for troff, look for . + letter + letter or .\"; ! * this must be done to disambiguate tar archives' ./file ! * and other trash from real troff input. ! * ! * I believe Plan 9 troff allows non-ASCII characters in the names ! * of macros, so this test might possibly fail on such a file. ! */ ! if ((ms->flags & MAGIC_NO_CHECK_TROFF) == 0 && *ubuf == '.') { ! unichar *tp = ubuf + 1; ! ! while (ISSPC(*tp)) ! ++tp; /* skip leading whitespace */ ! if ((tp[0] == '\\' && tp[1] == '\"') || ! (isascii((unsigned char)tp[0]) && ! isalnum((unsigned char)tp[0]) && ! isascii((unsigned char)tp[1]) && ! isalnum((unsigned char)tp[1]) && ! ISSPC(tp[2]))) { ! subtype_mime = "text/troff"; ! subtype = "troff or preprocessor input"; ! goto subtype_identified; ! } } ! ! if ((ms->flags & MAGIC_NO_CHECK_FORTRAN) == 0 && ! (*buf == 'c' || *buf == 'C') && ISSPC(buf[1])) { ! subtype_mime = "text/fortran"; ! subtype = "fortran program"; ! goto subtype_identified; } /* look for tokens from names.h - this is expensive! */ - if ((ms->flags & MAGIC_NO_CHECK_TOKENS) != 0) goto subtype_identified; --- 166,190 ---- goto done; } ! /* Convert ubuf to UTF-8 and try text soft magic */ ! /* If original was ASCII or UTF-8, could use nbuf instead of ! re-converting. */ ! /* malloc size is a conservative overestimate; could be ! re-converting improved, or at least realloced after ! re-converting conversion. */ ! mlen = ulen * 6; ! if ((utf8_buf = malloc(mlen)) == NULL) { ! file_oomem(ms, mlen); ! goto done; } ! if ((utf8_end = encode_utf8(utf8_buf, mlen, ubuf, ulen)) == NULL) ! goto done; ! if (file_softmagic(ms, utf8_buf, utf8_end - utf8_buf, TEXTTEST) != 0) { ! rv = 1; ! goto done; } /* look for tokens from names.h - this is expensive! */ if ((ms->flags & MAGIC_NO_CHECK_TOKENS) != 0) goto subtype_identified; *************** *** 201,224 **** while (i < ulen) { size_t end; ! /* ! * skip past any leading space ! */ while (i < ulen && ISSPC(ubuf[i])) i++; if (i >= ulen) break; ! /* ! * find the next whitespace ! */ for (end = i + 1; end < nbytes; end++) if (ISSPC(ubuf[end])) break; ! /* ! * compare the word thus isolated against the token list ! */ for (p = names; p < names + NNAMES; p++) { if (ascmatch((const unsigned char *)p->name, ubuf + i, end - i)) { --- 192,209 ---- while (i < ulen) { size_t end; ! /* skip past any leading space */ while (i < ulen && ISSPC(ubuf[i])) i++; if (i >= ulen) break; ! /* find the next whitespace */ for (end = i + 1; end < nbytes; end++) if (ISSPC(ubuf[end])) break; ! /* compare the word thus isolated against the token list */ for (p = names; p < names + NNAMES; p++) { if (ascmatch((const unsigned char *)p->name, ubuf + i, end - i)) { *************** *** 233,241 **** subtype_identified: ! /* ! * Now try to discover other details about the file. ! */ for (i = 0; i < ulen; i++) { if (ubuf[i] == '\n') { if (seen_cr) --- 218,224 ---- subtype_identified: ! /* Now try to discover other details about the file. */ for (i = 0; i < ulen; i++) { if (ubuf[i] == '\n') { if (seen_cr) *************** *** 272,292 **** if (seen_cr && nbytes < HOWMANY) n_cr++; ! if ((ms->flags & MAGIC_MIME)) { ! if (subtype_mime) { ! if (file_printf(ms, subtype_mime) == -1) ! goto done; ! } else { ! if (file_printf(ms, "text/plain") == -1) ! goto done; } ! if (code_mime) { ! if (file_printf(ms, "; charset=") == -1) goto done; if (file_printf(ms, code_mime) == -1) goto done; } } else { if (file_printf(ms, code) == -1) goto done; --- 255,281 ---- if (seen_cr && nbytes < HOWMANY) n_cr++; ! if (mime) { ! if (mime & MAGIC_MIME_TYPE) { ! if (subtype_mime) { ! if (file_printf(ms, subtype_mime) == -1) ! goto done; ! } else { ! if (file_printf(ms, "text/plain") == -1) ! goto done; ! } } ! if ((mime == 0 || mime == MAGIC_MIME) && code_mime) { ! if ((mime & MAGIC_MIME_TYPE) && ! file_printf(ms, " charset=") == -1) goto done; if (file_printf(ms, code_mime) == -1) goto done; } + + if (mime == MAGIC_MIME_ENCODING) + file_printf(ms, "binary"); } else { if (file_printf(ms, code) == -1) goto done; *************** *** 363,368 **** --- 352,359 ---- free(nbuf); if (ubuf) free(ubuf); + if (utf8_buf) + free(utf8_buf); return rv; } *************** *** 521,535 **** return 1; } ! private int ! looks_utf8(const unsigned char *buf, size_t nbytes, unichar *ubuf, size_t *ulen) { size_t i; int n; unichar c; ! int gotone = 0; ! *ulen = 0; for (i = 0; i < nbytes; i++) { if ((buf[i] & 0x80) == 0) { /* 0xxxxxxx is plain ASCII */ --- 512,595 ---- return 1; } ! /* ! * Encode Unicode string as UTF-8, returning pointer to character ! * after end of string, or NULL if an invalid character is found. ! */ ! private unsigned char * ! encode_utf8(unsigned char *buf, size_t len, unichar *ubuf, size_t ulen) { size_t i; + unsigned char *end = buf + len; + + for (i = 0; i < ulen; i++) { + if (ubuf[i] <= 0x7f) { + if (end - buf < 1) + return NULL; + *buf++ = (unsigned char)ubuf[i]; + } else if (ubuf[i] <= 0x7ff) { + if (end - buf < 2) + return NULL; + *buf++ = (unsigned char)((ubuf[i] >> 6) + 0xc0); + *buf++ = (unsigned char)((ubuf[i] & 0x3f) + 0x80); + } else if (ubuf[i] <= 0xffff) { + if (end - buf < 3) + return NULL; + *buf++ = (unsigned char)((ubuf[i] >> 12) + 0xe0); + *buf++ = (unsigned char)(((ubuf[i] >> 6) & 0x3f) + 0x80); + *buf++ = (unsigned char)((ubuf[i] & 0x3f) + 0x80); + } else if (ubuf[i] <= 0x1fffff) { + if (end - buf < 4) + return NULL; + *buf++ = (unsigned char)((ubuf[i] >> 18) + 0xf0); + *buf++ = (unsigned char)(((ubuf[i] >> 12) & 0x3f) + 0x80); + *buf++ = (unsigned char)(((ubuf[i] >> 6) & 0x3f) + 0x80); + *buf++ = (unsigned char)((ubuf[i] & 0x3f) + 0x80); + } else if (ubuf[i] <= 0x3ffffff) { + if (end - buf < 5) + return NULL; + *buf++ = (unsigned char)((ubuf[i] >> 24) + 0xf8); + *buf++ = (unsigned char)(((ubuf[i] >> 18) & 0x3f) + 0x80); + *buf++ = (unsigned char)(((ubuf[i] >> 12) & 0x3f) + 0x80); + *buf++ = (unsigned char)(((ubuf[i] >> 6) & 0x3f) + 0x80); + *buf++ = (unsigned char)((ubuf[i] & 0x3f) + 0x80); + } else if (ubuf[i] <= 0x7fffffff) { + if (end - buf < 6) + return NULL; + *buf++ = (unsigned char)((ubuf[i] >> 30) + 0xfc); + *buf++ = (unsigned char)(((ubuf[i] >> 24) & 0x3f) + 0x80); + *buf++ = (unsigned char)(((ubuf[i] >> 18) & 0x3f) + 0x80); + *buf++ = (unsigned char)(((ubuf[i] >> 12) & 0x3f) + 0x80); + *buf++ = (unsigned char)(((ubuf[i] >> 6) & 0x3f) + 0x80); + *buf++ = (unsigned char)((ubuf[i] & 0x3f) + 0x80); + } else /* Invalid character */ + return NULL; + } + + return buf; + } + + /* + * Decide whether some text looks like UTF-8. Returns: + * + * -1: invalid UTF-8 + * 0: uses odd control characters, so doesn't look like text + * 1: 7-bit text + * 2: definitely UTF-8 text (valid high-bit set bytes) + * + * If ubuf is non-NULL on entry, text is decoded into ubuf, *ulen; + * ubuf must be big enough! + */ + protected int + file_looks_utf8(const unsigned char *buf, size_t nbytes, unichar *ubuf, size_t *ulen) + { + size_t i; int n; unichar c; ! int gotone = 0, ctrl = 0; ! if (ubuf) ! *ulen = 0; for (i = 0; i < nbytes; i++) { if ((buf[i] & 0x80) == 0) { /* 0xxxxxxx is plain ASCII */ *************** *** 539,549 **** */ if (text_chars[buf[i]] != T) ! return 0; ! ubuf[(*ulen)++] = buf[i]; } else if ((buf[i] & 0x40) == 0) { /* 10xxxxxx never 1st byte */ ! return 0; } else { /* 11xxxxxx begins UTF-8 */ int following; --- 599,610 ---- */ if (text_chars[buf[i]] != T) ! ctrl = 1; ! if (ubuf) ! ubuf[(*ulen)++] = buf[i]; } else if ((buf[i] & 0x40) == 0) { /* 10xxxxxx never 1st byte */ ! return -1; } else { /* 11xxxxxx begins UTF-8 */ int following; *************** *** 563,569 **** c = buf[i] & 0x01; following = 5; } else ! return 0; for (n = 0; n < following; n++) { i++; --- 624,630 ---- c = buf[i] & 0x01; following = 5; } else ! return -1; for (n = 0; n < following; n++) { i++; *************** *** 571,591 **** goto done; if ((buf[i] & 0x80) == 0 || (buf[i] & 0x40)) ! return 0; c = (c << 6) + (buf[i] & 0x3f); } ! ubuf[(*ulen)++] = c; gotone = 1; } } done: ! return gotone; /* don't claim it's UTF-8 if it's all 7-bit */ } private int ! looks_unicode(const unsigned char *buf, size_t nbytes, unichar *ubuf, size_t *ulen) { int bigend; --- 632,668 ---- goto done; if ((buf[i] & 0x80) == 0 || (buf[i] & 0x40)) ! return -1; c = (c << 6) + (buf[i] & 0x3f); } ! if (ubuf) ! ubuf[(*ulen)++] = c; gotone = 1; } } done: ! return ctrl ? 0 : (gotone ? 2 : 1); } + /* + * Decide whether some text looks like UTF-8 with BOM. If there is no + * BOM, return -1; otherwise return the result of looks_utf8 on the + * rest of the text. + */ private int ! looks_utf8_with_BOM(const unsigned char *buf, size_t nbytes, unichar *ubuf, ! size_t *ulen) ! { ! if (nbytes > 3 && buf[0] == 0xef && buf[1] == 0xbb && buf[2] == 0xbf) ! return file_looks_utf8(buf + 3, nbytes - 3, ubuf, ulen); ! else ! return -1; ! } ! ! private int ! looks_ucs16(const unsigned char *buf, size_t nbytes, unichar *ubuf, size_t *ulen) { int bigend;