===================================================================
RCS file: /cvsrepo/anoncvs/cvs/src/usr.bin/file/Attic/ascmagic.c,v
retrieving revision 1.9
retrieving revision 1.10
diff -c -r1.9 -r1.10
*** src/usr.bin/file/Attic/ascmagic.c	2008/05/08 01:40:56	1.9
--- src/usr.bin/file/Attic/ascmagic.c	2009/04/24 18:54:34	1.10
***************
*** 1,4 ****
! /*	$OpenBSD: ascmagic.c,v 1.9 2008/05/08 01:40:56 chl Exp $ */
  /*
   * Copyright (c) Ian F. Darwin 1986-1995.
   * Software written by Ian F. Darwin and others;
--- 1,4 ----
! /*	$OpenBSD: ascmagic.c,v 1.10 2009/04/24 18:54:34 chl Exp $ */
  /*
   * Copyright (c) Ian F. Darwin 1986-1995.
   * Software written by Ian F. Darwin and others;
***************
*** 50,82 ****
  #include "names.h"
  
  #ifndef	lint
! FILE_RCSID("@(#)$Id: ascmagic.c,v 1.9 2008/05/08 01:40:56 chl Exp $")
  #endif	/* lint */
  
- typedef unsigned long unichar;
- 
  #define MAXLINELEN 300	/* longest sane line length */
  #define ISSPC(x) ((x) == ' ' || (x) == '\t' || (x) == '\r' || (x) == '\n' \
  		  || (x) == 0x85 || (x) == '\f')
  
  private int looks_ascii(const unsigned char *, size_t, unichar *, size_t *);
! private int looks_utf8(const unsigned char *, size_t, unichar *, size_t *);
! private int looks_unicode(const unsigned char *, size_t, unichar *, size_t *);
  private int looks_latin1(const unsigned char *, size_t, unichar *, size_t *);
  private int looks_extended(const unsigned char *, size_t, unichar *, size_t *);
  private void from_ebcdic(const unsigned char *, size_t, unsigned char *);
  private int ascmatch(const unsigned char *, const unichar *, size_t);
  
  
  protected int
  file_ascmagic(struct magic_set *ms, const unsigned char *buf, size_t nbytes)
  {
  	size_t i;
! 	unsigned char *nbuf = NULL;
  	unichar *ubuf = NULL;	
! 	size_t ulen;
! 	struct names *p;
  	int rv = -1;
  
  	const char *code = NULL;
  	const char *code_mime = NULL;
--- 50,84 ----
  #include "names.h"
  
  #ifndef	lint
! FILE_RCSID("@(#)$Id: ascmagic.c,v 1.10 2009/04/24 18:54:34 chl Exp $")
  #endif	/* lint */
  
  #define MAXLINELEN 300	/* longest sane line length */
  #define ISSPC(x) ((x) == ' ' || (x) == '\t' || (x) == '\r' || (x) == '\n' \
  		  || (x) == 0x85 || (x) == '\f')
  
  private int looks_ascii(const unsigned char *, size_t, unichar *, size_t *);
! private int looks_utf8_with_BOM(const unsigned char *, size_t, unichar *,
!     size_t *);
! protected int file_looks_utf8(const unsigned char *, size_t, unichar *, size_t *);
! private int looks_ucs16(const unsigned char *, size_t, unichar *, size_t *);
  private int looks_latin1(const unsigned char *, size_t, unichar *, size_t *);
  private int looks_extended(const unsigned char *, size_t, unichar *, size_t *);
  private void from_ebcdic(const unsigned char *, size_t, unsigned char *);
  private int ascmatch(const unsigned char *, const unichar *, size_t);
+ private unsigned char *encode_utf8(unsigned char *, size_t, unichar *, size_t);
  
  
  protected int
  file_ascmagic(struct magic_set *ms, const unsigned char *buf, size_t nbytes)
  {
  	size_t i;
! 	unsigned char *nbuf = NULL, *utf8_buf = NULL, *utf8_end;
  	unichar *ubuf = NULL;	
! 	size_t ulen, mlen;
! 	const struct names *p;
  	int rv = -1;
+ 	int mime = ms->flags & MAGIC_MIME;
  
  	const char *code = NULL;
  	const char *code_mime = NULL;
***************
*** 118,128 ****
  		code = "ASCII";
  		code_mime = "us-ascii";
  		type = "text";
! 	} else if (looks_utf8(buf, nbytes, ubuf, &ulen)) {
  		code = "UTF-8 Unicode";
  		code_mime = "utf-8";
  		type = "text";
! 	} else if ((i = looks_unicode(buf, nbytes, ubuf, &ulen)) != 0) {
  		if (i == 1)
  			code = "Little-endian UTF-16 Unicode";
  		else
--- 120,134 ----
  		code = "ASCII";
  		code_mime = "us-ascii";
  		type = "text";
! 	} else if (looks_utf8_with_BOM(buf, nbytes, ubuf, &ulen) > 0) {
! 		code = "UTF-8 Unicode (with BOM)";
! 		code_mime = "utf-8";
! 		type = "text";
! 	} else if (file_looks_utf8(buf, nbytes, ubuf, &ulen) > 1) {
  		code = "UTF-8 Unicode";
  		code_mime = "utf-8";
  		type = "text";
! 	} else if ((i = looks_ucs16(buf, nbytes, ubuf, &ulen)) != 0) {
  		if (i == 1)
  			code = "Little-endian UTF-16 Unicode";
  		else
***************
*** 160,199 ****
  		goto done;
  	}
  
! 	/*
! 	 * for troff, look for . + letter + letter or .\";
! 	 * this must be done to disambiguate tar archives' ./file
! 	 * and other trash from real troff input.
! 	 *
! 	 * I believe Plan 9 troff allows non-ASCII characters in the names
! 	 * of macros, so this test might possibly fail on such a file.
! 	 */
! 	if ((ms->flags & MAGIC_NO_CHECK_TROFF) == 0 && *ubuf == '.') {
! 		unichar *tp = ubuf + 1;
! 
! 		while (ISSPC(*tp))
! 			++tp;	/* skip leading whitespace */
! 		if ((tp[0] == '\\' && tp[1] == '\"') ||
! 		    (isascii((unsigned char)tp[0]) &&
! 		     isalnum((unsigned char)tp[0]) &&
! 		     isascii((unsigned char)tp[1]) &&
! 		     isalnum((unsigned char)tp[1]) &&
! 		     ISSPC(tp[2]))) {
! 			subtype_mime = "text/troff";
! 			subtype = "troff or preprocessor input";
! 			goto subtype_identified;
! 		}
  	}
! 
! 	if ((ms->flags & MAGIC_NO_CHECK_FORTRAN) == 0 &&
! 	    (*buf == 'c' || *buf == 'C') && ISSPC(buf[1])) {
! 		subtype_mime = "text/fortran";
! 		subtype = "fortran program";
! 		goto subtype_identified;
  	}
  
  	/* look for tokens from names.h - this is expensive! */
- 
  	if ((ms->flags & MAGIC_NO_CHECK_TOKENS) != 0)
  		goto subtype_identified;
  
--- 166,190 ----
  		goto done;
  	}
  
! 	/* Convert ubuf to UTF-8 and try text soft magic */
! 	/* If original was ASCII or UTF-8, could use nbuf instead of
! 	   re-converting. */
! 	/* malloc size is a conservative overestimate; it could
! 	   be improved, or at least the buffer realloced after
! 	   conversion. */
! 	mlen = ulen * 6;
! 	if ((utf8_buf = malloc(mlen)) == NULL) {
! 		file_oomem(ms, mlen);
! 		goto done;
  	}
! 	if ((utf8_end = encode_utf8(utf8_buf, mlen, ubuf, ulen)) == NULL)
! 		goto done;
! 	if (file_softmagic(ms, utf8_buf, utf8_end - utf8_buf, TEXTTEST) != 0) {
! 		rv = 1;
! 		goto done;
  	}
  
  	/* look for tokens from names.h - this is expensive! */
  	if ((ms->flags & MAGIC_NO_CHECK_TOKENS) != 0)
  		goto subtype_identified;
  
***************
*** 201,224 ****
  	while (i < ulen) {
  		size_t end;
  
! 		/*
! 		 * skip past any leading space
! 		 */
  		while (i < ulen && ISSPC(ubuf[i]))
  			i++;
  		if (i >= ulen)
  			break;
  
! 		/*
! 		 * find the next whitespace
! 		 */
  		for (end = i + 1; end < nbytes; end++)
  			if (ISSPC(ubuf[end]))
  				break;
  
! 		/*
! 		 * compare the word thus isolated against the token list
! 		 */
  		for (p = names; p < names + NNAMES; p++) {
  			if (ascmatch((const unsigned char *)p->name, ubuf + i,
  			    end - i)) {
--- 192,209 ----
  	while (i < ulen) {
  		size_t end;
  
! 		/* skip past any leading space */
  		while (i < ulen && ISSPC(ubuf[i]))
  			i++;
  		if (i >= ulen)
  			break;
  
! 		/* find the next whitespace */
  		for (end = i + 1; end < nbytes; end++)
  			if (ISSPC(ubuf[end]))
  				break;
  
! 		/* compare the word thus isolated against the token list */
  		for (p = names; p < names + NNAMES; p++) {
  			if (ascmatch((const unsigned char *)p->name, ubuf + i,
  			    end - i)) {
***************
*** 233,241 ****
  
  subtype_identified:
  
! 	/*
! 	 * Now try to discover other details about the file.
! 	 */
  	for (i = 0; i < ulen; i++) {
  		if (ubuf[i] == '\n') {
  			if (seen_cr)
--- 218,224 ----
  
  subtype_identified:
  
! 	/* Now try to discover other details about the file. */
  	for (i = 0; i < ulen; i++) {
  		if (ubuf[i] == '\n') {
  			if (seen_cr)
***************
*** 272,292 ****
  	if (seen_cr && nbytes < HOWMANY)
  		n_cr++;
  
! 	if ((ms->flags & MAGIC_MIME)) {
! 		if (subtype_mime) {
! 			if (file_printf(ms, subtype_mime) == -1)
! 				goto done;
! 		} else {
! 			if (file_printf(ms, "text/plain") == -1)
! 				goto done;
  		}
  
! 		if (code_mime) {
! 			if (file_printf(ms, "; charset=") == -1)
  				goto done;
  			if (file_printf(ms, code_mime) == -1)
  				goto done;
  		}
  	} else {
  		if (file_printf(ms, code) == -1)
  			goto done;
--- 255,281 ----
  	if (seen_cr && nbytes < HOWMANY)
  		n_cr++;
  
! 	if (mime) {
! 		if (mime & MAGIC_MIME_TYPE) {
! 			if (subtype_mime) {
! 				if (file_printf(ms, subtype_mime) == -1)
! 					goto done;
! 			} else {
! 				if (file_printf(ms, "text/plain") == -1)
! 					goto done;
! 			}
  		}
  
! 		if ((mime == 0 || mime == MAGIC_MIME) && code_mime) {
! 			if ((mime & MAGIC_MIME_TYPE) &&
! 			    file_printf(ms, " charset=") == -1)
  				goto done;
  			if (file_printf(ms, code_mime) == -1)
  				goto done;
  		}
+ 
+ 		if (mime == MAGIC_MIME_ENCODING)
+ 			file_printf(ms, "binary");
  	} else {
  		if (file_printf(ms, code) == -1)
  			goto done;
***************
*** 363,368 ****
--- 352,359 ----
  		free(nbuf);
  	if (ubuf)
  		free(ubuf);
+ 	if (utf8_buf)
+ 		free(utf8_buf);
  
  	return rv;
  }
***************
*** 521,535 ****
  	return 1;
  }
  
! private int
! looks_utf8(const unsigned char *buf, size_t nbytes, unichar *ubuf, size_t *ulen)
  {
  	size_t i;
  	int n;
  	unichar c;
! 	int gotone = 0;
  
! 	*ulen = 0;
  
  	for (i = 0; i < nbytes; i++) {
  		if ((buf[i] & 0x80) == 0) {	   /* 0xxxxxxx is plain ASCII */
--- 512,595 ----
  	return 1;
  }
  
! /*
!  * Encode Unicode string as UTF-8, returning pointer to character after
!  * end of string, or NULL if buf is too small or a character is invalid.
!  */
! private unsigned char *
! encode_utf8(unsigned char *buf, size_t len, unichar *ubuf, size_t ulen)
  {
  	size_t i;
+ 	unsigned char *end = buf + len;
+ 
+ 	for (i = 0; i < ulen; i++) {
+ 		if (ubuf[i] <= 0x7f) {
+ 			if (end - buf < 1)
+ 				return NULL;
+ 			*buf++ = (unsigned char)ubuf[i];
+ 		} else if (ubuf[i] <= 0x7ff) {
+ 			if (end - buf < 2)
+ 				return NULL;
+ 			*buf++ = (unsigned char)((ubuf[i] >> 6) + 0xc0);
+ 			*buf++ = (unsigned char)((ubuf[i] & 0x3f) + 0x80);
+ 		} else if (ubuf[i] <= 0xffff) {
+ 			if (end - buf < 3)
+ 				return NULL;
+ 			*buf++ = (unsigned char)((ubuf[i] >> 12) + 0xe0);
+ 			*buf++ = (unsigned char)(((ubuf[i] >> 6) & 0x3f) + 0x80);
+ 			*buf++ = (unsigned char)((ubuf[i] & 0x3f) + 0x80);
+ 		} else if (ubuf[i] <= 0x1fffff) {
+ 			if (end - buf < 4)
+ 				return NULL;
+ 			*buf++ = (unsigned char)((ubuf[i] >> 18) + 0xf0);
+ 			*buf++ = (unsigned char)(((ubuf[i] >> 12) & 0x3f) + 0x80);
+ 			*buf++ = (unsigned char)(((ubuf[i] >>  6) & 0x3f) + 0x80);
+ 			*buf++ = (unsigned char)((ubuf[i] & 0x3f) + 0x80);
+ 		} else if (ubuf[i] <= 0x3ffffff) {
+ 			if (end - buf < 5)
+ 				return NULL;
+ 			*buf++ = (unsigned char)((ubuf[i] >> 24) + 0xf8);
+ 			*buf++ = (unsigned char)(((ubuf[i] >> 18) & 0x3f) + 0x80);
+ 			*buf++ = (unsigned char)(((ubuf[i] >> 12) & 0x3f) + 0x80);
+ 			*buf++ = (unsigned char)(((ubuf[i] >>  6) & 0x3f) + 0x80);
+ 			*buf++ = (unsigned char)((ubuf[i] & 0x3f) + 0x80);
+ 		} else if (ubuf[i] <= 0x7fffffff) {
+ 			if (end - buf < 6)
+ 				return NULL;
+ 			*buf++ = (unsigned char)((ubuf[i] >> 30) + 0xfc);
+ 			*buf++ = (unsigned char)(((ubuf[i] >> 24) & 0x3f) + 0x80);
+ 			*buf++ = (unsigned char)(((ubuf[i] >> 18) & 0x3f) + 0x80);
+ 			*buf++ = (unsigned char)(((ubuf[i] >> 12) & 0x3f) + 0x80);
+ 			*buf++ = (unsigned char)(((ubuf[i] >>  6) & 0x3f) + 0x80);
+ 			*buf++ = (unsigned char)((ubuf[i] & 0x3f) + 0x80);
+ 		} else /* Invalid character */
+ 			return NULL;
+ 	}
+ 
+ 	return buf;
+ }
+ 
+ /*
+  * Decide whether some text looks like UTF-8. Returns:
+  *
+  *     -1: invalid UTF-8
+  *      0: uses odd control characters, so doesn't look like text
+  *      1: 7-bit text
+  *      2: definitely UTF-8 text (valid high-bit set bytes)
+  *
+  * If ubuf is non-NULL on entry, text is decoded into ubuf, *ulen;
+  * ubuf must be big enough!
+  */
+ protected int
+ file_looks_utf8(const unsigned char *buf, size_t nbytes, unichar *ubuf, size_t *ulen)
+ {
+ 	size_t i;
  	int n;
  	unichar c;
! 	int gotone = 0, ctrl = 0;
  
! 	if (ubuf)
! 		*ulen = 0;
  
  	for (i = 0; i < nbytes; i++) {
  		if ((buf[i] & 0x80) == 0) {	   /* 0xxxxxxx is plain ASCII */
***************
*** 539,549 ****
  			 */
  
  			if (text_chars[buf[i]] != T)
! 				return 0;
  
! 			ubuf[(*ulen)++] = buf[i];
  		} else if ((buf[i] & 0x40) == 0) { /* 10xxxxxx never 1st byte */
! 			return 0;
  		} else {			   /* 11xxxxxx begins UTF-8 */
  			int following;
  
--- 599,610 ----
  			 */
  
  			if (text_chars[buf[i]] != T)
! 				ctrl = 1;
  
! 			if (ubuf)
! 				ubuf[(*ulen)++] = buf[i];
  		} else if ((buf[i] & 0x40) == 0) { /* 10xxxxxx never 1st byte */
! 			return -1;
  		} else {			   /* 11xxxxxx begins UTF-8 */
  			int following;
  
***************
*** 563,569 ****
  				c = buf[i] & 0x01;
  				following = 5;
  			} else
! 				return 0;
  
  			for (n = 0; n < following; n++) {
  				i++;
--- 624,630 ----
  				c = buf[i] & 0x01;
  				following = 5;
  			} else
! 				return -1;
  
  			for (n = 0; n < following; n++) {
  				i++;
***************
*** 571,591 ****
  					goto done;
  
  				if ((buf[i] & 0x80) == 0 || (buf[i] & 0x40))
! 					return 0;
  
  				c = (c << 6) + (buf[i] & 0x3f);
  			}
  
! 			ubuf[(*ulen)++] = c;
  			gotone = 1;
  		}
  	}
  done:
! 	return gotone;   /* don't claim it's UTF-8 if it's all 7-bit */
  }
  
  private int
! looks_unicode(const unsigned char *buf, size_t nbytes, unichar *ubuf,
      size_t *ulen)
  {
  	int bigend;
--- 632,668 ----
  					goto done;
  
  				if ((buf[i] & 0x80) == 0 || (buf[i] & 0x40))
! 					return -1;
  
  				c = (c << 6) + (buf[i] & 0x3f);
  			}
  
! 			if (ubuf)
! 				ubuf[(*ulen)++] = c;
  			gotone = 1;
  		}
  	}
  done:
! 	return ctrl ? 0 : (gotone ? 2 : 1);
  }
  
+ /*
+  * Decide whether some text looks like UTF-8 with BOM. If there is no
+  * BOM (or nothing follows it), return -1; otherwise return the result of
+  * file_looks_utf8 on the rest of the text.
+  */
  private int
! looks_utf8_with_BOM(const unsigned char *buf, size_t nbytes, unichar *ubuf,
!     size_t *ulen)
! {
! 	if (nbytes > 3 && buf[0] == 0xef && buf[1] == 0xbb && buf[2] == 0xbf)
! 		return file_looks_utf8(buf + 3, nbytes - 3, ubuf, ulen);
! 	else
! 		return -1;
! }
! 
! private int
! looks_ucs16(const unsigned char *buf, size_t nbytes, unichar *ubuf,
      size_t *ulen)
  {
  	int bigend;