Annotation of src/usr.bin/file/text.c, Revision 1.2
1.2 ! nicm 1: /* $OpenBSD: text.c,v 1.1 2015/04/24 16:24:11 nicm Exp $ */
1.1 nicm 2:
3: /*
4: * Copyright (c) 2015 Nicholas Marriott <nicm@openbsd.org>
5: *
6: * Permission to use, copy, modify, and distribute this software for any
7: * purpose with or without fee is hereby granted, provided that the above
8: * copyright notice and this permission notice appear in all copies.
9: *
10: * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
11: * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
12: * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
13: * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
14: * WHATSOEVER RESULTING FROM LOSS OF MIND, USE, DATA OR PROFITS, WHETHER
15: * IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING
16: * OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
17: */
18:
19: #include <sys/types.h>
20:
21: #include <ctype.h>
22: #include <string.h>
23:
24: #include "file.h"
25: #include "magic.h"
26: #include "xmalloc.h"
27:
/*
 * Words that hint at what kind of text a file holds. Each row is
 * { word, description, MIME type }; text_try_words() compares every
 * whitespace-delimited token of the input against the first column and
 * reports the second or third column of the first row that matches.
 * The table ends with an all-NULL row.
 */
static const char *text_words[][3] = {
	{ "msgid",		"PO (gettext message catalogue)",	"text/x-po" },
	{ "dnl",		"M4 macro language pre-processor",	"text/x-m4" },
	{ "import",		"Java program",		"text/x-java" },
	{ "\"libhdr\"",		"BCPL program",		"text/x-bcpl" },
	{ "\"LIBHDR\"",		"BCPL program",		"text/x-bcpl" },
	{ "//",			"C++ program",		"text/x-c++" },
	{ "virtual",		"C++ program",		"text/x-c++" },
	{ "class",		"C++ program",		"text/x-c++" },
	{ "public:",		"C++ program",		"text/x-c++" },
	{ "private:",		"C++ program",		"text/x-c++" },
	{ "/*",			"C program",		"text/x-c" },
	{ "#include",		"C program",		"text/x-c" },
	{ "char",		"C program",		"text/x-c" },
	{ "The",		"English",		"text/plain" },
	{ "the",		"English",		"text/plain" },
	{ "double",		"C program",		"text/x-c" },
	{ "extern",		"C program",		"text/x-c" },
	{ "float",		"C program",		"text/x-c" },
	{ "struct",		"C program",		"text/x-c" },
	{ "union",		"C program",		"text/x-c" },
	{ "CFLAGS",		"make commands",	"text/x-makefile" },
	{ "LDFLAGS",		"make commands",	"text/x-makefile" },
	{ "all:",		"make commands",	"text/x-makefile" },
	{ ".PRECIOUS",		"make commands",	"text/x-makefile" },
	{ ".ascii",		"assembler program",	"text/x-asm" },
	{ ".asciiz",		"assembler program",	"text/x-asm" },
	{ ".byte",		"assembler program",	"text/x-asm" },
	{ ".even",		"assembler program",	"text/x-asm" },
	{ ".globl",		"assembler program",	"text/x-asm" },
	{ ".text",		"assembler program",	"text/x-asm" },
	{ "clr",		"assembler program",	"text/x-asm" },
	{ "(input",		"Pascal program",	"text/x-pascal" },
	{ "program",		"Pascal program",	"text/x-pascal" },
	{ "record",		"Pascal program",	"text/x-pascal" },
	{ "dcl",		"PL/1 program",		"text/x-pl1" },
	{ "Received:",		"mail",			"text/x-mail" },
	{ ">From",		"mail",			"text/x-mail" },
	{ "Return-Path:",	"mail",			"text/x-mail" },
	{ "Cc:",		"mail",			"text/x-mail" },
	{ "Newsgroups:",	"news",			"text/x-news" },
	{ "Path:",		"news",			"text/x-news" },
	{ "Organization:",	"news",			"text/x-news" },
	{ "href=",		"HTML document",	"text/html" },
	{ "HREF=",		"HTML document",	"text/html" },
	{ "<body",		"HTML document",	"text/html" },
	{ "<BODY",		"HTML document",	"text/html" },
	{ "<html",		"HTML document",	"text/html" },
	{ "<HTML",		"HTML document",	"text/html" },
	{ "<!--",		"HTML document",	"text/html" },
	{ NULL,			NULL,			NULL }
};
80:
/*
 * Report whether a byte may appear in ASCII text. Accepted are the
 * characters 32 to 126 and the control characters BEL, BS, HT, LF, FF, CR
 * and ESC. Everything else, NUL included, is rejected.
 *
 * Returns 1 if the byte is accepted and 0 otherwise.
 */
static int
text_is_ascii(u_char c)
{
	switch (c) {
	case '\007':	/* BEL */
	case '\010':	/* BS */
	case '\011':	/* HT */
	case '\012':	/* LF */
	case '\014':	/* FF */
	case '\015':	/* CR */
	case '\033':	/* ESC */
		return (1);
	default:
		/* NUL lands here as well and fails the range check. */
		return (c >= 32 && c <= 126);
	}
}
92:
93: static int
94: text_is_latin1(u_char c)
95: {
96: if (c >= 160)
97: return (1);
98: return (text_is_ascii(c));
99: }
100:
101: static int
102: text_is_extended(u_char c)
103: {
104: if (c >= 128)
105: return (1);
106: return (text_is_ascii(c));
107: }
108:
/*
 * Apply the predicate f to each of the size bytes starting at base, in
 * order, stopping at the first byte it rejects.
 *
 * Returns 1 if f accepted every byte (trivially so when size is zero, in
 * which case f is never called) and 0 if some byte was rejected.
 */
static int
text_try_test(const void *base, size_t size, int (*f)(u_char))
{
	const u_char	*p = base;
	size_t		 left;

	for (left = size; left != 0; left--, p++) {
		if (f(*p) == 0)
			return (0);
	}
	return (1);
}
121:
122: const char *
123: text_get_type(const void *base, size_t size)
124: {
125: if (text_try_test(base, size, text_is_ascii))
126: return ("ASCII");
127: if (text_try_test(base, size, text_is_latin1))
128: return ("ISO-8859");
129: if (text_try_test(base, size, text_is_extended))
130: return ("Non-ISO extended-ASCII");
131: return (NULL);
132: }
133:
134: const char *
135: text_try_words(const void *base, size_t size, int flags)
136: {
137: const char *cp, *end, *next, *word;
138: size_t wordlen;
139: u_int i;
140:
1.2 ! nicm 141: end = (char *)base + size;
1.1 nicm 142: for (cp = base; cp != end; /* nothing */) {
143: while (cp != end && isspace((u_char)*cp))
144: cp++;
145:
146: next = cp;
147: while (next != end && !isspace((u_char)*next))
148: next++;
149:
150: for (i = 0; /* nothing */; i++) {
151: word = text_words[i][0];
152: if (word == NULL)
153: break;
154: wordlen = strlen(word);
155:
156: if ((size_t)(next - cp) != wordlen)
157: continue;
158: if (memcmp(cp, word, wordlen) != 0)
159: continue;
160: if (flags & MAGIC_TEST_MIME)
161: return (text_words[i][2]);
162: return (text_words[i][1]);
163: }
164:
165: cp = next;
166: }
167: return (NULL);
168: }