Annotation of src/usr.bin/mandoc/mandoc.c, Revision 1.28
1.28 ! schwarze 1: /* $Id: mandoc.c,v 1.27 2011/09/18 10:25:28 schwarze Exp $ */
1.1 schwarze 2: /*
1.24 schwarze 3: * Copyright (c) 2008, 2009, 2010, 2011 Kristaps Dzonsons <kristaps@bsd.lv>
1.21 schwarze 4: * Copyright (c) 2011 Ingo Schwarze <schwarze@openbsd.org>
1.1 schwarze 5: *
6: * Permission to use, copy, modify, and distribute this software for any
7: * purpose with or without fee is hereby granted, provided that the above
8: * copyright notice and this permission notice appear in all copies.
9: *
1.21 schwarze 10: * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHORS DISCLAIM ALL WARRANTIES
1.1 schwarze 11: * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
1.21 schwarze 12: * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR
1.1 schwarze 13: * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
14: * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
15: * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
16: * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
17: */
1.2 schwarze 18: #include <sys/types.h>
19:
1.1 schwarze 20: #include <assert.h>
21: #include <ctype.h>
1.26 schwarze 22: #include <errno.h>
23: #include <limits.h>
1.1 schwarze 24: #include <stdlib.h>
1.4 schwarze 25: #include <stdio.h>
26: #include <string.h>
1.5 schwarze 27: #include <time.h>
1.1 schwarze 28:
1.14 schwarze 29: #include "mandoc.h"
1.1 schwarze 30: #include "libmandoc.h"
31:
1.22 schwarze 32: #define DATESIZE 32
33:
1.14 schwarze 34: static int a2time(time_t *, const char *, const char *);
1.22 schwarze 35: static char *time2a(time_t);
1.26 schwarze 36: static int numescape(const char *);
1.5 schwarze 37:
1.26 schwarze 38: /*
39: * Pass over recursive numerical expressions. This context of this
40: * function is important: it's only called within character-terminating
41: * escapes (e.g., \s[xxxyyy]), so all we need to do is handle initial
42: * recursion: we don't care about what's in these blocks.
43: * This returns the number of characters skipped or -1 if an error
44: * occurs (the caller should bail).
45: */
46: static int
47: numescape(const char *start)
1.1 schwarze 48: {
1.26 schwarze 49: int i;
50: size_t sz;
51: const char *cp;
52:
53: i = 0;
54:
55: /* The expression consists of a subexpression. */
56:
57: if ('\\' == start[i]) {
58: cp = &start[++i];
59: /*
60: * Read past the end of the subexpression.
61: * Bail immediately on errors.
62: */
63: if (ESCAPE_ERROR == mandoc_escape(&cp, NULL, NULL))
64: return(-1);
65: return(i + cp - &start[i]);
66: }
67:
68: if ('(' != start[i++])
69: return(0);
1.14 schwarze 70:
1.26 schwarze 71: /*
72: * A parenthesised subexpression. Read until the closing
73: * parenthesis, making sure to handle any nested subexpressions
74: * that might ruin our parse.
75: */
76:
77: while (')' != start[i]) {
78: sz = strcspn(&start[i], ")\\");
79: i += (int)sz;
80:
81: if ('\0' == start[i])
82: return(-1);
83: else if ('\\' != start[i])
84: continue;
85:
86: cp = &start[++i];
87: if (ESCAPE_ERROR == mandoc_escape(&cp, NULL, NULL))
88: return(-1);
89: i += cp - &start[i];
90: }
91:
92: /* Read past the terminating ')'. */
93: return(++i);
94: }
95:
96: enum mandoc_esc
97: mandoc_escape(const char **end, const char **start, int *sz)
98: {
99: char c, term, numeric;
100: int i, lim, ssz, rlim;
101: const char *cp, *rstart;
102: enum mandoc_esc gly;
103:
104: cp = *end;
105: rstart = cp;
106: if (start)
107: *start = rstart;
108: i = lim = 0;
109: gly = ESCAPE_ERROR;
110: term = numeric = '\0';
111:
112: switch ((c = cp[i++])) {
113: /*
114: * First the glyphs. There are several different forms of
115: * these, but each eventually returns a substring of the glyph
116: * name.
117: */
118: case ('('):
119: gly = ESCAPE_SPECIAL;
120: lim = 2;
121: break;
122: case ('['):
123: gly = ESCAPE_SPECIAL;
124: /*
125: * Unicode escapes are defined in groff as \[uXXXX] to
126: * \[u10FFFF], where the contained value must be a valid
127: * Unicode codepoint. Here, however, only check whether
128: * it's not a zero-width escape.
129: */
130: if ('u' == cp[i] && ']' != cp[i + 1])
131: gly = ESCAPE_UNICODE;
132: term = ']';
133: break;
134: case ('C'):
135: if ('\'' != cp[i])
136: return(ESCAPE_ERROR);
137: gly = ESCAPE_SPECIAL;
138: term = '\'';
139: break;
1.1 schwarze 140:
1.26 schwarze 141: /*
142: * Handle all triggers matching \X(xy, \Xx, and \X[xxxx], where
143: * 'X' is the trigger. These have opaque sub-strings.
144: */
145: case ('F'):
1.16 schwarze 146: /* FALLTHROUGH */
1.26 schwarze 147: case ('g'):
1.16 schwarze 148: /* FALLTHROUGH */
1.26 schwarze 149: case ('k'):
1.1 schwarze 150: /* FALLTHROUGH */
1.26 schwarze 151: case ('M'):
1.14 schwarze 152: /* FALLTHROUGH */
1.26 schwarze 153: case ('m'):
1.1 schwarze 154: /* FALLTHROUGH */
1.26 schwarze 155: case ('n'):
1.1 schwarze 156: /* FALLTHROUGH */
1.26 schwarze 157: case ('V'):
1.1 schwarze 158: /* FALLTHROUGH */
1.26 schwarze 159: case ('Y'):
160: if (ESCAPE_ERROR == gly)
161: gly = ESCAPE_IGNORE;
1.1 schwarze 162: /* FALLTHROUGH */
1.26 schwarze 163: case ('f'):
164: if (ESCAPE_ERROR == gly)
165: gly = ESCAPE_FONT;
166:
167: rstart= &cp[i];
168: if (start)
169: *start = rstart;
170:
171: switch (cp[i++]) {
172: case ('('):
173: lim = 2;
174: break;
175: case ('['):
176: term = ']';
177: break;
178: default:
179: lim = 1;
180: i--;
181: break;
182: }
183: break;
184:
185: /*
186: * These escapes are of the form \X'Y', where 'X' is the trigger
187: * and 'Y' is any string. These have opaque sub-strings.
188: */
189: case ('A'):
1.13 schwarze 190: /* FALLTHROUGH */
1.26 schwarze 191: case ('b'):
1.1 schwarze 192: /* FALLTHROUGH */
1.16 schwarze 193: case ('D'):
1.1 schwarze 194: /* FALLTHROUGH */
1.26 schwarze 195: case ('o'):
1.1 schwarze 196: /* FALLTHROUGH */
1.26 schwarze 197: case ('R'):
1.1 schwarze 198: /* FALLTHROUGH */
1.26 schwarze 199: case ('X'):
1.1 schwarze 200: /* FALLTHROUGH */
1.26 schwarze 201: case ('Z'):
202: if ('\'' != cp[i++])
203: return(ESCAPE_ERROR);
204: gly = ESCAPE_IGNORE;
1.16 schwarze 205: term = '\'';
206: break;
1.26 schwarze 207:
208: /*
209: * These escapes are of the form \X'N', where 'X' is the trigger
210: * and 'N' resolves to a numerical expression.
211: */
212: case ('B'):
213: /* FALLTHROUGH */
1.17 schwarze 214: case ('h'):
215: /* FALLTHROUGH */
1.26 schwarze 216: case ('H'):
217: /* FALLTHROUGH */
218: case ('L'):
219: /* FALLTHROUGH */
220: case ('l'):
221: /* FALLTHROUGH */
222: case ('N'):
223: if (ESCAPE_ERROR == gly)
224: gly = ESCAPE_NUMBERED;
225: /* FALLTHROUGH */
226: case ('S'):
227: /* FALLTHROUGH */
1.17 schwarze 228: case ('v'):
229: /* FALLTHROUGH */
1.26 schwarze 230: case ('w'):
231: /* FALLTHROUGH */
232: case ('x'):
233: if (ESCAPE_ERROR == gly)
234: gly = ESCAPE_IGNORE;
235: if ('\'' != cp[i++])
236: return(ESCAPE_ERROR);
237: term = numeric = '\'';
238: break;
239:
240: /*
241: * Sizes get a special category of their own.
242: */
1.6 schwarze 243: case ('s'):
1.26 schwarze 244: gly = ESCAPE_IGNORE;
1.17 schwarze 245:
1.26 schwarze 246: rstart = &cp[i];
247: if (start)
248: *start = rstart;
249:
250: /* See +/- counts as a sign. */
251: c = cp[i];
252: if ('+' == c || '-' == c || ASCII_HYPH == c)
253: ++i;
1.6 schwarze 254:
1.26 schwarze 255: switch (cp[i++]) {
1.16 schwarze 256: case ('('):
1.26 schwarze 257: lim = 2;
1.16 schwarze 258: break;
259: case ('['):
1.26 schwarze 260: term = numeric = ']';
1.16 schwarze 261: break;
262: case ('\''):
1.26 schwarze 263: term = numeric = '\'';
1.16 schwarze 264: break;
265: default:
1.26 schwarze 266: lim = 1;
267: i--;
1.16 schwarze 268: break;
1.6 schwarze 269: }
270:
1.26 schwarze 271: /* See +/- counts as a sign. */
272: c = cp[i];
273: if ('+' == c || '-' == c || ASCII_HYPH == c)
274: ++i;
275:
276: break;
277:
278: /*
279: * Anything else is assumed to be a glyph.
280: */
281: default:
282: gly = ESCAPE_SPECIAL;
283: lim = 1;
284: i--;
285: break;
286: }
287:
288: assert(ESCAPE_ERROR != gly);
289:
290: rstart = &cp[i];
291: if (start)
292: *start = rstart;
293:
294: /*
295: * If a terminating block has been specified, we need to
296: * handle the case of recursion, which could have their
297: * own terminating blocks that mess up our parse. This, by the
298: * way, means that the "start" and "size" values will be
299: * effectively meaningless.
300: */
301:
302: ssz = 0;
303: if (numeric && -1 == (ssz = numescape(&cp[i])))
304: return(ESCAPE_ERROR);
305:
306: i += ssz;
307: rlim = -1;
308:
309: /*
310: * We have a character terminator. Try to read up to that
311: * character. If we can't (i.e., we hit the nil), then return
312: * an error; if we can, calculate our length, read past the
313: * terminating character, and exit.
314: */
315:
316: if ('\0' != term) {
317: *end = strchr(&cp[i], term);
318: if ('\0' == *end)
319: return(ESCAPE_ERROR);
320:
321: rlim = *end - &cp[i];
322: if (sz)
323: *sz = rlim;
324: (*end)++;
325: goto out;
326: }
327:
328: assert(lim > 0);
329:
330: /*
331: * We have a numeric limit. If the string is shorter than that,
332: * stop and return an error. Else adjust our endpoint, length,
333: * and return the current glyph.
334: */
335:
336: if ((size_t)lim > strlen(&cp[i]))
337: return(ESCAPE_ERROR);
338:
339: rlim = lim;
340: if (sz)
341: *sz = rlim;
1.19 schwarze 342:
1.26 schwarze 343: *end = &cp[i] + lim;
344:
345: out:
346: assert(rlim >= 0 && rstart);
1.19 schwarze 347:
1.26 schwarze 348: /* Run post-processors. */
1.19 schwarze 349:
1.26 schwarze 350: switch (gly) {
351: case (ESCAPE_FONT):
352: if (1 != rlim)
353: break;
354: switch (*rstart) {
355: case ('3'):
356: /* FALLTHROUGH */
357: case ('B'):
358: gly = ESCAPE_FONTBOLD;
359: break;
360: case ('2'):
361: /* FALLTHROUGH */
362: case ('I'):
363: gly = ESCAPE_FONTITALIC;
1.16 schwarze 364: break;
1.26 schwarze 365: case ('P'):
366: gly = ESCAPE_FONTPREV;
1.16 schwarze 367: break;
1.26 schwarze 368: case ('1'):
369: /* FALLTHROUGH */
370: case ('R'):
371: gly = ESCAPE_FONTROMAN;
1.1 schwarze 372: break;
373: }
1.16 schwarze 374: break;
1.26 schwarze 375: case (ESCAPE_SPECIAL):
376: if (1 != rlim)
377: break;
378: if ('c' == *rstart)
379: gly = ESCAPE_NOSPACE;
1.16 schwarze 380: break;
1.1 schwarze 381: default:
1.16 schwarze 382: break;
1.1 schwarze 383: }
384:
1.26 schwarze 385: return(gly);
1.1 schwarze 386: }
387:
1.4 schwarze 388: void *
389: mandoc_calloc(size_t num, size_t size)
390: {
391: void *ptr;
392:
393: ptr = calloc(num, size);
394: if (NULL == ptr) {
395: perror(NULL);
1.20 schwarze 396: exit((int)MANDOCLEVEL_SYSERR);
1.4 schwarze 397: }
398:
399: return(ptr);
400: }
401:
402:
403: void *
404: mandoc_malloc(size_t size)
405: {
406: void *ptr;
407:
408: ptr = malloc(size);
409: if (NULL == ptr) {
410: perror(NULL);
1.20 schwarze 411: exit((int)MANDOCLEVEL_SYSERR);
1.4 schwarze 412: }
413:
414: return(ptr);
415: }
416:
417:
418: void *
419: mandoc_realloc(void *ptr, size_t size)
420: {
421:
422: ptr = realloc(ptr, size);
423: if (NULL == ptr) {
424: perror(NULL);
1.20 schwarze 425: exit((int)MANDOCLEVEL_SYSERR);
1.4 schwarze 426: }
427:
428: return(ptr);
429: }
430:
/*
 * Copy exactly "sz" bytes starting at "ptr" into a newly allocated,
 * null-terminated buffer obtained from mandoc_malloc().
 * Note that, unlike strndup(3), the copy does not stop at a null byte
 * inside the source: "sz" bytes are read unconditionally.
 */
char *
mandoc_strndup(const char *ptr, size_t sz)
{
	char	*p;

	p = mandoc_malloc(sz + 1);
	memcpy(p, ptr, sz);
	/* Index with the size_t itself; narrowing to int could go negative. */
	p[sz] = '\0';
	return(p);
}
1.4 schwarze 441:
442: char *
443: mandoc_strdup(const char *ptr)
444: {
445: char *p;
446:
447: p = strdup(ptr);
448: if (NULL == p) {
449: perror(NULL);
1.20 schwarze 450: exit((int)MANDOCLEVEL_SYSERR);
1.4 schwarze 451: }
452:
453: return(p);
1.21 schwarze 454: }
455:
456: /*
457: * Parse a quoted or unquoted roff-style request or macro argument.
458: * Return a pointer to the parsed argument, which is either the original
459: * pointer or advanced by one byte in case the argument is quoted.
460: * Null-terminate the argument in place.
461: * Collapse pairs of quotes inside quoted arguments.
462: * Advance the argument pointer to the next argument,
463: * or to the null byte terminating the argument line.
464: */
465: char *
1.25 schwarze 466: mandoc_getarg(struct mparse *parse, char **cpp, int ln, int *pos)
1.21 schwarze 467: {
468: char *start, *cp;
469: int quoted, pairs, white;
470:
471: /* Quoting can only start with a new word. */
472: start = *cpp;
1.26 schwarze 473: quoted = 0;
1.21 schwarze 474: if ('"' == *start) {
475: quoted = 1;
476: start++;
1.26 schwarze 477: }
1.21 schwarze 478:
479: pairs = 0;
480: white = 0;
481: for (cp = start; '\0' != *cp; cp++) {
482: /* Move left after quoted quotes and escaped backslashes. */
483: if (pairs)
484: cp[-pairs] = cp[0];
485: if ('\\' == cp[0]) {
486: if ('\\' == cp[1]) {
487: /* Poor man's copy mode. */
488: pairs++;
489: cp++;
490: } else if (0 == quoted && ' ' == cp[1])
491: /* Skip escaped blanks. */
492: cp++;
493: } else if (0 == quoted) {
494: if (' ' == cp[0]) {
495: /* Unescaped blanks end unquoted args. */
496: white = 1;
497: break;
498: }
499: } else if ('"' == cp[0]) {
500: if ('"' == cp[1]) {
501: /* Quoted quotes collapse. */
502: pairs++;
503: cp++;
504: } else {
505: /* Unquoted quotes end quoted args. */
506: quoted = 2;
507: break;
508: }
509: }
510: }
511:
512: /* Quoted argument without a closing quote. */
1.25 schwarze 513: if (1 == quoted)
514: mandoc_msg(MANDOCERR_BADQUOTE, parse, ln, *pos, NULL);
1.21 schwarze 515:
516: /* Null-terminate this argument and move to the next one. */
517: if (pairs)
518: cp[-pairs] = '\0';
519: if ('\0' != *cp) {
520: *cp++ = '\0';
521: while (' ' == *cp)
522: cp++;
523: }
1.24 schwarze 524: *pos += (int)(cp - start) + (quoted ? 1 : 0);
1.21 schwarze 525: *cpp = cp;
526:
1.25 schwarze 527: if ('\0' == *cp && (white || ' ' == cp[-1]))
528: mandoc_msg(MANDOCERR_EOLNSPACE, parse, ln, *pos, NULL);
1.21 schwarze 529:
530: return(start);
1.4 schwarze 531: }
1.5 schwarze 532:
/*
 * Convert the string "p" to a time_t according to the strptime(3)
 * format "fmt".  The whole string has to match the format.
 * Returns 1 and stores the result of mktime(3) in *t on a complete
 * match; returns 0 and leaves *t alone otherwise.
 */
static int
a2time(time_t *t, const char *fmt, const char *p)
{
	struct tm	 parsed;
	const char	*rest;

	/* Fields not mentioned by the format must start out as zero. */
	memset(&parsed, 0, sizeof(parsed));

	rest = strptime(p, fmt, &parsed);

	/* Reject both mismatches and trailing garbage. */
	if (rest == NULL || *rest != '\0')
		return(0);

	*t = mktime(&parsed);
	return(1);
}
549:
/*
 * Format the time "t", interpreted in local time, in the form
 * "Month day, year".
 * Returns a buffer obtained from mandoc_malloc() that the caller has
 * to free, or NULL if the time cannot be broken down or any part of
 * the date does not fit into the space reserved for it.
 */
static char *
time2a(time_t t)
{
	struct tm	*tm;
	char		*buf, *p;
	size_t		 ssz;
	int		 isz;

	/* localtime(3) returns NULL if the time is not representable. */
	tm = localtime(&t);
	if (NULL == tm)
		return(NULL);

	/*
	 * Reserve space:
	 * up to 9 characters for the month (September) + blank
	 * up to 2 characters for the day + comma + blank
	 * 4 characters for the year and a terminating '\0'
	 */
	p = buf = mandoc_malloc(10 + 4 + 4 + 1);

	if (0 == (ssz = strftime(p, 10 + 1, "%B ", tm)))
		goto fail;
	p += (int)ssz;

	/*
	 * snprintf(3) returns the length the output would have had;
	 * anything above 4 means it was truncated, and advancing by
	 * that amount would leave the written part of the buffer.
	 */
	isz = snprintf(p, 4 + 1, "%d, ", tm->tm_mday);
	if (isz < 0 || isz > 4)
		goto fail;
	p += isz;

	if (0 == strftime(p, 4 + 1, "%Y", tm))
		goto fail;
	return(buf);

fail:
	free(buf);
	return(NULL);
}
584:
585: char *
1.25 schwarze 586: mandoc_normdate(struct mparse *parse, char *in, int ln, int pos)
1.5 schwarze 587: {
1.22 schwarze 588: char *out;
1.5 schwarze 589: time_t t;
590:
1.22 schwarze 591: if (NULL == in || '\0' == *in ||
592: 0 == strcmp(in, "$" "Mdocdate$")) {
1.25 schwarze 593: mandoc_msg(MANDOCERR_NODATE, parse, ln, pos, NULL);
1.22 schwarze 594: time(&t);
595: }
596: else if (!a2time(&t, "$" "Mdocdate: %b %d %Y $", in) &&
597: !a2time(&t, "%b %d, %Y", in) &&
598: !a2time(&t, "%Y-%m-%d", in)) {
1.25 schwarze 599: mandoc_msg(MANDOCERR_BADDATE, parse, ln, pos, NULL);
1.22 schwarze 600: t = 0;
1.5 schwarze 601: }
1.22 schwarze 602: out = t ? time2a(t) : NULL;
1.23 schwarze 603: return(out ? out : mandoc_strdup(in));
1.5 schwarze 604: }
605:
/*
 * Decide whether the "sz" bytes starting at "p" end a sentence.
 *
 * The text is scanned backwards from its last byte.  Closing
 * delimiters (double quote, single quote, ']' and ')') seen before any
 * end-of-sentence punctuation mark the punctuation as enclosed; '.',
 * '!' and '?' are end-of-sentence punctuation.  The scan stops at the
 * first byte that is neither.
 *
 * "enclosed" is the initial enclosure state supplied by the caller.
 * Returns 1 if punctuation was found and it is either not enclosed or
 * directly preceded by an alphanumeric character; 0 otherwise,
 * including for empty input.
 */
int
mandoc_eos(const char *p, size_t sz, int enclosed)
{
	size_t		 i;
	int		 found;

	if (0 == sz)
		return(0);

	/*
	 * End-of-sentence recognition must include situations where
	 * some symbols, such as `)', allow prior EOS punctuation to
	 * propagate outward.
	 */

	/*
	 * Walk backwards by index rather than by pointer, such that
	 * no pointer before the start of the buffer is ever formed.
	 */
	found = 0;
	for (i = sz; i > 0; i--) {
		switch (p[i - 1]) {
		case ('\"'):
			/* FALLTHROUGH */
		case ('\''):
			/* FALLTHROUGH */
		case (']'):
			/* FALLTHROUGH */
		case (')'):
			/* Delimiters left of the punctuation don't count. */
			if (0 == found)
				enclosed = 1;
			break;
		case ('.'):
			/* FALLTHROUGH */
		case ('!'):
			/* FALLTHROUGH */
		case ('?'):
			found = 1;
			break;
		default:
			return(found && (!enclosed ||
			    isalnum((unsigned char)p[i - 1])));
		}
	}

	/* Nothing but punctuation and closing delimiters. */
	return(found && !enclosed);
}
648:
/*
 * Find out whether a line is a macro line or not.
 * "cp" is the line and *ppos the offset at which to look.
 * A macro line starts with a dot, a single quote, or a backslash
 * immediately followed by a dot.  If one is found, move *ppos past
 * it and past any blanks and tabs following it, and return one;
 * otherwise return zero and leave *ppos unchanged.
 */
int
mandoc_getcontrol(const char *cp, int *ppos)
{
	const char	*scan;

	scan = cp + *ppos;

	switch (*scan) {
	case ('\\'):
		/* Only the two-byte sequence "\." qualifies. */
		if ('.' != scan[1])
			return(0);
		scan += 2;
		break;
	case ('.'):
		/* FALLTHROUGH */
	case ('\''):
		scan++;
		break;
	default:
		return(0);
	}

	/* White space may separate the control character and the name. */
	while (' ' == *scan || '\t' == *scan)
		scan++;

	*ppos = (int)(scan - cp);
	return(1);
}
1.26 schwarze 674:
/*
 * Convert the first "sz" bytes of "p" to an int using strtol(3) with
 * the given base.
 * Returns -1 if "sz" exceeds 31, if the string is empty, or if it is
 * not a number in its entirety.  Values outside the range of an int
 * are clamped to INT_MAX or INT_MIN, respectively.  Note that -1 is
 * also what the string "-1" converts to.
 */
int
mandoc_strntoi(const char *p, size_t sz, int base)
{
	char	 digits[32];
	char	*stop;
	long	 val;

	/* Leave room for the terminating null byte. */
	if (sz >= sizeof(digits))
		return(-1);

	/* strtol(3) needs a terminated string: work on a private copy. */
	memcpy(digits, p, sz);
	digits[sz] = '\0';

	errno = 0;
	val = strtol(digits, &stop, base);

	/* Nothing to convert, or trailing garbage. */
	if ('\0' == digits[0] || '\0' != *stop)
		return(-1);

	if (val > INT_MAX)
		return(INT_MAX);
	if (val < INT_MIN)
		return(INT_MIN);

	return((int)val);
}