Annotation of src/usr.bin/mandoc/mandoc.c, Revision 1.42
1.42 ! schwarze 1: /* $Id: mandoc.c,v 1.41 2013/12/25 22:45:16 schwarze Exp $ */
1.1 schwarze 2: /*
1.24 schwarze 3: * Copyright (c) 2008, 2009, 2010, 2011 Kristaps Dzonsons <kristaps@bsd.lv>
1.37 schwarze 4: * Copyright (c) 2011, 2012, 2013 Ingo Schwarze <schwarze@openbsd.org>
1.1 schwarze 5: *
6: * Permission to use, copy, modify, and distribute this software for any
7: * purpose with or without fee is hereby granted, provided that the above
8: * copyright notice and this permission notice appear in all copies.
9: *
1.21 schwarze 10: * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHORS DISCLAIM ALL WARRANTIES
1.1 schwarze 11: * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
1.21 schwarze 12: * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR
1.1 schwarze 13: * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
14: * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
15: * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
16: * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
17: */
1.2 schwarze 18: #include <sys/types.h>
19:
1.1 schwarze 20: #include <assert.h>
21: #include <ctype.h>
1.26 schwarze 22: #include <errno.h>
23: #include <limits.h>
1.1 schwarze 24: #include <stdlib.h>
1.4 schwarze 25: #include <stdio.h>
26: #include <string.h>
1.5 schwarze 27: #include <time.h>
1.1 schwarze 28:
1.14 schwarze 29: #include "mandoc.h"
1.1 schwarze 30: #include "libmandoc.h"
31:
/*
 * NOTE(review): DATESIZE is not referenced by any function visible in
 * this file; it may be a leftover from an earlier date formatter.
 * Confirm before relying on or removing it.
 */
#define DATESIZE 32

/*
 * Parse the string (third argument) with the strptime(3) format
 * (second argument) and store the resulting time; returns 1 only
 * if the whole string matched, 0 otherwise.
 */
static	int	 a2time(time_t *, const char *, const char *);
/*
 * Format a time as "Month D, YYYY" in a newly allocated buffer,
 * or return NULL if formatting fails.
 */
static	char	*time2a(time_t);
1.5 schwarze 36:
1.26 schwarze 37:
38: enum mandoc_esc
1.38 schwarze 39: mandoc_escape(const char const **end, const char const **start, int *sz)
1.26 schwarze 40: {
1.34 schwarze 41: const char *local_start;
42: int local_sz;
43: char term;
1.26 schwarze 44: enum mandoc_esc gly;
45:
1.34 schwarze 46: /*
47: * When the caller doesn't provide return storage,
48: * use local storage.
49: */
50:
51: if (NULL == start)
52: start = &local_start;
53: if (NULL == sz)
54: sz = &local_sz;
55:
56: /*
57: * Beyond the backslash, at least one input character
58: * is part of the escape sequence. With one exception
59: * (see below), that character won't be returned.
60: */
61:
1.26 schwarze 62: gly = ESCAPE_ERROR;
1.34 schwarze 63: *start = ++*end;
64: *sz = 0;
1.33 schwarze 65: term = '\0';
1.26 schwarze 66:
1.34 schwarze 67: switch ((*start)[-1]) {
1.26 schwarze 68: /*
69: * First the glyphs. There are several different forms of
70: * these, but each eventually returns a substring of the glyph
71: * name.
72: */
73: case ('('):
74: gly = ESCAPE_SPECIAL;
1.34 schwarze 75: *sz = 2;
1.26 schwarze 76: break;
77: case ('['):
78: gly = ESCAPE_SPECIAL;
79: /*
80: * Unicode escapes are defined in groff as \[uXXXX] to
81: * \[u10FFFF], where the contained value must be a valid
82: * Unicode codepoint. Here, however, only check whether
83: * it's not a zero-width escape.
84: */
1.34 schwarze 85: if ('u' == (*start)[0] && ']' != (*start)[1])
1.26 schwarze 86: gly = ESCAPE_UNICODE;
87: term = ']';
88: break;
89: case ('C'):
1.34 schwarze 90: if ('\'' != **start)
1.26 schwarze 91: return(ESCAPE_ERROR);
1.34 schwarze 92: *start = ++*end;
1.39 schwarze 93: if ('u' == (*start)[0] && '\'' != (*start)[1])
94: gly = ESCAPE_UNICODE;
95: else
96: gly = ESCAPE_SPECIAL;
1.26 schwarze 97: term = '\'';
98: break;
1.41 schwarze 99:
100: /*
101: * Escapes taking no arguments at all.
102: */
103: case ('d'):
104: /* FALLTHROUGH */
105: case ('u'):
106: return(ESCAPE_IGNORE);
1.32 schwarze 107:
108: /*
109: * The \z escape is supposed to output the following
110: * character without advancing the cursor position.
111: * Since we are mostly dealing with terminal mode,
112: * let us just skip the next character.
113: */
114: case ('z'):
115: return(ESCAPE_SKIPCHAR);
1.1 schwarze 116:
1.26 schwarze 117: /*
118: * Handle all triggers matching \X(xy, \Xx, and \X[xxxx], where
119: * 'X' is the trigger. These have opaque sub-strings.
120: */
121: case ('F'):
1.16 schwarze 122: /* FALLTHROUGH */
1.26 schwarze 123: case ('g'):
1.16 schwarze 124: /* FALLTHROUGH */
1.26 schwarze 125: case ('k'):
1.1 schwarze 126: /* FALLTHROUGH */
1.26 schwarze 127: case ('M'):
1.14 schwarze 128: /* FALLTHROUGH */
1.26 schwarze 129: case ('m'):
1.1 schwarze 130: /* FALLTHROUGH */
1.26 schwarze 131: case ('n'):
1.1 schwarze 132: /* FALLTHROUGH */
1.26 schwarze 133: case ('V'):
1.1 schwarze 134: /* FALLTHROUGH */
1.26 schwarze 135: case ('Y'):
1.29 schwarze 136: gly = ESCAPE_IGNORE;
1.1 schwarze 137: /* FALLTHROUGH */
1.26 schwarze 138: case ('f'):
139: if (ESCAPE_ERROR == gly)
140: gly = ESCAPE_FONT;
1.34 schwarze 141: switch (**start) {
1.26 schwarze 142: case ('('):
1.34 schwarze 143: *start = ++*end;
144: *sz = 2;
1.26 schwarze 145: break;
146: case ('['):
1.34 schwarze 147: *start = ++*end;
1.26 schwarze 148: term = ']';
149: break;
150: default:
1.34 schwarze 151: *sz = 1;
1.26 schwarze 152: break;
153: }
154: break;
155:
156: /*
157: * These escapes are of the form \X'Y', where 'X' is the trigger
158: * and 'Y' is any string. These have opaque sub-strings.
159: */
160: case ('A'):
1.13 schwarze 161: /* FALLTHROUGH */
1.26 schwarze 162: case ('b'):
1.1 schwarze 163: /* FALLTHROUGH */
1.42 ! schwarze 164: case ('B'):
! 165: /* FALLTHROUGH */
1.16 schwarze 166: case ('D'):
1.1 schwarze 167: /* FALLTHROUGH */
1.26 schwarze 168: case ('o'):
1.1 schwarze 169: /* FALLTHROUGH */
1.26 schwarze 170: case ('R'):
1.1 schwarze 171: /* FALLTHROUGH */
1.42 ! schwarze 172: case ('w'):
! 173: /* FALLTHROUGH */
1.26 schwarze 174: case ('X'):
1.1 schwarze 175: /* FALLTHROUGH */
1.26 schwarze 176: case ('Z'):
1.34 schwarze 177: if ('\'' != **start)
1.26 schwarze 178: return(ESCAPE_ERROR);
179: gly = ESCAPE_IGNORE;
1.34 schwarze 180: *start = ++*end;
1.16 schwarze 181: term = '\'';
182: break;
1.26 schwarze 183:
184: /*
185: * These escapes are of the form \X'N', where 'X' is the trigger
186: * and 'N' resolves to a numerical expression.
187: */
1.17 schwarze 188: case ('h'):
189: /* FALLTHROUGH */
1.26 schwarze 190: case ('H'):
191: /* FALLTHROUGH */
192: case ('L'):
193: /* FALLTHROUGH */
194: case ('l'):
195: /* FALLTHROUGH */
196: case ('S'):
197: /* FALLTHROUGH */
1.17 schwarze 198: case ('v'):
199: /* FALLTHROUGH */
1.26 schwarze 200: case ('x'):
1.34 schwarze 201: if ('\'' != **start)
202: return(ESCAPE_ERROR);
1.42 ! schwarze 203: gly = ESCAPE_IGNORE;
1.34 schwarze 204: *start = ++*end;
1.33 schwarze 205: term = '\'';
1.26 schwarze 206: break;
1.29 schwarze 207:
208: /*
209: * Special handling for the numbered character escape.
210: * XXX Do any other escapes need similar handling?
211: */
212: case ('N'):
1.34 schwarze 213: if ('\0' == **start)
1.29 schwarze 214: return(ESCAPE_ERROR);
1.34 schwarze 215: (*end)++;
216: if (isdigit((unsigned char)**start)) {
217: *sz = 1;
1.29 schwarze 218: return(ESCAPE_IGNORE);
1.34 schwarze 219: }
220: (*start)++;
1.29 schwarze 221: while (isdigit((unsigned char)**end))
222: (*end)++;
1.34 schwarze 223: *sz = *end - *start;
1.29 schwarze 224: if ('\0' != **end)
225: (*end)++;
226: return(ESCAPE_NUMBERED);
1.26 schwarze 227:
228: /*
229: * Sizes get a special category of their own.
230: */
1.6 schwarze 231: case ('s'):
1.26 schwarze 232: gly = ESCAPE_IGNORE;
1.17 schwarze 233:
1.26 schwarze 234: /* See +/- counts as a sign. */
1.34 schwarze 235: if ('+' == **end || '-' == **end || ASCII_HYPH == **end)
236: (*end)++;
1.6 schwarze 237:
1.34 schwarze 238: switch (**end) {
1.16 schwarze 239: case ('('):
1.34 schwarze 240: *start = ++*end;
241: *sz = 2;
1.16 schwarze 242: break;
243: case ('['):
1.34 schwarze 244: *start = ++*end;
1.33 schwarze 245: term = ']';
1.16 schwarze 246: break;
247: case ('\''):
1.34 schwarze 248: *start = ++*end;
1.33 schwarze 249: term = '\'';
1.16 schwarze 250: break;
251: default:
1.34 schwarze 252: *sz = 1;
1.16 schwarze 253: break;
1.6 schwarze 254: }
255:
1.26 schwarze 256: break;
257:
258: /*
259: * Anything else is assumed to be a glyph.
1.34 schwarze 260: * In this case, pass back the character after the backslash.
1.26 schwarze 261: */
262: default:
263: gly = ESCAPE_SPECIAL;
1.34 schwarze 264: *start = --*end;
265: *sz = 1;
1.26 schwarze 266: break;
267: }
268:
269: assert(ESCAPE_ERROR != gly);
270:
271: /*
1.33 schwarze 272: * Read up to the terminating character,
273: * paying attention to nested escapes.
1.26 schwarze 274: */
275:
276: if ('\0' != term) {
1.33 schwarze 277: while (**end != term) {
278: switch (**end) {
279: case ('\0'):
280: return(ESCAPE_ERROR);
281: case ('\\'):
282: (*end)++;
283: if (ESCAPE_ERROR ==
284: mandoc_escape(end, NULL, NULL))
285: return(ESCAPE_ERROR);
286: break;
287: default:
288: (*end)++;
289: break;
290: }
291: }
1.34 schwarze 292: *sz = (*end)++ - *start;
1.33 schwarze 293: } else {
1.34 schwarze 294: assert(*sz > 0);
295: if ((size_t)*sz > strlen(*start))
1.26 schwarze 296: return(ESCAPE_ERROR);
1.34 schwarze 297: *end += *sz;
1.26 schwarze 298: }
1.19 schwarze 299:
1.26 schwarze 300: /* Run post-processors. */
1.19 schwarze 301:
1.26 schwarze 302: switch (gly) {
303: case (ESCAPE_FONT):
1.37 schwarze 304: if (2 == *sz) {
305: if ('C' == **start) {
306: /*
307: * Treat constant-width font modes
308: * just like regular font modes.
309: */
310: (*start)++;
311: (*sz)--;
312: } else {
313: if ('B' == (*start)[0] && 'I' == (*start)[1])
314: gly = ESCAPE_FONTBI;
315: break;
316: }
1.34 schwarze 317: } else if (1 != *sz)
1.26 schwarze 318: break;
1.30 schwarze 319:
1.34 schwarze 320: switch (**start) {
1.26 schwarze 321: case ('3'):
322: /* FALLTHROUGH */
323: case ('B'):
324: gly = ESCAPE_FONTBOLD;
325: break;
326: case ('2'):
327: /* FALLTHROUGH */
328: case ('I'):
329: gly = ESCAPE_FONTITALIC;
1.16 schwarze 330: break;
1.26 schwarze 331: case ('P'):
332: gly = ESCAPE_FONTPREV;
1.16 schwarze 333: break;
1.26 schwarze 334: case ('1'):
335: /* FALLTHROUGH */
336: case ('R'):
337: gly = ESCAPE_FONTROMAN;
1.1 schwarze 338: break;
339: }
1.16 schwarze 340: break;
1.26 schwarze 341: case (ESCAPE_SPECIAL):
1.34 schwarze 342: if (1 == *sz && 'c' == **start)
1.26 schwarze 343: gly = ESCAPE_NOSPACE;
1.16 schwarze 344: break;
1.1 schwarze 345: default:
1.16 schwarze 346: break;
1.1 schwarze 347: }
348:
1.26 schwarze 349: return(gly);
1.1 schwarze 350: }
351:
1.4 schwarze 352: void *
353: mandoc_calloc(size_t num, size_t size)
354: {
355: void *ptr;
356:
357: ptr = calloc(num, size);
358: if (NULL == ptr) {
359: perror(NULL);
1.20 schwarze 360: exit((int)MANDOCLEVEL_SYSERR);
1.4 schwarze 361: }
362:
363: return(ptr);
364: }
365:
366:
367: void *
368: mandoc_malloc(size_t size)
369: {
370: void *ptr;
371:
372: ptr = malloc(size);
373: if (NULL == ptr) {
374: perror(NULL);
1.20 schwarze 375: exit((int)MANDOCLEVEL_SYSERR);
1.4 schwarze 376: }
377:
378: return(ptr);
379: }
380:
381:
382: void *
383: mandoc_realloc(void *ptr, size_t size)
384: {
385:
386: ptr = realloc(ptr, size);
387: if (NULL == ptr) {
388: perror(NULL);
1.20 schwarze 389: exit((int)MANDOCLEVEL_SYSERR);
1.4 schwarze 390: }
391:
392: return(ptr);
393: }
394:
/*
 * Return a newly allocated, NUL-terminated copy of the first sz
 * bytes at ptr.  Exactly sz bytes are copied with memcpy(3), without
 * looking for an earlier NUL, so the caller must make sure that many
 * bytes are readable.  Allocation goes through mandoc_malloc().
 */
char *
mandoc_strndup(const char *ptr, size_t sz)
{
	char		*p;

	p = mandoc_malloc(sz + 1);
	memcpy(p, ptr, sz);
	/* Index with the size_t itself; narrowing to int could wrap. */
	p[sz] = '\0';
	return(p);
}
1.4 schwarze 405:
406: char *
407: mandoc_strdup(const char *ptr)
408: {
409: char *p;
410:
411: p = strdup(ptr);
412: if (NULL == p) {
413: perror(NULL);
1.20 schwarze 414: exit((int)MANDOCLEVEL_SYSERR);
1.4 schwarze 415: }
416:
417: return(p);
1.21 schwarze 418: }
419:
420: /*
421: * Parse a quoted or unquoted roff-style request or macro argument.
422: * Return a pointer to the parsed argument, which is either the original
423: * pointer or advanced by one byte in case the argument is quoted.
1.40 schwarze 424: * NUL-terminate the argument in place.
1.21 schwarze 425: * Collapse pairs of quotes inside quoted arguments.
426: * Advance the argument pointer to the next argument,
1.40 schwarze 427: * or to the NUL byte terminating the argument line.
1.21 schwarze 428: */
429: char *
1.25 schwarze 430: mandoc_getarg(struct mparse *parse, char **cpp, int ln, int *pos)
1.21 schwarze 431: {
432: char *start, *cp;
433: int quoted, pairs, white;
434:
435: /* Quoting can only start with a new word. */
436: start = *cpp;
1.26 schwarze 437: quoted = 0;
1.21 schwarze 438: if ('"' == *start) {
439: quoted = 1;
440: start++;
1.26 schwarze 441: }
1.21 schwarze 442:
443: pairs = 0;
444: white = 0;
445: for (cp = start; '\0' != *cp; cp++) {
1.36 schwarze 446:
447: /*
448: * Move the following text left
449: * after quoted quotes and after "\\" and "\t".
450: */
1.21 schwarze 451: if (pairs)
452: cp[-pairs] = cp[0];
1.36 schwarze 453:
1.21 schwarze 454: if ('\\' == cp[0]) {
1.36 schwarze 455: /*
456: * In copy mode, translate double to single
457: * backslashes and backslash-t to literal tabs.
458: */
459: switch (cp[1]) {
460: case ('t'):
461: cp[0] = '\t';
462: /* FALLTHROUGH */
463: case ('\\'):
1.21 schwarze 464: pairs++;
465: cp++;
1.36 schwarze 466: break;
467: case (' '):
1.21 schwarze 468: /* Skip escaped blanks. */
1.36 schwarze 469: if (0 == quoted)
470: cp++;
471: break;
472: default:
473: break;
474: }
1.21 schwarze 475: } else if (0 == quoted) {
476: if (' ' == cp[0]) {
477: /* Unescaped blanks end unquoted args. */
478: white = 1;
479: break;
480: }
481: } else if ('"' == cp[0]) {
482: if ('"' == cp[1]) {
483: /* Quoted quotes collapse. */
484: pairs++;
485: cp++;
486: } else {
487: /* Unquoted quotes end quoted args. */
488: quoted = 2;
489: break;
490: }
491: }
492: }
493:
494: /* Quoted argument without a closing quote. */
1.25 schwarze 495: if (1 == quoted)
496: mandoc_msg(MANDOCERR_BADQUOTE, parse, ln, *pos, NULL);
1.21 schwarze 497:
1.40 schwarze 498: /* NUL-terminate this argument and move to the next one. */
1.21 schwarze 499: if (pairs)
500: cp[-pairs] = '\0';
501: if ('\0' != *cp) {
502: *cp++ = '\0';
503: while (' ' == *cp)
504: cp++;
505: }
1.24 schwarze 506: *pos += (int)(cp - start) + (quoted ? 1 : 0);
1.21 schwarze 507: *cpp = cp;
508:
1.25 schwarze 509: if ('\0' == *cp && (white || ' ' == cp[-1]))
510: mandoc_msg(MANDOCERR_EOLNSPACE, parse, ln, *pos, NULL);
1.21 schwarze 511:
512: return(start);
1.4 schwarze 513: }
1.5 schwarze 514:
/*
 * Parse the string p according to the strptime(3) format fmt.
 * If, and only if, the format consumes the whole string, store
 * the mktime(3) result in *t and return 1.  Otherwise return 0
 * and leave *t untouched.
 */
static int
a2time(time_t *t, const char *fmt, const char *p)
{
	struct tm	 parsed;
	const char	*rest;

	/* Fields the format does not set must start out as zero. */
	memset(&parsed, 0, sizeof(parsed));

	rest = strptime(p, fmt, &parsed);
	if (rest == NULL || *rest != '\0')
		return 0;

	*t = mktime(&parsed);
	return 1;
}
531:
/*
 * Format the time t, broken down with localtime(3), as
 * "Month D, YYYY" in a buffer obtained from mandoc_malloc().
 * Returns NULL if the time cannot be broken down or if any part
 * of the date does not fit the space reserved for it; on success
 * the caller owns the returned buffer.
 */
static char *
time2a(time_t t)
{
	struct tm	*tm;
	char		*buf, *p;
	size_t		 ssz;
	int		 isz;

	/* localtime(3) returns NULL when t cannot be represented. */
	tm = localtime(&t);
	if (NULL == tm)
		return(NULL);

	/*
	 * Reserve space:
	 * up to 9 characters for the month (September) + blank
	 * up to 2 characters for the day + comma + blank
	 * 4 characters for the year and a terminating '\0'
	 */
	p = buf = mandoc_malloc(10 + 4 + 4 + 1);

	if (0 == (ssz = strftime(p, 10 + 1, "%B ", tm)))
		goto fail;
	p += (int)ssz;

	/*
	 * snprintf(3) returns the length it wanted to write, so
	 * treat truncation as failure, too, rather than moving p
	 * beyond the bytes actually written.
	 */
	isz = snprintf(p, 4 + 1, "%d, ", tm->tm_mday);
	if (isz < 0 || isz > 4)
		goto fail;
	p += isz;

	if (0 == strftime(p, 4 + 1, "%Y", tm))
		goto fail;
	return(buf);

fail:
	free(buf);
	return(NULL);
}
566:
567: char *
1.25 schwarze 568: mandoc_normdate(struct mparse *parse, char *in, int ln, int pos)
1.5 schwarze 569: {
1.22 schwarze 570: char *out;
1.5 schwarze 571: time_t t;
572:
1.22 schwarze 573: if (NULL == in || '\0' == *in ||
574: 0 == strcmp(in, "$" "Mdocdate$")) {
1.25 schwarze 575: mandoc_msg(MANDOCERR_NODATE, parse, ln, pos, NULL);
1.22 schwarze 576: time(&t);
577: }
1.31 schwarze 578: else if (a2time(&t, "%Y-%m-%d", in))
579: t = 0;
1.22 schwarze 580: else if (!a2time(&t, "$" "Mdocdate: %b %d %Y $", in) &&
1.31 schwarze 581: !a2time(&t, "%b %d, %Y", in)) {
1.25 schwarze 582: mandoc_msg(MANDOCERR_BADDATE, parse, ln, pos, NULL);
1.22 schwarze 583: t = 0;
1.5 schwarze 584: }
1.22 schwarze 585: out = t ? time2a(t) : NULL;
1.23 schwarze 586: return(out ? out : mandoc_strdup(in));
1.5 schwarze 587: }
588:
/*
 * Decide whether the sz bytes at p end with end-of-sentence
 * punctuation.  Returns 1 if they do, 0 otherwise.
 *
 * The buffer is scanned backwards.  Closing delimiters (double
 * quote, single quote, ']' and ')') seen before any of '.', '!'
 * or '?' mark the punctuation as enclosed.  Enclosed punctuation,
 * or a non-zero enclosed argument, only counts if the punctuation
 * is directly preceded by an alphanumeric character.
 */
int
mandoc_eos(const char *p, size_t sz, int enclosed)
{
	size_t		 i;
	int		 found;

	if (0 == sz)
		return(0);

	/*
	 * End-of-sentence recognition must include situations where
	 * some symbols, such as `)', allow prior EOS punctuation to
	 * propagate outward.
	 */

	/*
	 * Walk backwards by index: this neither narrows sz to int
	 * nor forms a pointer in front of the buffer.
	 */
	found = 0;
	for (i = sz; i > 0; i--) {
		switch (p[i - 1]) {
		case '\"':
			/* FALLTHROUGH */
		case '\'':
			/* FALLTHROUGH */
		case ']':
			/* FALLTHROUGH */
		case ')':
			if (0 == found)
				enclosed = 1;
			break;
		case '.':
			/* FALLTHROUGH */
		case '!':
			/* FALLTHROUGH */
		case '?':
			found = 1;
			break;
		default:
			/* First other byte: decide right here. */
			return(found && (!enclosed ||
			    isalnum((unsigned char)p[i - 1])));
		}
	}

	return(found && !enclosed);
}
1.26 schwarze 631:
/*
 * Convert the first sz bytes of p to an int with strtol(3), using
 * the given base.
 * Returns -1 if sz exceeds 31 bytes, if the input is empty, or if
 * it is not completely consumed by the conversion.  Otherwise the
 * value is returned, clamped to the range INT_MIN to INT_MAX;
 * note that negative input yields a negative result, so -1 is
 * also a possible conversion result.
 */
int
mandoc_strntoi(const char *p, size_t sz, int base)
{
	char	 numbuf[32];
	char	*endp;
	long	 val;

	/* Leave room for the terminating NUL byte. */
	if (sz >= sizeof(numbuf))
		return -1;

	memcpy(numbuf, p, sz);
	numbuf[sz] = '\0';

	errno = 0;
	val = strtol(numbuf, &endp, base);

	/* Reject empty input and trailing garbage. */
	if ('\0' == numbuf[0] || '\0' != *endp)
		return -1;

	if (val > INT_MAX)
		return INT_MAX;
	if (val < INT_MIN)
		return INT_MIN;
	return (int)val;
}