Annotation of src/usr.bin/mandoc/mandoc.c, Revision 1.82
1.82 ! schwarze 1: /* $OpenBSD: mandoc.c,v 1.81 2018/12/18 21:58:41 schwarze Exp $ */
1.1 schwarze 2: /*
1.58 schwarze 3: * Copyright (c) 2008-2011, 2014 Kristaps Dzonsons <kristaps@bsd.lv>
1.72 schwarze 4: * Copyright (c) 2011-2015, 2017, 2018 Ingo Schwarze <schwarze@openbsd.org>
1.1 schwarze 5: *
6: * Permission to use, copy, modify, and distribute this software for any
7: * purpose with or without fee is hereby granted, provided that the above
8: * copyright notice and this permission notice appear in all copies.
9: *
1.21 schwarze 10: * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHORS DISCLAIM ALL WARRANTIES
1.1 schwarze 11: * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
1.21 schwarze 12: * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR
1.1 schwarze 13: * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
14: * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
15: * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
16: * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
17: */
1.2 schwarze 18: #include <sys/types.h>
19:
1.1 schwarze 20: #include <assert.h>
21: #include <ctype.h>
1.26 schwarze 22: #include <errno.h>
23: #include <limits.h>
1.1 schwarze 24: #include <stdlib.h>
1.4 schwarze 25: #include <stdio.h>
26: #include <string.h>
1.5 schwarze 27: #include <time.h>
1.1 schwarze 28:
1.69 schwarze 29: #include "mandoc_aux.h"
1.14 schwarze 30: #include "mandoc.h"
1.69 schwarze 31: #include "roff.h"
1.1 schwarze 32: #include "libmandoc.h"
1.82 ! schwarze 33: #include "roff_int.h"
1.22 schwarze 34:
/* Forward declarations of the file-local date conversion helpers defined below. */
1.14 schwarze 35: static int a2time(time_t *, const char *, const char *);
1.22 schwarze 36: static char *time2a(time_t);
1.5 schwarze 37:
1.26 schwarze 38:
39: enum mandoc_esc
1.80 schwarze 40: mandoc_font(const char *cp, int sz)
41: {
42: switch (sz) {
43: case 0:
44: return ESCAPE_FONTPREV;
45: case 1:
46: switch (cp[0]) {
47: case 'B':
48: case '3':
49: return ESCAPE_FONTBOLD;
50: case 'I':
51: case '2':
52: return ESCAPE_FONTITALIC;
53: case 'P':
54: return ESCAPE_FONTPREV;
55: case 'R':
56: case '1':
57: return ESCAPE_FONTROMAN;
58: case '4':
59: return ESCAPE_FONTBI;
60: default:
61: return ESCAPE_ERROR;
62: }
63: case 2:
64: switch (cp[0]) {
65: case 'B':
66: switch (cp[1]) {
67: case 'I':
68: return ESCAPE_FONTBI;
69: default:
70: return ESCAPE_ERROR;
71: }
72: case 'C':
73: switch (cp[1]) {
74: case 'B':
75: return ESCAPE_FONTBOLD;
76: case 'I':
77: return ESCAPE_FONTITALIC;
78: case 'R':
79: case 'W':
80: return ESCAPE_FONTCW;
81: default:
82: return ESCAPE_ERROR;
83: }
84: default:
85: return ESCAPE_ERROR;
86: }
87: default:
88: return ESCAPE_ERROR;
89: }
90: }
91:
92: enum mandoc_esc
1.44 schwarze 93: mandoc_escape(const char **end, const char **start, int *sz)
1.26 schwarze 94: {
1.34 schwarze 95: const char *local_start;
1.73 schwarze 96: int local_sz, c, i;
1.34 schwarze 97: char term;
1.48 schwarze 98: enum mandoc_esc gly;
1.26 schwarze 99:
1.34 schwarze 100: /*
101: * When the caller doesn't provide return storage,
102: * use local storage.
103: */
104:
105: if (NULL == start)
106: start = &local_start;
107: if (NULL == sz)
108: sz = &local_sz;
109:
110: /*
1.79 schwarze 111: * Treat "\E" just like "\";
112: * it only makes a difference in copy mode.
113: */
114:
115: if (**end == 'E')
116: ++*end;
117:
118: /*
1.34 schwarze 119: * Beyond the backslash, at least one input character
120: * is part of the escape sequence. With one exception
121: * (see below), that character won't be returned.
122: */
123:
1.26 schwarze 124: gly = ESCAPE_ERROR;
1.34 schwarze 125: *start = ++*end;
126: *sz = 0;
1.33 schwarze 127: term = '\0';
1.26 schwarze 128:
1.34 schwarze 129: switch ((*start)[-1]) {
1.26 schwarze 130: /*
131: * First the glyphs. There are several different forms of
132: * these, but each eventually returns a substring of the glyph
133: * name.
134: */
1.48 schwarze 135: case '(':
1.26 schwarze 136: gly = ESCAPE_SPECIAL;
1.34 schwarze 137: *sz = 2;
1.26 schwarze 138: break;
1.48 schwarze 139: case '[':
1.79 schwarze 140: if (**start == ' ') {
141: ++*end;
142: return ESCAPE_ERROR;
143: }
1.26 schwarze 144: gly = ESCAPE_SPECIAL;
145: term = ']';
146: break;
1.48 schwarze 147: case 'C':
1.34 schwarze 148: if ('\'' != **start)
1.62 schwarze 149: return ESCAPE_ERROR;
1.34 schwarze 150: *start = ++*end;
1.54 schwarze 151: gly = ESCAPE_SPECIAL;
1.26 schwarze 152: term = '\'';
153: break;
1.41 schwarze 154:
155: /*
156: * Escapes taking no arguments at all.
157: */
1.79 schwarze 158: case '!':
159: case '?':
160: return ESCAPE_UNSUPP;
161: case '%':
162: case '&':
163: case ')':
164: case ',':
165: case '/':
166: case '^':
167: case 'a':
1.48 schwarze 168: case 'd':
1.79 schwarze 169: case 'r':
170: case 't':
1.48 schwarze 171: case 'u':
1.79 schwarze 172: case '{':
173: case '|':
174: case '}':
1.62 schwarze 175: return ESCAPE_IGNORE;
1.79 schwarze 176: case 'c':
177: return ESCAPE_NOSPACE;
1.70 schwarze 178: case 'p':
179: return ESCAPE_BREAK;
1.32 schwarze 180:
181: /*
182: * The \z escape is supposed to output the following
1.48 schwarze 183: * character without advancing the cursor position.
1.32 schwarze 184: * Since we are mostly dealing with terminal mode,
185: * let us just skip the next character.
186: */
1.48 schwarze 187: case 'z':
1.62 schwarze 188: return ESCAPE_SKIPCHAR;
1.1 schwarze 189:
1.26 schwarze 190: /*
191: * Handle all triggers matching \X(xy, \Xx, and \X[xxxx], where
192: * 'X' is the trigger. These have opaque sub-strings.
193: */
1.48 schwarze 194: case 'F':
1.79 schwarze 195: case 'f':
1.48 schwarze 196: case 'g':
197: case 'k':
198: case 'M':
199: case 'm':
200: case 'n':
1.79 schwarze 201: case 'O':
1.48 schwarze 202: case 'V':
203: case 'Y':
1.79 schwarze 204: gly = (*start)[-1] == 'f' ? ESCAPE_FONT : ESCAPE_IGNORE;
1.34 schwarze 205: switch (**start) {
1.48 schwarze 206: case '(':
1.79 schwarze 207: if ((*start)[-1] == 'O')
208: gly = ESCAPE_ERROR;
1.34 schwarze 209: *start = ++*end;
210: *sz = 2;
1.26 schwarze 211: break;
1.48 schwarze 212: case '[':
1.79 schwarze 213: if ((*start)[-1] == 'O')
214: gly = (*start)[1] == '5' ?
215: ESCAPE_UNSUPP : ESCAPE_ERROR;
1.34 schwarze 216: *start = ++*end;
1.26 schwarze 217: term = ']';
218: break;
219: default:
1.79 schwarze 220: if ((*start)[-1] == 'O') {
221: switch (**start) {
222: case '0':
223: gly = ESCAPE_UNSUPP;
224: break;
225: case '1':
226: case '2':
227: case '3':
228: case '4':
229: break;
230: default:
231: gly = ESCAPE_ERROR;
232: break;
233: }
234: }
1.34 schwarze 235: *sz = 1;
1.26 schwarze 236: break;
237: }
1.74 schwarze 238: break;
239: case '*':
240: if (strncmp(*start, "(.T", 3) != 0)
241: abort();
242: gly = ESCAPE_DEVICE;
243: *start = ++*end;
244: *sz = 2;
1.26 schwarze 245: break;
246:
247: /*
248: * These escapes are of the form \X'Y', where 'X' is the trigger
249: * and 'Y' is any string. These have opaque sub-strings.
1.47 schwarze 250: * The \B and \w escapes are handled in roff.c, roff_res().
1.26 schwarze 251: */
1.48 schwarze 252: case 'A':
253: case 'b':
254: case 'D':
255: case 'R':
256: case 'X':
257: case 'Z':
1.59 schwarze 258: gly = ESCAPE_IGNORE;
259: /* FALLTHROUGH */
260: case 'o':
261: if (**start == '\0')
1.62 schwarze 262: return ESCAPE_ERROR;
1.59 schwarze 263: if (gly == ESCAPE_ERROR)
264: gly = ESCAPE_OVERSTRIKE;
1.46 schwarze 265: term = **start;
1.34 schwarze 266: *start = ++*end;
1.16 schwarze 267: break;
1.26 schwarze 268:
269: /*
270: * These escapes are of the form \X'N', where 'X' is the trigger
271: * and 'N' resolves to a numerical expression.
272: */
1.48 schwarze 273: case 'h':
274: case 'H':
275: case 'L':
276: case 'l':
277: case 'S':
278: case 'v':
279: case 'x':
1.51 schwarze 280: if (strchr(" %&()*+-./0123456789:<=>", **start)) {
1.53 schwarze 281: if ('\0' != **start)
282: ++*end;
1.62 schwarze 283: return ESCAPE_ERROR;
1.51 schwarze 284: }
1.68 schwarze 285: switch ((*start)[-1]) {
286: case 'h':
287: gly = ESCAPE_HORIZ;
288: break;
289: case 'l':
290: gly = ESCAPE_HLINE;
291: break;
292: default:
293: gly = ESCAPE_IGNORE;
294: break;
295: }
1.46 schwarze 296: term = **start;
1.34 schwarze 297: *start = ++*end;
1.26 schwarze 298: break;
1.29 schwarze 299:
300: /*
301: * Special handling for the numbered character escape.
302: * XXX Do any other escapes need similar handling?
303: */
1.48 schwarze 304: case 'N':
1.34 schwarze 305: if ('\0' == **start)
1.62 schwarze 306: return ESCAPE_ERROR;
1.34 schwarze 307: (*end)++;
308: if (isdigit((unsigned char)**start)) {
309: *sz = 1;
1.62 schwarze 310: return ESCAPE_IGNORE;
1.34 schwarze 311: }
312: (*start)++;
1.29 schwarze 313: while (isdigit((unsigned char)**end))
314: (*end)++;
1.34 schwarze 315: *sz = *end - *start;
1.29 schwarze 316: if ('\0' != **end)
317: (*end)++;
1.62 schwarze 318: return ESCAPE_NUMBERED;
1.26 schwarze 319:
1.48 schwarze 320: /*
1.26 schwarze 321: * Sizes get a special category of their own.
322: */
1.48 schwarze 323: case 's':
1.26 schwarze 324: gly = ESCAPE_IGNORE;
1.17 schwarze 325:
1.26 schwarze 326: /* See +/- counts as a sign. */
1.34 schwarze 327: if ('+' == **end || '-' == **end || ASCII_HYPH == **end)
1.58 schwarze 328: *start = ++*end;
1.6 schwarze 329:
1.34 schwarze 330: switch (**end) {
1.48 schwarze 331: case '(':
1.34 schwarze 332: *start = ++*end;
333: *sz = 2;
1.16 schwarze 334: break;
1.48 schwarze 335: case '[':
1.34 schwarze 336: *start = ++*end;
1.33 schwarze 337: term = ']';
1.16 schwarze 338: break;
1.48 schwarze 339: case '\'':
1.34 schwarze 340: *start = ++*end;
1.33 schwarze 341: term = '\'';
1.60 schwarze 342: break;
343: case '3':
344: case '2':
345: case '1':
346: *sz = (*end)[-1] == 's' &&
347: isdigit((unsigned char)(*end)[1]) ? 2 : 1;
1.16 schwarze 348: break;
349: default:
1.34 schwarze 350: *sz = 1;
1.16 schwarze 351: break;
1.6 schwarze 352: }
353:
1.26 schwarze 354: break;
355:
356: /*
1.79 schwarze 357: * Several special characters can be encoded as
358: * one-byte escape sequences without using \[].
1.26 schwarze 359: */
1.79 schwarze 360: case ' ':
361: case '\'':
362: case '-':
363: case '.':
364: case '0':
365: case ':':
366: case '_':
367: case '`':
368: case 'e':
369: case '~':
370: gly = ESCAPE_SPECIAL;
371: /* FALLTHROUGH */
1.26 schwarze 372: default:
1.79 schwarze 373: if (gly == ESCAPE_ERROR)
374: gly = ESCAPE_UNDEF;
1.34 schwarze 375: *start = --*end;
376: *sz = 1;
1.26 schwarze 377: break;
378: }
379:
380: /*
1.33 schwarze 381: * Read up to the terminating character,
382: * paying attention to nested escapes.
1.26 schwarze 383: */
384:
385: if ('\0' != term) {
1.33 schwarze 386: while (**end != term) {
387: switch (**end) {
1.48 schwarze 388: case '\0':
1.62 schwarze 389: return ESCAPE_ERROR;
1.48 schwarze 390: case '\\':
1.33 schwarze 391: (*end)++;
392: if (ESCAPE_ERROR ==
393: mandoc_escape(end, NULL, NULL))
1.62 schwarze 394: return ESCAPE_ERROR;
1.33 schwarze 395: break;
396: default:
397: (*end)++;
398: break;
399: }
400: }
1.34 schwarze 401: *sz = (*end)++ - *start;
1.79 schwarze 402:
403: /*
404: * The file chars.c only provides one common list
405: * of character names, but \[-] == \- is the only
406: * one of the characters with one-byte names that
407: * allows enclosing the name in brackets.
408: */
409: if (gly == ESCAPE_SPECIAL && *sz == 1 && **start != '-')
410: return ESCAPE_ERROR;
1.33 schwarze 411: } else {
1.34 schwarze 412: assert(*sz > 0);
413: if ((size_t)*sz > strlen(*start))
1.62 schwarze 414: return ESCAPE_ERROR;
1.34 schwarze 415: *end += *sz;
1.26 schwarze 416: }
1.19 schwarze 417:
1.26 schwarze 418: /* Run post-processors. */
1.19 schwarze 419:
1.26 schwarze 420: switch (gly) {
1.48 schwarze 421: case ESCAPE_FONT:
1.80 schwarze 422: gly = mandoc_font(*start, *sz);
1.16 schwarze 423: break;
1.48 schwarze 424: case ESCAPE_SPECIAL:
1.73 schwarze 425: if (**start == 'c') {
426: if (*sz < 6 || *sz > 7 ||
427: strncmp(*start, "char", 4) != 0 ||
428: (int)strspn(*start + 4, "0123456789") + 4 < *sz)
429: break;
430: c = 0;
431: for (i = 4; i < *sz; i++)
432: c = 10 * c + ((*start)[i] - '0');
433: if (c < 0x21 || (c > 0x7e && c < 0xa0) || c > 0xff)
434: break;
435: *start += 4;
436: *sz -= 4;
437: gly = ESCAPE_NUMBERED;
438: break;
439: }
440:
1.54 schwarze 441: /*
1.55 schwarze 442: * Unicode escapes are defined in groff as \[u0000]
1.54 schwarze 443: * to \[u10FFFF], where the contained value must be
444: * a valid Unicode codepoint. Here, however, only
1.55 schwarze 445: * check the length and range.
1.54 schwarze 446: */
1.55 schwarze 447: if (**start != 'u' || *sz < 5 || *sz > 7)
448: break;
449: if (*sz == 7 && ((*start)[1] != '1' || (*start)[2] != '0'))
450: break;
451: if (*sz == 6 && (*start)[1] == '0')
1.64 schwarze 452: break;
453: if (*sz == 5 && (*start)[1] == 'D' &&
454: strchr("89ABCDEF", (*start)[2]) != NULL)
1.55 schwarze 455: break;
456: if ((int)strspn(*start + 1, "0123456789ABCDEFabcdef")
1.54 schwarze 457: + 1 == *sz)
458: gly = ESCAPE_UNICODE;
1.16 schwarze 459: break;
1.1 schwarze 460: default:
1.16 schwarze 461: break;
1.1 schwarze 462: }
463:
1.62 schwarze 464: return gly;
1.4 schwarze 465: }
1.5 schwarze 466:
/*
 * Parse the date string p according to the strptime(3) format fmt.
 * The whole string must be consumed by the format.
 * On success, store the mktime(3) conversion of the parsed fields
 * in *t and return 1; otherwise return 0 and leave *t untouched.
 */
static int
a2time(time_t *t, const char *fmt, const char *p)
{
	struct tm	 parsed;
	const char	*rest;

	/* Fields the format does not set stay zero. */
	memset(&parsed, 0, sizeof(parsed));

	rest = strptime(p, fmt, &parsed);

	/* Reject parse failures and trailing garbage alike. */
	if (rest == NULL || *rest != '\0')
		return 0;

	*t = mktime(&parsed);
	return 1;
}
483:
/*
 * Format the time t as "Month D, YYYY" in local time.
 * Returns a buffer obtained from mandoc_malloc() that is released
 * with free() on the failure path here, or NULL if localtime(3)
 * or any of the formatting steps fails.
 */
static char *
time2a(time_t t)
{
	struct tm	*when;
	char		*out, *cur;
	size_t		 nmonth;
	int		 nday;

	if ((when = localtime(&t)) == NULL)
		return NULL;

	/*
	 * Buffer layout:
	 * up to 9 characters for the month (September) + blank,
	 * up to 2 characters for the day + comma + blank,
	 * 4 characters for the year and a terminating '\0'.
	 */

	out = mandoc_malloc(10 + 4 + 4 + 1);
	cur = out;

	nmonth = strftime(cur, 10 + 1, "%B ", when);
	if (nmonth != 0) {
		cur += (int)nmonth;

		/*
		 * The day is printed with plain "%d", neither "%2d"
		 * nor "%02d", which is why the date cannot be formatted
		 * in one go with "%B %e, %Y" or "%B %d, %Y".
		 * Formatting piecewise with tight per-field limits is
		 * also less prone to buffer overflows, in case anybody
		 * should ever introduce the bug of looking at LC_TIME.
		 */

		nday = snprintf(cur, 4 + 1, "%d, ", when->tm_mday);
		if (nday != -1) {
			cur += nday;
			if (strftime(cur, 4 + 1, "%Y", when) != 0)
				return out;
		}
	}

	/* One of the formatting steps failed. */
	free(out);
	return NULL;
}
530:
531: char *
1.69 schwarze 532: mandoc_normdate(struct roff_man *man, char *in, int ln, int pos)
1.5 schwarze 533: {
1.71 schwarze 534: char *cp;
1.5 schwarze 535: time_t t;
536:
1.66 schwarze 537: /* No date specified: use today's date. */
538:
539: if (in == NULL || *in == '\0' || strcmp(in, "$" "Mdocdate$") == 0) {
1.77 schwarze 540: mandoc_msg(MANDOCERR_DATE_MISSING, ln, pos, NULL);
1.66 schwarze 541: return time2a(time(NULL));
1.22 schwarze 542: }
1.66 schwarze 543:
544: /* Valid mdoc(7) date format. */
545:
546: if (a2time(&t, "$" "Mdocdate: %b %d %Y $", in) ||
1.71 schwarze 547: a2time(&t, "%b %d, %Y", in)) {
548: cp = time2a(t);
549: if (t > time(NULL) + 86400)
1.77 schwarze 550: mandoc_msg(MANDOCERR_DATE_FUTURE, ln, pos, "%s", cp);
1.72 schwarze 551: else if (*in != '$' && strcmp(in, cp) != 0)
1.77 schwarze 552: mandoc_msg(MANDOCERR_DATE_NORM, ln, pos, "%s", cp);
1.71 schwarze 553: return cp;
554: }
1.66 schwarze 555:
1.69 schwarze 556: /* In man(7), do not warn about the legacy format. */
1.66 schwarze 557:
1.69 schwarze 558: if (a2time(&t, "%Y-%m-%d", in) == 0)
1.77 schwarze 559: mandoc_msg(MANDOCERR_DATE_BAD, ln, pos, "%s", in);
1.71 schwarze 560: else if (t > time(NULL) + 86400)
1.77 schwarze 561: mandoc_msg(MANDOCERR_DATE_FUTURE, ln, pos, "%s", in);
1.82 ! schwarze 562: else if (man->meta.macroset == MACROSET_MDOC)
1.77 schwarze 563: mandoc_msg(MANDOCERR_DATE_LEGACY, ln, pos, "Dd %s", in);
1.66 schwarze 564:
565: /* Use any non-mdoc(7) date verbatim. */
566:
567: return mandoc_strdup(in);
1.5 schwarze 568: }
569:
/*
 * Decide whether the sz bytes starting at p end a sentence.
 *
 * The text is scanned backwards.  Closing delimiters (", ', ], ))
 * seen before any end-of-sentence punctuation mark the punctuation
 * as enclosed; '.', '!' and '?' count as punctuation.  At the first
 * other character, the answer is yes if punctuation was found and
 * it is either not enclosed or directly follows an alphanumeric
 * character.  If the text consists of nothing but delimiters and
 * punctuation, the answer is yes only for unenclosed punctuation.
 *
 * Returns 1 for end of sentence, 0 otherwise (including sz == 0).
 */
int
mandoc_eos(const char *p, size_t sz)
{
	size_t		 i;
	int		 enclosed, found;

	/*
	 * End-of-sentence recognition must include situations where
	 * some symbols, such as `)', allow prior EOS punctuation to
	 * propagate outward.
	 *
	 * Iterate with an index instead of a pointer: stepping a
	 * pointer to p - 1 to terminate the loop is undefined, and
	 * casting sz to int could truncate very long inputs.
	 */

	enclosed = found = 0;
	for (i = sz; i > 0; i--) {
		switch (p[i - 1]) {
		case '\"':
		case '\'':
		case ']':
		case ')':
			if (found == 0)
				enclosed = 1;
			break;
		case '.':
		case '!':
		case '?':
			found = 1;
			break;
		default:
			return found &&
			    (!enclosed || isalnum((unsigned char)p[i - 1]));
		}
	}

	return found && !enclosed;
}
1.26 schwarze 608:
/*
 * Convert the first sz bytes of p to an int in the given base.
 * Return -1 if sz exceeds 31 bytes, if the text is empty,
 * or if strtol(3) does not consume all of it.
 * Values outside the range of int are clamped to INT_MAX or INT_MIN.
 * Note that -1 is also the result of successfully converting "-1".
 * errno is reset to 0 before the conversion and not inspected here.
 */
int
mandoc_strntoi(const char *p, size_t sz, int base)
{
	char		 digits[32];
	char		*stop;
	long		 val;

	/* Leave room for the terminating NUL. */
	if (sz >= sizeof(digits))
		return -1;

	/* Work on a NUL-terminated copy; p need not be terminated. */
	memcpy(digits, p, sz);
	digits[sz] = '\0';

	errno = 0;
	val = strtol(digits, &stop, base);

	if (digits[0] == '\0' || *stop != '\0')
		return -1;

	if (val > INT_MAX)
		return INT_MAX;
	if (val < INT_MIN)
		return INT_MIN;

	return (int)val;
}