Annotation of src/usr.bin/mandoc/mandoc.c, Revision 1.64
1.64 ! schwarze 1: /* $OpenBSD: mandoc.c,v 1.63 2015/10/12 00:07:27 schwarze Exp $ */
1.1 schwarze 2: /*
1.58 schwarze 3: * Copyright (c) 2008-2011, 2014 Kristaps Dzonsons <kristaps@bsd.lv>
4: * Copyright (c) 2011-2015 Ingo Schwarze <schwarze@openbsd.org>
1.1 schwarze 5: *
6: * Permission to use, copy, modify, and distribute this software for any
7: * purpose with or without fee is hereby granted, provided that the above
8: * copyright notice and this permission notice appear in all copies.
9: *
1.21 schwarze 10: * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHORS DISCLAIM ALL WARRANTIES
1.1 schwarze 11: * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
1.21 schwarze 12: * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR
1.1 schwarze 13: * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
14: * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
15: * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
16: * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
17: */
1.2 schwarze 18: #include <sys/types.h>
19:
1.1 schwarze 20: #include <assert.h>
21: #include <ctype.h>
1.26 schwarze 22: #include <errno.h>
23: #include <limits.h>
1.1 schwarze 24: #include <stdlib.h>
1.4 schwarze 25: #include <stdio.h>
26: #include <string.h>
1.5 schwarze 27: #include <time.h>
1.1 schwarze 28:
1.14 schwarze 29: #include "mandoc.h"
1.45 schwarze 30: #include "mandoc_aux.h"
1.1 schwarze 31: #include "libmandoc.h"
32:
/* Size constant for date buffers; not referenced in the code visible here. */
#define DATESIZE 32

/* Parse the string p according to the strptime(3) format fmt into *t. */
static	int	 a2time(time_t *t, const char *fmt, const char *p);
/* Format t as a "Month day, year" string in newly allocated memory. */
static	char	*time2a(time_t t);
1.5 schwarze 37:
1.26 schwarze 38:
39: enum mandoc_esc
1.44 schwarze 40: mandoc_escape(const char **end, const char **start, int *sz)
1.26 schwarze 41: {
1.34 schwarze 42: const char *local_start;
43: int local_sz;
44: char term;
1.48 schwarze 45: enum mandoc_esc gly;
1.26 schwarze 46:
1.34 schwarze 47: /*
48: * When the caller doesn't provide return storage,
49: * use local storage.
50: */
51:
52: if (NULL == start)
53: start = &local_start;
54: if (NULL == sz)
55: sz = &local_sz;
56:
57: /*
58: * Beyond the backslash, at least one input character
59: * is part of the escape sequence. With one exception
60: * (see below), that character won't be returned.
61: */
62:
1.26 schwarze 63: gly = ESCAPE_ERROR;
1.34 schwarze 64: *start = ++*end;
65: *sz = 0;
1.33 schwarze 66: term = '\0';
1.26 schwarze 67:
1.34 schwarze 68: switch ((*start)[-1]) {
1.26 schwarze 69: /*
70: * First the glyphs. There are several different forms of
71: * these, but each eventually returns a substring of the glyph
72: * name.
73: */
1.48 schwarze 74: case '(':
1.26 schwarze 75: gly = ESCAPE_SPECIAL;
1.34 schwarze 76: *sz = 2;
1.26 schwarze 77: break;
1.48 schwarze 78: case '[':
1.26 schwarze 79: gly = ESCAPE_SPECIAL;
80: term = ']';
81: break;
1.48 schwarze 82: case 'C':
1.34 schwarze 83: if ('\'' != **start)
1.62 schwarze 84: return ESCAPE_ERROR;
1.34 schwarze 85: *start = ++*end;
1.54 schwarze 86: gly = ESCAPE_SPECIAL;
1.26 schwarze 87: term = '\'';
88: break;
1.41 schwarze 89:
90: /*
91: * Escapes taking no arguments at all.
92: */
1.48 schwarze 93: case 'd':
94: case 'u':
1.61 schwarze 95: case ',':
96: case '/':
1.62 schwarze 97: return ESCAPE_IGNORE;
1.32 schwarze 98:
99: /*
100: * The \z escape is supposed to output the following
1.48 schwarze 101: * character without advancing the cursor position.
1.32 schwarze 102: * Since we are mostly dealing with terminal mode,
103: * let us just skip the next character.
104: */
1.48 schwarze 105: case 'z':
1.62 schwarze 106: return ESCAPE_SKIPCHAR;
1.1 schwarze 107:
1.26 schwarze 108: /*
109: * Handle all triggers matching \X(xy, \Xx, and \X[xxxx], where
110: * 'X' is the trigger. These have opaque sub-strings.
111: */
1.48 schwarze 112: case 'F':
113: case 'g':
114: case 'k':
115: case 'M':
116: case 'm':
117: case 'n':
118: case 'V':
119: case 'Y':
1.29 schwarze 120: gly = ESCAPE_IGNORE;
1.1 schwarze 121: /* FALLTHROUGH */
1.48 schwarze 122: case 'f':
1.26 schwarze 123: if (ESCAPE_ERROR == gly)
124: gly = ESCAPE_FONT;
1.34 schwarze 125: switch (**start) {
1.48 schwarze 126: case '(':
1.34 schwarze 127: *start = ++*end;
128: *sz = 2;
1.26 schwarze 129: break;
1.48 schwarze 130: case '[':
1.34 schwarze 131: *start = ++*end;
1.26 schwarze 132: term = ']';
133: break;
134: default:
1.34 schwarze 135: *sz = 1;
1.26 schwarze 136: break;
137: }
138: break;
139:
140: /*
141: * These escapes are of the form \X'Y', where 'X' is the trigger
142: * and 'Y' is any string. These have opaque sub-strings.
1.47 schwarze 143: * The \B and \w escapes are handled in roff.c, roff_res().
1.26 schwarze 144: */
1.48 schwarze 145: case 'A':
146: case 'b':
147: case 'D':
148: case 'R':
149: case 'X':
150: case 'Z':
1.59 schwarze 151: gly = ESCAPE_IGNORE;
152: /* FALLTHROUGH */
153: case 'o':
154: if (**start == '\0')
1.62 schwarze 155: return ESCAPE_ERROR;
1.59 schwarze 156: if (gly == ESCAPE_ERROR)
157: gly = ESCAPE_OVERSTRIKE;
1.46 schwarze 158: term = **start;
1.34 schwarze 159: *start = ++*end;
1.16 schwarze 160: break;
1.26 schwarze 161:
162: /*
163: * These escapes are of the form \X'N', where 'X' is the trigger
164: * and 'N' resolves to a numerical expression.
165: */
1.48 schwarze 166: case 'h':
167: case 'H':
168: case 'L':
169: case 'l':
170: case 'S':
171: case 'v':
172: case 'x':
1.51 schwarze 173: if (strchr(" %&()*+-./0123456789:<=>", **start)) {
1.53 schwarze 174: if ('\0' != **start)
175: ++*end;
1.62 schwarze 176: return ESCAPE_ERROR;
1.51 schwarze 177: }
1.42 schwarze 178: gly = ESCAPE_IGNORE;
1.46 schwarze 179: term = **start;
1.34 schwarze 180: *start = ++*end;
1.26 schwarze 181: break;
1.29 schwarze 182:
183: /*
184: * Special handling for the numbered character escape.
185: * XXX Do any other escapes need similar handling?
186: */
1.48 schwarze 187: case 'N':
1.34 schwarze 188: if ('\0' == **start)
1.62 schwarze 189: return ESCAPE_ERROR;
1.34 schwarze 190: (*end)++;
191: if (isdigit((unsigned char)**start)) {
192: *sz = 1;
1.62 schwarze 193: return ESCAPE_IGNORE;
1.34 schwarze 194: }
195: (*start)++;
1.29 schwarze 196: while (isdigit((unsigned char)**end))
197: (*end)++;
1.34 schwarze 198: *sz = *end - *start;
1.29 schwarze 199: if ('\0' != **end)
200: (*end)++;
1.62 schwarze 201: return ESCAPE_NUMBERED;
1.26 schwarze 202:
1.48 schwarze 203: /*
1.26 schwarze 204: * Sizes get a special category of their own.
205: */
1.48 schwarze 206: case 's':
1.26 schwarze 207: gly = ESCAPE_IGNORE;
1.17 schwarze 208:
1.26 schwarze 209: /* See +/- counts as a sign. */
1.34 schwarze 210: if ('+' == **end || '-' == **end || ASCII_HYPH == **end)
1.58 schwarze 211: *start = ++*end;
1.6 schwarze 212:
1.34 schwarze 213: switch (**end) {
1.48 schwarze 214: case '(':
1.34 schwarze 215: *start = ++*end;
216: *sz = 2;
1.16 schwarze 217: break;
1.48 schwarze 218: case '[':
1.34 schwarze 219: *start = ++*end;
1.33 schwarze 220: term = ']';
1.16 schwarze 221: break;
1.48 schwarze 222: case '\'':
1.34 schwarze 223: *start = ++*end;
1.33 schwarze 224: term = '\'';
1.60 schwarze 225: break;
226: case '3':
227: case '2':
228: case '1':
229: *sz = (*end)[-1] == 's' &&
230: isdigit((unsigned char)(*end)[1]) ? 2 : 1;
1.16 schwarze 231: break;
232: default:
1.34 schwarze 233: *sz = 1;
1.16 schwarze 234: break;
1.6 schwarze 235: }
236:
1.26 schwarze 237: break;
238:
239: /*
240: * Anything else is assumed to be a glyph.
1.34 schwarze 241: * In this case, pass back the character after the backslash.
1.26 schwarze 242: */
243: default:
244: gly = ESCAPE_SPECIAL;
1.34 schwarze 245: *start = --*end;
246: *sz = 1;
1.26 schwarze 247: break;
248: }
249:
250: assert(ESCAPE_ERROR != gly);
251:
252: /*
1.33 schwarze 253: * Read up to the terminating character,
254: * paying attention to nested escapes.
1.26 schwarze 255: */
256:
257: if ('\0' != term) {
1.33 schwarze 258: while (**end != term) {
259: switch (**end) {
1.48 schwarze 260: case '\0':
1.62 schwarze 261: return ESCAPE_ERROR;
1.48 schwarze 262: case '\\':
1.33 schwarze 263: (*end)++;
264: if (ESCAPE_ERROR ==
265: mandoc_escape(end, NULL, NULL))
1.62 schwarze 266: return ESCAPE_ERROR;
1.33 schwarze 267: break;
268: default:
269: (*end)++;
270: break;
271: }
272: }
1.34 schwarze 273: *sz = (*end)++ - *start;
1.33 schwarze 274: } else {
1.34 schwarze 275: assert(*sz > 0);
276: if ((size_t)*sz > strlen(*start))
1.62 schwarze 277: return ESCAPE_ERROR;
1.34 schwarze 278: *end += *sz;
1.26 schwarze 279: }
1.19 schwarze 280:
1.26 schwarze 281: /* Run post-processors. */
1.19 schwarze 282:
1.26 schwarze 283: switch (gly) {
1.48 schwarze 284: case ESCAPE_FONT:
1.37 schwarze 285: if (2 == *sz) {
286: if ('C' == **start) {
287: /*
288: * Treat constant-width font modes
289: * just like regular font modes.
290: */
291: (*start)++;
292: (*sz)--;
293: } else {
294: if ('B' == (*start)[0] && 'I' == (*start)[1])
295: gly = ESCAPE_FONTBI;
296: break;
297: }
1.34 schwarze 298: } else if (1 != *sz)
1.26 schwarze 299: break;
1.30 schwarze 300:
1.34 schwarze 301: switch (**start) {
1.48 schwarze 302: case '3':
303: case 'B':
1.26 schwarze 304: gly = ESCAPE_FONTBOLD;
305: break;
1.48 schwarze 306: case '2':
307: case 'I':
1.26 schwarze 308: gly = ESCAPE_FONTITALIC;
1.16 schwarze 309: break;
1.48 schwarze 310: case 'P':
1.26 schwarze 311: gly = ESCAPE_FONTPREV;
1.16 schwarze 312: break;
1.48 schwarze 313: case '1':
314: case 'R':
1.26 schwarze 315: gly = ESCAPE_FONTROMAN;
1.1 schwarze 316: break;
317: }
1.16 schwarze 318: break;
1.48 schwarze 319: case ESCAPE_SPECIAL:
1.34 schwarze 320: if (1 == *sz && 'c' == **start)
1.26 schwarze 321: gly = ESCAPE_NOSPACE;
1.54 schwarze 322: /*
1.55 schwarze 323: * Unicode escapes are defined in groff as \[u0000]
1.54 schwarze 324: * to \[u10FFFF], where the contained value must be
325: * a valid Unicode codepoint. Here, however, only
1.55 schwarze 326: * check the length and range.
1.54 schwarze 327: */
1.55 schwarze 328: if (**start != 'u' || *sz < 5 || *sz > 7)
329: break;
330: if (*sz == 7 && ((*start)[1] != '1' || (*start)[2] != '0'))
331: break;
332: if (*sz == 6 && (*start)[1] == '0')
1.64 ! schwarze 333: break;
! 334: if (*sz == 5 && (*start)[1] == 'D' &&
! 335: strchr("89ABCDEF", (*start)[2]) != NULL)
1.55 schwarze 336: break;
337: if ((int)strspn(*start + 1, "0123456789ABCDEFabcdef")
1.54 schwarze 338: + 1 == *sz)
339: gly = ESCAPE_UNICODE;
1.16 schwarze 340: break;
1.1 schwarze 341: default:
1.16 schwarze 342: break;
1.1 schwarze 343: }
344:
1.62 schwarze 345: return gly;
1.21 schwarze 346: }
347:
348: /*
349: * Parse a quoted or unquoted roff-style request or macro argument.
350: * Return a pointer to the parsed argument, which is either the original
351: * pointer or advanced by one byte in case the argument is quoted.
1.40 schwarze 352: * NUL-terminate the argument in place.
1.21 schwarze 353: * Collapse pairs of quotes inside quoted arguments.
354: * Advance the argument pointer to the next argument,
1.40 schwarze 355: * or to the NUL byte terminating the argument line.
1.21 schwarze 356: */
357: char *
1.25 schwarze 358: mandoc_getarg(struct mparse *parse, char **cpp, int ln, int *pos)
1.21 schwarze 359: {
360: char *start, *cp;
361: int quoted, pairs, white;
362:
363: /* Quoting can only start with a new word. */
364: start = *cpp;
1.26 schwarze 365: quoted = 0;
1.21 schwarze 366: if ('"' == *start) {
367: quoted = 1;
368: start++;
1.48 schwarze 369: }
1.21 schwarze 370:
371: pairs = 0;
372: white = 0;
373: for (cp = start; '\0' != *cp; cp++) {
1.36 schwarze 374:
375: /*
376: * Move the following text left
377: * after quoted quotes and after "\\" and "\t".
378: */
1.21 schwarze 379: if (pairs)
380: cp[-pairs] = cp[0];
1.36 schwarze 381:
1.21 schwarze 382: if ('\\' == cp[0]) {
1.36 schwarze 383: /*
384: * In copy mode, translate double to single
385: * backslashes and backslash-t to literal tabs.
386: */
387: switch (cp[1]) {
1.48 schwarze 388: case 't':
1.36 schwarze 389: cp[0] = '\t';
390: /* FALLTHROUGH */
1.48 schwarze 391: case '\\':
1.21 schwarze 392: pairs++;
393: cp++;
1.36 schwarze 394: break;
1.48 schwarze 395: case ' ':
1.21 schwarze 396: /* Skip escaped blanks. */
1.36 schwarze 397: if (0 == quoted)
398: cp++;
399: break;
400: default:
401: break;
402: }
1.21 schwarze 403: } else if (0 == quoted) {
404: if (' ' == cp[0]) {
405: /* Unescaped blanks end unquoted args. */
406: white = 1;
407: break;
408: }
409: } else if ('"' == cp[0]) {
410: if ('"' == cp[1]) {
411: /* Quoted quotes collapse. */
412: pairs++;
413: cp++;
414: } else {
415: /* Unquoted quotes end quoted args. */
416: quoted = 2;
417: break;
418: }
419: }
420: }
421:
422: /* Quoted argument without a closing quote. */
1.25 schwarze 423: if (1 == quoted)
1.52 schwarze 424: mandoc_msg(MANDOCERR_ARG_QUOTE, parse, ln, *pos, NULL);
1.21 schwarze 425:
1.40 schwarze 426: /* NUL-terminate this argument and move to the next one. */
1.21 schwarze 427: if (pairs)
428: cp[-pairs] = '\0';
429: if ('\0' != *cp) {
430: *cp++ = '\0';
431: while (' ' == *cp)
432: cp++;
433: }
1.24 schwarze 434: *pos += (int)(cp - start) + (quoted ? 1 : 0);
1.21 schwarze 435: *cpp = cp;
436:
1.25 schwarze 437: if ('\0' == *cp && (white || ' ' == cp[-1]))
1.52 schwarze 438: mandoc_msg(MANDOCERR_SPACE_EOL, parse, ln, *pos, NULL);
1.21 schwarze 439:
1.62 schwarze 440: return start;
1.4 schwarze 441: }
1.5 schwarze 442:
/*
 * Convert the date string p to a time_t according to the
 * strptime(3) format fmt.
 * Return 1 and store the mktime(3) result in *t if the format
 * matches the complete string; return 0 and leave *t untouched
 * otherwise.
 */
static int
a2time(time_t *t, const char *fmt, const char *p)
{
	struct tm	 parsed;
	const char	*rest;

	/* Fields not set by the format must not contain garbage. */
	memset(&parsed, 0, sizeof(parsed));

	/* Reject both mismatches and trailing unparsed input. */
	rest = strptime(p, fmt, &parsed);
	if (rest == NULL || *rest != '\0')
		return 0;

	*t = mktime(&parsed);
	return 1;
}
459:
/*
 * Format the local time corresponding to t as "Month day, year",
 * for example "October 12, 2015".
 * Return a buffer obtained from mandoc_malloc(), or NULL if the
 * time cannot be converted or formatted.
 */
static char *
time2a(time_t t)
{
	struct tm	*tm;
	char		*buf, *cursor;
	size_t		 monthlen;
	int		 daylen;

	tm = localtime(&t);
	if (tm == NULL)
		return NULL;

	/*
	 * Reserve space:
	 * up to 9 characters for the month (September) + blank
	 * up to 2 characters for the day + comma + blank
	 * 4 characters for the year and a terminating '\0'
	 */
	buf = mandoc_malloc(10 + 4 + 4 + 1);
	cursor = buf;

	/* Full month name and a blank. */
	monthlen = strftime(cursor, 10 + 1, "%B ", tm);
	if (monthlen == 0) {
		free(buf);
		return NULL;
	}
	cursor += (int)monthlen;

	/* Day of the month without leading zero, comma, blank. */
	daylen = snprintf(cursor, 4 + 1, "%d, ", tm->tm_mday);
	if (daylen == -1) {
		free(buf);
		return NULL;
	}
	cursor += daylen;

	/* Year; fails if it needs more than four characters. */
	if (strftime(cursor, 4 + 1, "%Y", tm) == 0) {
		free(buf);
		return NULL;
	}

	return buf;
}
496:
497: char *
1.25 schwarze 498: mandoc_normdate(struct mparse *parse, char *in, int ln, int pos)
1.5 schwarze 499: {
1.22 schwarze 500: char *out;
1.5 schwarze 501: time_t t;
502:
1.22 schwarze 503: if (NULL == in || '\0' == *in ||
504: 0 == strcmp(in, "$" "Mdocdate$")) {
1.49 schwarze 505: mandoc_msg(MANDOCERR_DATE_MISSING, parse, ln, pos, NULL);
1.22 schwarze 506: time(&t);
507: }
1.31 schwarze 508: else if (a2time(&t, "%Y-%m-%d", in))
509: t = 0;
1.22 schwarze 510: else if (!a2time(&t, "$" "Mdocdate: %b %d %Y $", in) &&
1.31 schwarze 511: !a2time(&t, "%b %d, %Y", in)) {
1.50 schwarze 512: mandoc_msg(MANDOCERR_DATE_BAD, parse, ln, pos, in);
1.22 schwarze 513: t = 0;
1.5 schwarze 514: }
1.22 schwarze 515: out = t ? time2a(t) : NULL;
1.62 schwarze 516: return out ? out : mandoc_strdup(in);
1.5 schwarze 517: }
518:
/*
 * Decide whether the sz bytes starting at p end a sentence.
 *
 * Scanning backwards from the end, closing quotes, brackets and
 * parentheses are skipped, and '.', '!' and '?' count as
 * end-of-sentence punctuation.  The first other character decides:
 * the result is 1 if punctuation was found and either no closing
 * delimiter followed it or that character is alphanumeric.
 * If the text consists of nothing but such characters, the result
 * is 1 only if punctuation was found with no closing delimiter
 * after it.  Return 0 in all other cases, including sz == 0.
 */
int
mandoc_eos(const char *p, size_t sz)
{
	size_t	 i;
	int	 enclosed, found;

	/*
	 * End-of-sentence recognition must include situations where
	 * some symbols, such as `)', allow prior EOS punctuation to
	 * propagate outward.
	 */

	enclosed = found = 0;

	/*
	 * Count down with an index rather than a pointer:
	 * stepping a pointer below p would be undefined behaviour,
	 * and size_t copes with lengths that do not fit an int.
	 */
	for (i = sz; i > 0; i--) {
		switch (p[i - 1]) {
		case '\"':
		case '\'':
		case ']':
		case ')':
			/* Only delimiters after the punctuation count. */
			if (found == 0)
				enclosed = 1;
			break;
		case '.':
		case '!':
		case '?':
			found = 1;
			break;
		default:
			return found &&
			    (!enclosed || isalnum((unsigned char)p[i - 1]));
		}
	}

	return found && !enclosed;
}
1.26 schwarze 557:
/*
 * Convert the first sz bytes of p to an int with strtol(3),
 * using the given base.
 * Return -1 if sz exceeds 31 bytes, if the input is empty,
 * or if it is not completely consumed by the conversion.
 * Values outside the range of int are clamped to INT_MIN
 * or INT_MAX.  Note that negative input is converted normally,
 * so -1 is not a unique error indicator.
 */
int
mandoc_strntoi(const char *p, size_t sz, int base)
{
	char	 digits[32];
	char	*stop;
	long	 val;

	/* The input needs a NUL-terminated copy that fits the buffer. */
	if (sz >= sizeof(digits))
		return -1;
	memcpy(digits, p, sz);
	digits[sz] = '\0';

	errno = 0;
	val = strtol(digits, &stop, base);

	/* Nothing to convert, or trailing garbage. */
	if (digits[0] == '\0' || *stop != '\0')
		return -1;

	/* Saturate instead of truncating. */
	if (val > INT_MAX)
		return INT_MAX;
	if (val < INT_MIN)
		return INT_MIN;
	return (int)val;
}