Annotation of src/usr.bin/mandoc/mandoc.c, Revision 1.51
1.51 ! schwarze 1: /* $Id: mandoc.c,v 1.50 2014/07/01 22:36:35 schwarze Exp $ */
1.1 schwarze 2: /*
1.24 schwarze 3: * Copyright (c) 2008, 2009, 2010, 2011 Kristaps Dzonsons <kristaps@bsd.lv>
1.47 schwarze 4: * Copyright (c) 2011, 2012, 2013, 2014 Ingo Schwarze <schwarze@openbsd.org>
1.1 schwarze 5: *
6: * Permission to use, copy, modify, and distribute this software for any
7: * purpose with or without fee is hereby granted, provided that the above
8: * copyright notice and this permission notice appear in all copies.
9: *
1.21 schwarze 10: * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHORS DISCLAIM ALL WARRANTIES
1.1 schwarze 11: * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
1.21 schwarze 12: * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR
1.1 schwarze 13: * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
14: * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
15: * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
16: * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
17: */
1.2 schwarze 18: #include <sys/types.h>
19:
1.1 schwarze 20: #include <assert.h>
21: #include <ctype.h>
1.26 schwarze 22: #include <errno.h>
23: #include <limits.h>
1.1 schwarze 24: #include <stdlib.h>
1.4 schwarze 25: #include <stdio.h>
26: #include <string.h>
1.5 schwarze 27: #include <time.h>
1.1 schwarze 28:
1.14 schwarze 29: #include "mandoc.h"
1.45 schwarze 30: #include "mandoc_aux.h"
1.1 schwarze 31: #include "libmandoc.h"
32:
1.22 schwarze 33: #define DATESIZE 32
34:
1.14 schwarze 35: static int a2time(time_t *, const char *, const char *);
1.22 schwarze 36: static char *time2a(time_t);
1.5 schwarze 37:
1.26 schwarze 38:
39: enum mandoc_esc
1.44 schwarze 40: mandoc_escape(const char **end, const char **start, int *sz)
1.26 schwarze 41: {
1.34 schwarze 42: const char *local_start;
43: int local_sz;
44: char term;
1.48 schwarze 45: enum mandoc_esc gly;
1.26 schwarze 46:
1.34 schwarze 47: /*
48: * When the caller doesn't provide return storage,
49: * use local storage.
50: */
51:
52: if (NULL == start)
53: start = &local_start;
54: if (NULL == sz)
55: sz = &local_sz;
56:
57: /*
58: * Beyond the backslash, at least one input character
59: * is part of the escape sequence. With one exception
60: * (see below), that character won't be returned.
61: */
62:
1.26 schwarze 63: gly = ESCAPE_ERROR;
1.34 schwarze 64: *start = ++*end;
65: *sz = 0;
1.33 schwarze 66: term = '\0';
1.26 schwarze 67:
1.34 schwarze 68: switch ((*start)[-1]) {
1.26 schwarze 69: /*
70: * First the glyphs. There are several different forms of
71: * these, but each eventually returns a substring of the glyph
72: * name.
73: */
1.48 schwarze 74: case '(':
1.26 schwarze 75: gly = ESCAPE_SPECIAL;
1.34 schwarze 76: *sz = 2;
1.26 schwarze 77: break;
1.48 schwarze 78: case '[':
1.26 schwarze 79: gly = ESCAPE_SPECIAL;
80: /*
81: * Unicode escapes are defined in groff as \[uXXXX] to
82: * \[u10FFFF], where the contained value must be a valid
83: * Unicode codepoint. Here, however, only check whether
84: * it's not a zero-width escape.
85: */
1.34 schwarze 86: if ('u' == (*start)[0] && ']' != (*start)[1])
1.26 schwarze 87: gly = ESCAPE_UNICODE;
88: term = ']';
89: break;
1.48 schwarze 90: case 'C':
1.34 schwarze 91: if ('\'' != **start)
1.26 schwarze 92: return(ESCAPE_ERROR);
1.34 schwarze 93: *start = ++*end;
1.39 schwarze 94: if ('u' == (*start)[0] && '\'' != (*start)[1])
95: gly = ESCAPE_UNICODE;
96: else
97: gly = ESCAPE_SPECIAL;
1.26 schwarze 98: term = '\'';
99: break;
1.41 schwarze 100:
101: /*
102: * Escapes taking no arguments at all.
103: */
1.48 schwarze 104: case 'd':
1.41 schwarze 105: /* FALLTHROUGH */
1.48 schwarze 106: case 'u':
1.41 schwarze 107: return(ESCAPE_IGNORE);
1.32 schwarze 108:
109: /*
110: * The \z escape is supposed to output the following
1.48 schwarze 111: * character without advancing the cursor position.
1.32 schwarze 112: * Since we are mostly dealing with terminal mode,
113: * let us just skip the next character.
114: */
1.48 schwarze 115: case 'z':
1.32 schwarze 116: return(ESCAPE_SKIPCHAR);
1.1 schwarze 117:
1.26 schwarze 118: /*
119: * Handle all triggers matching \X(xy, \Xx, and \X[xxxx], where
120: * 'X' is the trigger. These have opaque sub-strings.
121: */
1.48 schwarze 122: case 'F':
1.16 schwarze 123: /* FALLTHROUGH */
1.48 schwarze 124: case 'g':
1.16 schwarze 125: /* FALLTHROUGH */
1.48 schwarze 126: case 'k':
1.1 schwarze 127: /* FALLTHROUGH */
1.48 schwarze 128: case 'M':
1.14 schwarze 129: /* FALLTHROUGH */
1.48 schwarze 130: case 'm':
1.1 schwarze 131: /* FALLTHROUGH */
1.48 schwarze 132: case 'n':
1.1 schwarze 133: /* FALLTHROUGH */
1.48 schwarze 134: case 'V':
1.1 schwarze 135: /* FALLTHROUGH */
1.48 schwarze 136: case 'Y':
1.29 schwarze 137: gly = ESCAPE_IGNORE;
1.1 schwarze 138: /* FALLTHROUGH */
1.48 schwarze 139: case 'f':
1.26 schwarze 140: if (ESCAPE_ERROR == gly)
141: gly = ESCAPE_FONT;
1.34 schwarze 142: switch (**start) {
1.48 schwarze 143: case '(':
1.34 schwarze 144: *start = ++*end;
145: *sz = 2;
1.26 schwarze 146: break;
1.48 schwarze 147: case '[':
1.34 schwarze 148: *start = ++*end;
1.26 schwarze 149: term = ']';
150: break;
151: default:
1.34 schwarze 152: *sz = 1;
1.26 schwarze 153: break;
154: }
155: break;
156:
157: /*
158: * These escapes are of the form \X'Y', where 'X' is the trigger
159: * and 'Y' is any string. These have opaque sub-strings.
1.47 schwarze 160: * The \B and \w escapes are handled in roff.c, roff_res().
1.26 schwarze 161: */
1.48 schwarze 162: case 'A':
1.13 schwarze 163: /* FALLTHROUGH */
1.48 schwarze 164: case 'b':
1.1 schwarze 165: /* FALLTHROUGH */
1.48 schwarze 166: case 'D':
1.1 schwarze 167: /* FALLTHROUGH */
1.48 schwarze 168: case 'o':
1.1 schwarze 169: /* FALLTHROUGH */
1.48 schwarze 170: case 'R':
1.42 schwarze 171: /* FALLTHROUGH */
1.48 schwarze 172: case 'X':
1.1 schwarze 173: /* FALLTHROUGH */
1.48 schwarze 174: case 'Z':
1.46 schwarze 175: if ('\0' == **start)
1.26 schwarze 176: return(ESCAPE_ERROR);
177: gly = ESCAPE_IGNORE;
1.46 schwarze 178: term = **start;
1.34 schwarze 179: *start = ++*end;
1.16 schwarze 180: break;
1.26 schwarze 181:
182: /*
183: * These escapes are of the form \X'N', where 'X' is the trigger
184: * and 'N' resolves to a numerical expression.
185: */
1.48 schwarze 186: case 'h':
1.17 schwarze 187: /* FALLTHROUGH */
1.48 schwarze 188: case 'H':
1.26 schwarze 189: /* FALLTHROUGH */
1.48 schwarze 190: case 'L':
1.26 schwarze 191: /* FALLTHROUGH */
1.48 schwarze 192: case 'l':
1.26 schwarze 193: /* FALLTHROUGH */
1.48 schwarze 194: case 'S':
1.26 schwarze 195: /* FALLTHROUGH */
1.48 schwarze 196: case 'v':
1.17 schwarze 197: /* FALLTHROUGH */
1.48 schwarze 198: case 'x':
1.51 ! schwarze 199: if (strchr(" %&()*+-./0123456789:<=>", **start)) {
! 200: ++*end;
1.34 schwarze 201: return(ESCAPE_ERROR);
1.51 ! schwarze 202: }
1.42 schwarze 203: gly = ESCAPE_IGNORE;
1.46 schwarze 204: term = **start;
1.34 schwarze 205: *start = ++*end;
1.26 schwarze 206: break;
1.29 schwarze 207:
208: /*
209: * Special handling for the numbered character escape.
210: * XXX Do any other escapes need similar handling?
211: */
1.48 schwarze 212: case 'N':
1.34 schwarze 213: if ('\0' == **start)
1.29 schwarze 214: return(ESCAPE_ERROR);
1.34 schwarze 215: (*end)++;
216: if (isdigit((unsigned char)**start)) {
217: *sz = 1;
1.29 schwarze 218: return(ESCAPE_IGNORE);
1.34 schwarze 219: }
220: (*start)++;
1.29 schwarze 221: while (isdigit((unsigned char)**end))
222: (*end)++;
1.34 schwarze 223: *sz = *end - *start;
1.29 schwarze 224: if ('\0' != **end)
225: (*end)++;
226: return(ESCAPE_NUMBERED);
1.26 schwarze 227:
1.48 schwarze 228: /*
1.26 schwarze 229: * Sizes get a special category of their own.
230: */
1.48 schwarze 231: case 's':
1.26 schwarze 232: gly = ESCAPE_IGNORE;
1.17 schwarze 233:
1.26 schwarze 234: /* See +/- counts as a sign. */
1.34 schwarze 235: if ('+' == **end || '-' == **end || ASCII_HYPH == **end)
236: (*end)++;
1.6 schwarze 237:
1.34 schwarze 238: switch (**end) {
1.48 schwarze 239: case '(':
1.34 schwarze 240: *start = ++*end;
241: *sz = 2;
1.16 schwarze 242: break;
1.48 schwarze 243: case '[':
1.34 schwarze 244: *start = ++*end;
1.33 schwarze 245: term = ']';
1.16 schwarze 246: break;
1.48 schwarze 247: case '\'':
1.34 schwarze 248: *start = ++*end;
1.33 schwarze 249: term = '\'';
1.16 schwarze 250: break;
251: default:
1.34 schwarze 252: *sz = 1;
1.16 schwarze 253: break;
1.6 schwarze 254: }
255:
1.26 schwarze 256: break;
257:
258: /*
259: * Anything else is assumed to be a glyph.
1.34 schwarze 260: * In this case, pass back the character after the backslash.
1.26 schwarze 261: */
262: default:
263: gly = ESCAPE_SPECIAL;
1.34 schwarze 264: *start = --*end;
265: *sz = 1;
1.26 schwarze 266: break;
267: }
268:
269: assert(ESCAPE_ERROR != gly);
270:
271: /*
1.33 schwarze 272: * Read up to the terminating character,
273: * paying attention to nested escapes.
1.26 schwarze 274: */
275:
276: if ('\0' != term) {
1.33 schwarze 277: while (**end != term) {
278: switch (**end) {
1.48 schwarze 279: case '\0':
1.33 schwarze 280: return(ESCAPE_ERROR);
1.48 schwarze 281: case '\\':
1.33 schwarze 282: (*end)++;
283: if (ESCAPE_ERROR ==
284: mandoc_escape(end, NULL, NULL))
285: return(ESCAPE_ERROR);
286: break;
287: default:
288: (*end)++;
289: break;
290: }
291: }
1.34 schwarze 292: *sz = (*end)++ - *start;
1.33 schwarze 293: } else {
1.34 schwarze 294: assert(*sz > 0);
295: if ((size_t)*sz > strlen(*start))
1.26 schwarze 296: return(ESCAPE_ERROR);
1.34 schwarze 297: *end += *sz;
1.26 schwarze 298: }
1.19 schwarze 299:
1.26 schwarze 300: /* Run post-processors. */
1.19 schwarze 301:
1.26 schwarze 302: switch (gly) {
1.48 schwarze 303: case ESCAPE_FONT:
1.37 schwarze 304: if (2 == *sz) {
305: if ('C' == **start) {
306: /*
307: * Treat constant-width font modes
308: * just like regular font modes.
309: */
310: (*start)++;
311: (*sz)--;
312: } else {
313: if ('B' == (*start)[0] && 'I' == (*start)[1])
314: gly = ESCAPE_FONTBI;
315: break;
316: }
1.34 schwarze 317: } else if (1 != *sz)
1.26 schwarze 318: break;
1.30 schwarze 319:
1.34 schwarze 320: switch (**start) {
1.48 schwarze 321: case '3':
1.26 schwarze 322: /* FALLTHROUGH */
1.48 schwarze 323: case 'B':
1.26 schwarze 324: gly = ESCAPE_FONTBOLD;
325: break;
1.48 schwarze 326: case '2':
1.26 schwarze 327: /* FALLTHROUGH */
1.48 schwarze 328: case 'I':
1.26 schwarze 329: gly = ESCAPE_FONTITALIC;
1.16 schwarze 330: break;
1.48 schwarze 331: case 'P':
1.26 schwarze 332: gly = ESCAPE_FONTPREV;
1.16 schwarze 333: break;
1.48 schwarze 334: case '1':
1.26 schwarze 335: /* FALLTHROUGH */
1.48 schwarze 336: case 'R':
1.26 schwarze 337: gly = ESCAPE_FONTROMAN;
1.1 schwarze 338: break;
339: }
1.16 schwarze 340: break;
1.48 schwarze 341: case ESCAPE_SPECIAL:
1.34 schwarze 342: if (1 == *sz && 'c' == **start)
1.26 schwarze 343: gly = ESCAPE_NOSPACE;
1.16 schwarze 344: break;
1.1 schwarze 345: default:
1.16 schwarze 346: break;
1.1 schwarze 347: }
348:
1.26 schwarze 349: return(gly);
1.21 schwarze 350: }
351:
352: /*
353: * Parse a quoted or unquoted roff-style request or macro argument.
354: * Return a pointer to the parsed argument, which is either the original
355: * pointer or advanced by one byte in case the argument is quoted.
1.40 schwarze 356: * NUL-terminate the argument in place.
1.21 schwarze 357: * Collapse pairs of quotes inside quoted arguments.
358: * Advance the argument pointer to the next argument,
1.40 schwarze 359: * or to the NUL byte terminating the argument line.
1.21 schwarze 360: */
361: char *
1.25 schwarze 362: mandoc_getarg(struct mparse *parse, char **cpp, int ln, int *pos)
1.21 schwarze 363: {
364: char *start, *cp;
365: int quoted, pairs, white;
366:
367: /* Quoting can only start with a new word. */
368: start = *cpp;
1.26 schwarze 369: quoted = 0;
1.21 schwarze 370: if ('"' == *start) {
371: quoted = 1;
372: start++;
1.48 schwarze 373: }
1.21 schwarze 374:
375: pairs = 0;
376: white = 0;
377: for (cp = start; '\0' != *cp; cp++) {
1.36 schwarze 378:
379: /*
380: * Move the following text left
381: * after quoted quotes and after "\\" and "\t".
382: */
1.21 schwarze 383: if (pairs)
384: cp[-pairs] = cp[0];
1.36 schwarze 385:
1.21 schwarze 386: if ('\\' == cp[0]) {
1.36 schwarze 387: /*
388: * In copy mode, translate double to single
389: * backslashes and backslash-t to literal tabs.
390: */
391: switch (cp[1]) {
1.48 schwarze 392: case 't':
1.36 schwarze 393: cp[0] = '\t';
394: /* FALLTHROUGH */
1.48 schwarze 395: case '\\':
1.21 schwarze 396: pairs++;
397: cp++;
1.36 schwarze 398: break;
1.48 schwarze 399: case ' ':
1.21 schwarze 400: /* Skip escaped blanks. */
1.36 schwarze 401: if (0 == quoted)
402: cp++;
403: break;
404: default:
405: break;
406: }
1.21 schwarze 407: } else if (0 == quoted) {
408: if (' ' == cp[0]) {
409: /* Unescaped blanks end unquoted args. */
410: white = 1;
411: break;
412: }
413: } else if ('"' == cp[0]) {
414: if ('"' == cp[1]) {
415: /* Quoted quotes collapse. */
416: pairs++;
417: cp++;
418: } else {
419: /* Unquoted quotes end quoted args. */
420: quoted = 2;
421: break;
422: }
423: }
424: }
425:
426: /* Quoted argument without a closing quote. */
1.25 schwarze 427: if (1 == quoted)
428: mandoc_msg(MANDOCERR_BADQUOTE, parse, ln, *pos, NULL);
1.21 schwarze 429:
1.40 schwarze 430: /* NUL-terminate this argument and move to the next one. */
1.21 schwarze 431: if (pairs)
432: cp[-pairs] = '\0';
433: if ('\0' != *cp) {
434: *cp++ = '\0';
435: while (' ' == *cp)
436: cp++;
437: }
1.24 schwarze 438: *pos += (int)(cp - start) + (quoted ? 1 : 0);
1.21 schwarze 439: *cpp = cp;
440:
1.25 schwarze 441: if ('\0' == *cp && (white || ' ' == cp[-1]))
442: mandoc_msg(MANDOCERR_EOLNSPACE, parse, ln, *pos, NULL);
1.21 schwarze 443:
444: return(start);
1.4 schwarze 445: }
1.5 schwarze 446:
/*
 * Convert the string p to a time_t according to the strptime(3)
 * format fmt.  The whole string has to match the format.
 * On success, store the result of mktime(3) in *t and return 1.
 * Otherwise, leave *t alone and return 0.
 */
static int
a2time(time_t *t, const char *fmt, const char *p)
{
	struct tm	 parsed;
	char		*rest;

	/* Fields not set by the format must not contain garbage. */
	memset(&parsed, 0, sizeof(parsed));

	/* Reject both parse failures and trailing input. */
	rest = strptime(p, fmt, &parsed);
	if (NULL == rest || '\0' != *rest)
		return(0);

	*t = mktime(&parsed);
	return(1);
}
463:
/*
 * Format the time t as "Month day, year" in local time.
 * Return a string allocated with mandoc_malloc() that the caller
 * has to free(3), or NULL if the time cannot be converted or
 * formatted.
 */
static char *
time2a(time_t t)
{
	struct tm	*tm;
	char		*buf, *p;
	size_t		 ssz;
	int		 isz;

	/* localtime(3) returns NULL if t cannot be represented. */
	tm = localtime(&t);
	if (NULL == tm)
		return(NULL);

	/*
	 * Reserve space:
	 * up to 9 characters for the month (September) + blank
	 * up to 2 characters for the day + comma + blank
	 * 4 characters for the year and a terminating '\0'
	 */
	p = buf = mandoc_malloc(10 + 4 + 4 + 1);

	if (0 == (ssz = strftime(p, 10 + 1, "%B ", tm)))
		goto fail;
	p += (int)ssz;

	/*
	 * Print the day without padding, which is why it is not
	 * included in a strftime(3) format.  Treat truncation as
	 * failure, such that p never moves beyond the reserved space.
	 */
	isz = snprintf(p, 4 + 1, "%d, ", tm->tm_mday);
	if (isz < 0 || isz >= 4 + 1)
		goto fail;
	p += isz;

	if (0 == strftime(p, 4 + 1, "%Y", tm))
		goto fail;
	return(buf);

fail:
	free(buf);
	return(NULL);
}
498:
499: char *
1.25 schwarze 500: mandoc_normdate(struct mparse *parse, char *in, int ln, int pos)
1.5 schwarze 501: {
1.22 schwarze 502: char *out;
1.5 schwarze 503: time_t t;
504:
1.22 schwarze 505: if (NULL == in || '\0' == *in ||
506: 0 == strcmp(in, "$" "Mdocdate$")) {
1.49 schwarze 507: mandoc_msg(MANDOCERR_DATE_MISSING, parse, ln, pos, NULL);
1.22 schwarze 508: time(&t);
509: }
1.31 schwarze 510: else if (a2time(&t, "%Y-%m-%d", in))
511: t = 0;
1.22 schwarze 512: else if (!a2time(&t, "$" "Mdocdate: %b %d %Y $", in) &&
1.31 schwarze 513: !a2time(&t, "%b %d, %Y", in)) {
1.50 schwarze 514: mandoc_msg(MANDOCERR_DATE_BAD, parse, ln, pos, in);
1.22 schwarze 515: t = 0;
1.5 schwarze 516: }
1.22 schwarze 517: out = t ? time2a(t) : NULL;
1.23 schwarze 518: return(out ? out : mandoc_strdup(in));
1.5 schwarze 519: }
520:
/*
 * Decide whether the sz bytes starting at p end a sentence.
 * Scan backwards from the last byte: closing delimiters (quotes,
 * brackets, parentheses) and end-of-sentence punctuation are skipped
 * until some other byte or the start of the buffer is reached.
 * Return 1 if end-of-sentence punctuation was found and it either
 * was not followed by a closing delimiter or is directly preceded
 * by an alphanumeric character; return 0 otherwise.
 */
int
mandoc_eos(const char *p, size_t sz)
{
	size_t		 i;
	int		 enclosed, found;

	/*
	 * End-of-sentence recognition must include situations where
	 * some symbols, such as `)', allow prior EOS punctuation to
	 * propagate outward.
	 */

	enclosed = found = 0;

	/*
	 * Count i down from sz to 1 and look at p[i - 1], such that
	 * no pointer before the start of the buffer is ever formed.
	 * An empty buffer skips the loop and yields 0.
	 */
	for (i = sz; i > 0; i--) {
		switch (p[i - 1]) {
		case '\"':
			/* FALLTHROUGH */
		case '\'':
			/* FALLTHROUGH */
		case ']':
			/* FALLTHROUGH */
		case ')':
			/* Only delimiters after the punctuation count. */
			if (0 == found)
				enclosed = 1;
			break;
		case '.':
			/* FALLTHROUGH */
		case '!':
			/* FALLTHROUGH */
		case '?':
			found = 1;
			break;
		default:
			return(found && (!enclosed ||
			    isalnum((unsigned char)p[i - 1])));
		}
	}

	return(found && !enclosed);
}
1.26 schwarze 563:
/*
 * Convert the first sz bytes of p to an int with strtol(3),
 * using the given base.
 * Return -1 if sz exceeds 31 bytes, if the input is empty,
 * or if it is not completely consumed by the conversion.
 * Values outside the range of int are clamped to INT_MIN or INT_MAX.
 * Note that a return value of -1 can also result from the valid
 * input "-1".
 */
int
mandoc_strntoi(const char *p, size_t sz, int base)
{
	char		 digits[32];
	char		*stop;
	long		 val;

	/* Leave room for the terminating NUL. */
	if (sz >= sizeof(digits))
		return(-1);

	/* The input need not be terminated, so work on a copy. */
	memcpy(digits, p, sz);
	digits[sz] = '\0';

	errno = 0;
	val = strtol(digits, &stop, base);

	if ('\0' == digits[0] || '\0' != *stop)
		return(-1);

	if (val > INT_MAX)
		return(INT_MAX);
	if (val < INT_MIN)
		return(INT_MIN);

	return((int)val);
}