Annotation of src/usr.bin/mandoc/mandoc.c, Revision 1.47
1.47 ! schwarze 1: /* $Id: mandoc.c,v 1.46 2014/04/07 17:50:43 schwarze Exp $ */
1.1 schwarze 2: /*
1.24 schwarze 3: * Copyright (c) 2008, 2009, 2010, 2011 Kristaps Dzonsons <kristaps@bsd.lv>
1.47 ! schwarze 4: * Copyright (c) 2011, 2012, 2013, 2014 Ingo Schwarze <schwarze@openbsd.org>
1.1 schwarze 5: *
6: * Permission to use, copy, modify, and distribute this software for any
7: * purpose with or without fee is hereby granted, provided that the above
8: * copyright notice and this permission notice appear in all copies.
9: *
1.21 schwarze 10: * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHORS DISCLAIM ALL WARRANTIES
1.1 schwarze 11: * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
1.21 schwarze 12: * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR
1.1 schwarze 13: * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
14: * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
15: * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
16: * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
17: */
1.2 schwarze 18: #include <sys/types.h>
19:
1.1 schwarze 20: #include <assert.h>
21: #include <ctype.h>
1.26 schwarze 22: #include <errno.h>
23: #include <limits.h>
1.1 schwarze 24: #include <stdlib.h>
1.4 schwarze 25: #include <stdio.h>
26: #include <string.h>
1.5 schwarze 27: #include <time.h>
1.1 schwarze 28:
1.14 schwarze 29: #include "mandoc.h"
1.45 schwarze 30: #include "mandoc_aux.h"
1.1 schwarze 31: #include "libmandoc.h"
32:
/* Date buffer size constant; not referenced by the code visible in this file. */
#define	DATESIZE	  32

/* File-local date helpers, both called from mandoc_normdate(). */
static	int	 a2time(time_t *t, const char *fmt, const char *p);
static	char	*time2a(time_t t);
1.5 schwarze 37:
1.26 schwarze 38:
39: enum mandoc_esc
1.44 schwarze 40: mandoc_escape(const char **end, const char **start, int *sz)
1.26 schwarze 41: {
1.34 schwarze 42: const char *local_start;
43: int local_sz;
44: char term;
1.26 schwarze 45: enum mandoc_esc gly;
46:
1.34 schwarze 47: /*
48: * When the caller doesn't provide return storage,
49: * use local storage.
50: */
51:
52: if (NULL == start)
53: start = &local_start;
54: if (NULL == sz)
55: sz = &local_sz;
56:
57: /*
58: * Beyond the backslash, at least one input character
59: * is part of the escape sequence. With one exception
60: * (see below), that character won't be returned.
61: */
62:
1.26 schwarze 63: gly = ESCAPE_ERROR;
1.34 schwarze 64: *start = ++*end;
65: *sz = 0;
1.33 schwarze 66: term = '\0';
1.26 schwarze 67:
1.34 schwarze 68: switch ((*start)[-1]) {
1.26 schwarze 69: /*
70: * First the glyphs. There are several different forms of
71: * these, but each eventually returns a substring of the glyph
72: * name.
73: */
74: case ('('):
75: gly = ESCAPE_SPECIAL;
1.34 schwarze 76: *sz = 2;
1.26 schwarze 77: break;
78: case ('['):
79: gly = ESCAPE_SPECIAL;
80: /*
81: * Unicode escapes are defined in groff as \[uXXXX] to
82: * \[u10FFFF], where the contained value must be a valid
83: * Unicode codepoint. Here, however, only check whether
84: * it's not a zero-width escape.
85: */
1.34 schwarze 86: if ('u' == (*start)[0] && ']' != (*start)[1])
1.26 schwarze 87: gly = ESCAPE_UNICODE;
88: term = ']';
89: break;
90: case ('C'):
1.34 schwarze 91: if ('\'' != **start)
1.26 schwarze 92: return(ESCAPE_ERROR);
1.34 schwarze 93: *start = ++*end;
1.39 schwarze 94: if ('u' == (*start)[0] && '\'' != (*start)[1])
95: gly = ESCAPE_UNICODE;
96: else
97: gly = ESCAPE_SPECIAL;
1.26 schwarze 98: term = '\'';
99: break;
1.41 schwarze 100:
101: /*
102: * Escapes taking no arguments at all.
103: */
104: case ('d'):
105: /* FALLTHROUGH */
106: case ('u'):
107: return(ESCAPE_IGNORE);
1.32 schwarze 108:
109: /*
110: * The \z escape is supposed to output the following
111: * character without advancing the cursor position.
112: * Since we are mostly dealing with terminal mode,
113: * let us just skip the next character.
114: */
115: case ('z'):
116: return(ESCAPE_SKIPCHAR);
1.1 schwarze 117:
1.26 schwarze 118: /*
119: * Handle all triggers matching \X(xy, \Xx, and \X[xxxx], where
120: * 'X' is the trigger. These have opaque sub-strings.
121: */
122: case ('F'):
1.16 schwarze 123: /* FALLTHROUGH */
1.26 schwarze 124: case ('g'):
1.16 schwarze 125: /* FALLTHROUGH */
1.26 schwarze 126: case ('k'):
1.1 schwarze 127: /* FALLTHROUGH */
1.26 schwarze 128: case ('M'):
1.14 schwarze 129: /* FALLTHROUGH */
1.26 schwarze 130: case ('m'):
1.1 schwarze 131: /* FALLTHROUGH */
1.26 schwarze 132: case ('n'):
1.1 schwarze 133: /* FALLTHROUGH */
1.26 schwarze 134: case ('V'):
1.1 schwarze 135: /* FALLTHROUGH */
1.26 schwarze 136: case ('Y'):
1.29 schwarze 137: gly = ESCAPE_IGNORE;
1.1 schwarze 138: /* FALLTHROUGH */
1.26 schwarze 139: case ('f'):
140: if (ESCAPE_ERROR == gly)
141: gly = ESCAPE_FONT;
1.34 schwarze 142: switch (**start) {
1.26 schwarze 143: case ('('):
1.34 schwarze 144: *start = ++*end;
145: *sz = 2;
1.26 schwarze 146: break;
147: case ('['):
1.34 schwarze 148: *start = ++*end;
1.26 schwarze 149: term = ']';
150: break;
151: default:
1.34 schwarze 152: *sz = 1;
1.26 schwarze 153: break;
154: }
155: break;
156:
157: /*
158: * These escapes are of the form \X'Y', where 'X' is the trigger
159: * and 'Y' is any string. These have opaque sub-strings.
1.47 ! schwarze 160: * The \B and \w escapes are handled in roff.c, roff_res().
1.26 schwarze 161: */
162: case ('A'):
1.13 schwarze 163: /* FALLTHROUGH */
1.26 schwarze 164: case ('b'):
1.1 schwarze 165: /* FALLTHROUGH */
1.16 schwarze 166: case ('D'):
1.1 schwarze 167: /* FALLTHROUGH */
1.26 schwarze 168: case ('o'):
1.1 schwarze 169: /* FALLTHROUGH */
1.26 schwarze 170: case ('R'):
1.42 schwarze 171: /* FALLTHROUGH */
1.26 schwarze 172: case ('X'):
1.1 schwarze 173: /* FALLTHROUGH */
1.26 schwarze 174: case ('Z'):
1.46 schwarze 175: if ('\0' == **start)
1.26 schwarze 176: return(ESCAPE_ERROR);
177: gly = ESCAPE_IGNORE;
1.46 schwarze 178: term = **start;
1.34 schwarze 179: *start = ++*end;
1.16 schwarze 180: break;
1.26 schwarze 181:
182: /*
183: * These escapes are of the form \X'N', where 'X' is the trigger
184: * and 'N' resolves to a numerical expression.
185: */
1.17 schwarze 186: case ('h'):
187: /* FALLTHROUGH */
1.26 schwarze 188: case ('H'):
189: /* FALLTHROUGH */
190: case ('L'):
191: /* FALLTHROUGH */
192: case ('l'):
193: /* FALLTHROUGH */
194: case ('S'):
195: /* FALLTHROUGH */
1.17 schwarze 196: case ('v'):
197: /* FALLTHROUGH */
1.26 schwarze 198: case ('x'):
1.46 schwarze 199: if (strchr("\0 %&()*+-./0123456789:<=>", **start))
1.34 schwarze 200: return(ESCAPE_ERROR);
1.42 schwarze 201: gly = ESCAPE_IGNORE;
1.46 schwarze 202: term = **start;
1.34 schwarze 203: *start = ++*end;
1.26 schwarze 204: break;
1.29 schwarze 205:
206: /*
207: * Special handling for the numbered character escape.
208: * XXX Do any other escapes need similar handling?
209: */
210: case ('N'):
1.34 schwarze 211: if ('\0' == **start)
1.29 schwarze 212: return(ESCAPE_ERROR);
1.34 schwarze 213: (*end)++;
214: if (isdigit((unsigned char)**start)) {
215: *sz = 1;
1.29 schwarze 216: return(ESCAPE_IGNORE);
1.34 schwarze 217: }
218: (*start)++;
1.29 schwarze 219: while (isdigit((unsigned char)**end))
220: (*end)++;
1.34 schwarze 221: *sz = *end - *start;
1.29 schwarze 222: if ('\0' != **end)
223: (*end)++;
224: return(ESCAPE_NUMBERED);
1.26 schwarze 225:
226: /*
227: * Sizes get a special category of their own.
228: */
1.6 schwarze 229: case ('s'):
1.26 schwarze 230: gly = ESCAPE_IGNORE;
1.17 schwarze 231:
1.26 schwarze 232: /* See +/- counts as a sign. */
1.34 schwarze 233: if ('+' == **end || '-' == **end || ASCII_HYPH == **end)
234: (*end)++;
1.6 schwarze 235:
1.34 schwarze 236: switch (**end) {
1.16 schwarze 237: case ('('):
1.34 schwarze 238: *start = ++*end;
239: *sz = 2;
1.16 schwarze 240: break;
241: case ('['):
1.34 schwarze 242: *start = ++*end;
1.33 schwarze 243: term = ']';
1.16 schwarze 244: break;
245: case ('\''):
1.34 schwarze 246: *start = ++*end;
1.33 schwarze 247: term = '\'';
1.16 schwarze 248: break;
249: default:
1.34 schwarze 250: *sz = 1;
1.16 schwarze 251: break;
1.6 schwarze 252: }
253:
1.26 schwarze 254: break;
255:
256: /*
257: * Anything else is assumed to be a glyph.
1.34 schwarze 258: * In this case, pass back the character after the backslash.
1.26 schwarze 259: */
260: default:
261: gly = ESCAPE_SPECIAL;
1.34 schwarze 262: *start = --*end;
263: *sz = 1;
1.26 schwarze 264: break;
265: }
266:
267: assert(ESCAPE_ERROR != gly);
268:
269: /*
1.33 schwarze 270: * Read up to the terminating character,
271: * paying attention to nested escapes.
1.26 schwarze 272: */
273:
274: if ('\0' != term) {
1.33 schwarze 275: while (**end != term) {
276: switch (**end) {
277: case ('\0'):
278: return(ESCAPE_ERROR);
279: case ('\\'):
280: (*end)++;
281: if (ESCAPE_ERROR ==
282: mandoc_escape(end, NULL, NULL))
283: return(ESCAPE_ERROR);
284: break;
285: default:
286: (*end)++;
287: break;
288: }
289: }
1.34 schwarze 290: *sz = (*end)++ - *start;
1.33 schwarze 291: } else {
1.34 schwarze 292: assert(*sz > 0);
293: if ((size_t)*sz > strlen(*start))
1.26 schwarze 294: return(ESCAPE_ERROR);
1.34 schwarze 295: *end += *sz;
1.26 schwarze 296: }
1.19 schwarze 297:
1.26 schwarze 298: /* Run post-processors. */
1.19 schwarze 299:
1.26 schwarze 300: switch (gly) {
301: case (ESCAPE_FONT):
1.37 schwarze 302: if (2 == *sz) {
303: if ('C' == **start) {
304: /*
305: * Treat constant-width font modes
306: * just like regular font modes.
307: */
308: (*start)++;
309: (*sz)--;
310: } else {
311: if ('B' == (*start)[0] && 'I' == (*start)[1])
312: gly = ESCAPE_FONTBI;
313: break;
314: }
1.34 schwarze 315: } else if (1 != *sz)
1.26 schwarze 316: break;
1.30 schwarze 317:
1.34 schwarze 318: switch (**start) {
1.26 schwarze 319: case ('3'):
320: /* FALLTHROUGH */
321: case ('B'):
322: gly = ESCAPE_FONTBOLD;
323: break;
324: case ('2'):
325: /* FALLTHROUGH */
326: case ('I'):
327: gly = ESCAPE_FONTITALIC;
1.16 schwarze 328: break;
1.26 schwarze 329: case ('P'):
330: gly = ESCAPE_FONTPREV;
1.16 schwarze 331: break;
1.26 schwarze 332: case ('1'):
333: /* FALLTHROUGH */
334: case ('R'):
335: gly = ESCAPE_FONTROMAN;
1.1 schwarze 336: break;
337: }
1.16 schwarze 338: break;
1.26 schwarze 339: case (ESCAPE_SPECIAL):
1.34 schwarze 340: if (1 == *sz && 'c' == **start)
1.26 schwarze 341: gly = ESCAPE_NOSPACE;
1.16 schwarze 342: break;
1.1 schwarze 343: default:
1.16 schwarze 344: break;
1.1 schwarze 345: }
346:
1.26 schwarze 347: return(gly);
1.21 schwarze 348: }
349:
350: /*
351: * Parse a quoted or unquoted roff-style request or macro argument.
352: * Return a pointer to the parsed argument, which is either the original
353: * pointer or advanced by one byte in case the argument is quoted.
1.40 schwarze 354: * NUL-terminate the argument in place.
1.21 schwarze 355: * Collapse pairs of quotes inside quoted arguments.
356: * Advance the argument pointer to the next argument,
1.40 schwarze 357: * or to the NUL byte terminating the argument line.
1.21 schwarze 358: */
359: char *
1.25 schwarze 360: mandoc_getarg(struct mparse *parse, char **cpp, int ln, int *pos)
1.21 schwarze 361: {
362: char *start, *cp;
363: int quoted, pairs, white;
364:
365: /* Quoting can only start with a new word. */
366: start = *cpp;
1.26 schwarze 367: quoted = 0;
1.21 schwarze 368: if ('"' == *start) {
369: quoted = 1;
370: start++;
1.26 schwarze 371: }
1.21 schwarze 372:
373: pairs = 0;
374: white = 0;
375: for (cp = start; '\0' != *cp; cp++) {
1.36 schwarze 376:
377: /*
378: * Move the following text left
379: * after quoted quotes and after "\\" and "\t".
380: */
1.21 schwarze 381: if (pairs)
382: cp[-pairs] = cp[0];
1.36 schwarze 383:
1.21 schwarze 384: if ('\\' == cp[0]) {
1.36 schwarze 385: /*
386: * In copy mode, translate double to single
387: * backslashes and backslash-t to literal tabs.
388: */
389: switch (cp[1]) {
390: case ('t'):
391: cp[0] = '\t';
392: /* FALLTHROUGH */
393: case ('\\'):
1.21 schwarze 394: pairs++;
395: cp++;
1.36 schwarze 396: break;
397: case (' '):
1.21 schwarze 398: /* Skip escaped blanks. */
1.36 schwarze 399: if (0 == quoted)
400: cp++;
401: break;
402: default:
403: break;
404: }
1.21 schwarze 405: } else if (0 == quoted) {
406: if (' ' == cp[0]) {
407: /* Unescaped blanks end unquoted args. */
408: white = 1;
409: break;
410: }
411: } else if ('"' == cp[0]) {
412: if ('"' == cp[1]) {
413: /* Quoted quotes collapse. */
414: pairs++;
415: cp++;
416: } else {
417: /* Unquoted quotes end quoted args. */
418: quoted = 2;
419: break;
420: }
421: }
422: }
423:
424: /* Quoted argument without a closing quote. */
1.25 schwarze 425: if (1 == quoted)
426: mandoc_msg(MANDOCERR_BADQUOTE, parse, ln, *pos, NULL);
1.21 schwarze 427:
1.40 schwarze 428: /* NUL-terminate this argument and move to the next one. */
1.21 schwarze 429: if (pairs)
430: cp[-pairs] = '\0';
431: if ('\0' != *cp) {
432: *cp++ = '\0';
433: while (' ' == *cp)
434: cp++;
435: }
1.24 schwarze 436: *pos += (int)(cp - start) + (quoted ? 1 : 0);
1.21 schwarze 437: *cpp = cp;
438:
1.25 schwarze 439: if ('\0' == *cp && (white || ' ' == cp[-1]))
440: mandoc_msg(MANDOCERR_EOLNSPACE, parse, ln, *pos, NULL);
1.21 schwarze 441:
442: return(start);
1.4 schwarze 443: }
1.5 schwarze 444:
/*
 * Parse the string p according to the strptime(3) format fmt.
 * Only succeed if the format consumes the complete string:
 * in that case, store the mktime(3) result in *t and return 1.
 * Otherwise, return 0 without touching *t.
 */
static int
a2time(time_t *t, const char *fmt, const char *p)
{
	struct tm	 broken;
	const char	*rest;

	/* Fields not set by the format must be zero for mktime(3). */
	memset(&broken, 0, sizeof(broken));

	rest = strptime(p, fmt, &broken);
	if (rest == NULL || *rest != '\0')
		return(0);

	*t = mktime(&broken);
	return(1);
}
461:
/*
 * Format the time t in the form "Month D, YYYY".
 * Return a buffer obtained from mandoc_malloc(), or NULL if the
 * time cannot be converted or does not fit the reserved space.
 */
static char *
time2a(time_t t)
{
	struct tm	*tm;
	char		*buf, *p;
	size_t		 ssz;
	int		 isz;

	/* localtime(3) returns NULL for times it cannot represent. */
	tm = localtime(&t);
	if (NULL == tm)
		return(NULL);

	/*
	 * Reserve space:
	 * up to 9 characters for the month (September) + blank
	 * up to 2 characters for the day + comma + blank
	 * 4 characters for the year and a terminating '\0'
	 */
	p = buf = mandoc_malloc(10 + 4 + 4 + 1);

	if (0 == (ssz = strftime(p, 10 + 1, "%B ", tm)))
		goto fail;
	p += (int)ssz;

	/*
	 * snprintf(3) returns the length that would have been
	 * written, so also reject truncated output before
	 * advancing the write position.
	 */
	isz = snprintf(p, 4 + 1, "%d, ", tm->tm_mday);
	if (isz < 0 || isz > 4)
		goto fail;
	p += isz;

	if (0 == strftime(p, 4 + 1, "%Y", tm))
		goto fail;
	return(buf);

fail:
	free(buf);
	return(NULL);
}
496:
497: char *
1.25 schwarze 498: mandoc_normdate(struct mparse *parse, char *in, int ln, int pos)
1.5 schwarze 499: {
1.22 schwarze 500: char *out;
1.5 schwarze 501: time_t t;
502:
1.22 schwarze 503: if (NULL == in || '\0' == *in ||
504: 0 == strcmp(in, "$" "Mdocdate$")) {
1.25 schwarze 505: mandoc_msg(MANDOCERR_NODATE, parse, ln, pos, NULL);
1.22 schwarze 506: time(&t);
507: }
1.31 schwarze 508: else if (a2time(&t, "%Y-%m-%d", in))
509: t = 0;
1.22 schwarze 510: else if (!a2time(&t, "$" "Mdocdate: %b %d %Y $", in) &&
1.31 schwarze 511: !a2time(&t, "%b %d, %Y", in)) {
1.25 schwarze 512: mandoc_msg(MANDOCERR_BADDATE, parse, ln, pos, NULL);
1.22 schwarze 513: t = 0;
1.5 schwarze 514: }
1.22 schwarze 515: out = t ? time2a(t) : NULL;
1.23 schwarze 516: return(out ? out : mandoc_strdup(in));
1.5 schwarze 517: }
518:
/*
 * Decide whether the sz bytes at p end a sentence.
 * Scan backwards from the last byte: '.', '!', and '?' count as
 * end-of-sentence punctuation, while closing delimiters (double
 * quote, single quote, ']' and ')') let earlier punctuation
 * propagate outward.  The first other byte settles the matter:
 * punctuation must have been seen, and if closing delimiters
 * followed it, that byte must be alphanumeric.
 * Return 1 at the end of a sentence, 0 otherwise.
 */
int
mandoc_eos(const char *p, size_t sz)
{
	size_t		 left;
	unsigned char	 ch;
	int		 closer_seen, punct_seen;

	closer_seen = 0;
	punct_seen = 0;

	for (left = sz; left > 0; left--) {
		ch = (unsigned char)p[left - 1];

		if (ch == '.' || ch == '!' || ch == '?') {
			punct_seen = 1;
			continue;
		}

		if (ch == '"' || ch == '\'' || ch == ']' || ch == ')') {
			/* Only delimiters after the punctuation matter. */
			if ( ! punct_seen)
				closer_seen = 1;
			continue;
		}

		/* Any other byte ends the scan. */
		if ( ! punct_seen)
			return(0);
		return(( ! closer_seen || isalnum(ch)) ? 1 : 0);
	}

	/* Nothing but punctuation and delimiters, or no input at all. */
	return(punct_seen && ! closer_seen);
}
1.26 schwarze 561:
/*
 * Convert the first sz bytes of p to an integer in the given base.
 * Return -1 if there are more than 31 bytes, if the input is empty,
 * or if strtol(3) does not consume all of it.
 * Results outside the range of int are clamped to INT_MIN or INT_MAX.
 */
int
mandoc_strntoi(const char *p, size_t sz, int base)
{
	char	 digits[32];
	char	*stop;
	long	 val;

	/* One byte is needed for the terminating NUL. */
	if (sz >= sizeof(digits))
		return(-1);

	memcpy(digits, p, sz);
	digits[sz] = '\0';

	errno = 0;
	val = strtol(digits, &stop, base);

	if (digits[0] == '\0' || *stop != '\0')
		return(-1);

	if (val > INT_MAX)
		return(INT_MAX);
	if (val < INT_MIN)
		return(INT_MIN);

	return((int)val);
}