Annotation of src/usr.bin/mandoc/mandoc.c, Revision 1.38
1.38 ! schwarze 1: /* $Id: mandoc.c,v 1.37 2013/08/08 20:07:24 schwarze Exp $ */
1.1 schwarze 2: /*
1.24 schwarze 3: * Copyright (c) 2008, 2009, 2010, 2011 Kristaps Dzonsons <kristaps@bsd.lv>
1.37 schwarze 4: * Copyright (c) 2011, 2012, 2013 Ingo Schwarze <schwarze@openbsd.org>
1.1 schwarze 5: *
6: * Permission to use, copy, modify, and distribute this software for any
7: * purpose with or without fee is hereby granted, provided that the above
8: * copyright notice and this permission notice appear in all copies.
9: *
1.21 schwarze 10: * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHORS DISCLAIM ALL WARRANTIES
1.1 schwarze 11: * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
1.21 schwarze 12: * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR
1.1 schwarze 13: * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
14: * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
15: * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
16: * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
17: */
1.2 schwarze 18: #include <sys/types.h>
19:
1.1 schwarze 20: #include <assert.h>
21: #include <ctype.h>
1.26 schwarze 22: #include <errno.h>
23: #include <limits.h>
1.1 schwarze 24: #include <stdlib.h>
1.4 schwarze 25: #include <stdio.h>
26: #include <string.h>
1.5 schwarze 27: #include <time.h>
1.1 schwarze 28:
1.14 schwarze 29: #include "mandoc.h"
1.1 schwarze 30: #include "libmandoc.h"
31:
1.22 schwarze 32: #define DATESIZE 32
33:
1.14 schwarze 34: static int a2time(time_t *, const char *, const char *);
1.22 schwarze 35: static char *time2a(time_t);
1.5 schwarze 36:
1.26 schwarze 37:
38: enum mandoc_esc
1.38 ! schwarze 39: mandoc_escape(const char const **end, const char const **start, int *sz)
1.26 schwarze 40: {
1.34 schwarze 41: const char *local_start;
42: int local_sz;
43: char term;
1.26 schwarze 44: enum mandoc_esc gly;
45:
1.34 schwarze 46: /*
47: * When the caller doesn't provide return storage,
48: * use local storage.
49: */
50:
51: if (NULL == start)
52: start = &local_start;
53: if (NULL == sz)
54: sz = &local_sz;
55:
56: /*
57: * Beyond the backslash, at least one input character
58: * is part of the escape sequence. With one exception
59: * (see below), that character won't be returned.
60: */
61:
1.26 schwarze 62: gly = ESCAPE_ERROR;
1.34 schwarze 63: *start = ++*end;
64: *sz = 0;
1.33 schwarze 65: term = '\0';
1.26 schwarze 66:
1.34 schwarze 67: switch ((*start)[-1]) {
1.26 schwarze 68: /*
69: * First the glyphs. There are several different forms of
70: * these, but each eventually returns a substring of the glyph
71: * name.
72: */
73: case ('('):
74: gly = ESCAPE_SPECIAL;
1.34 schwarze 75: *sz = 2;
1.26 schwarze 76: break;
77: case ('['):
78: gly = ESCAPE_SPECIAL;
79: /*
80: * Unicode escapes are defined in groff as \[uXXXX] to
81: * \[u10FFFF], where the contained value must be a valid
82: * Unicode codepoint. Here, however, only check whether
83: * it's not a zero-width escape.
84: */
1.34 schwarze 85: if ('u' == (*start)[0] && ']' != (*start)[1])
1.26 schwarze 86: gly = ESCAPE_UNICODE;
87: term = ']';
88: break;
89: case ('C'):
1.34 schwarze 90: if ('\'' != **start)
1.26 schwarze 91: return(ESCAPE_ERROR);
92: gly = ESCAPE_SPECIAL;
1.34 schwarze 93: *start = ++*end;
1.26 schwarze 94: term = '\'';
95: break;
1.32 schwarze 96:
97: /*
98: * The \z escape is supposed to output the following
99: * character without advancing the cursor position.
100: * Since we are mostly dealing with terminal mode,
101: * let us just skip the next character.
102: */
103: case ('z'):
104: return(ESCAPE_SKIPCHAR);
1.1 schwarze 105:
1.26 schwarze 106: /*
107: * Handle all triggers matching \X(xy, \Xx, and \X[xxxx], where
108: * 'X' is the trigger. These have opaque sub-strings.
109: */
110: case ('F'):
1.16 schwarze 111: /* FALLTHROUGH */
1.26 schwarze 112: case ('g'):
1.16 schwarze 113: /* FALLTHROUGH */
1.26 schwarze 114: case ('k'):
1.1 schwarze 115: /* FALLTHROUGH */
1.26 schwarze 116: case ('M'):
1.14 schwarze 117: /* FALLTHROUGH */
1.26 schwarze 118: case ('m'):
1.1 schwarze 119: /* FALLTHROUGH */
1.26 schwarze 120: case ('n'):
1.1 schwarze 121: /* FALLTHROUGH */
1.26 schwarze 122: case ('V'):
1.1 schwarze 123: /* FALLTHROUGH */
1.26 schwarze 124: case ('Y'):
1.29 schwarze 125: gly = ESCAPE_IGNORE;
1.1 schwarze 126: /* FALLTHROUGH */
1.26 schwarze 127: case ('f'):
128: if (ESCAPE_ERROR == gly)
129: gly = ESCAPE_FONT;
1.34 schwarze 130: switch (**start) {
1.26 schwarze 131: case ('('):
1.34 schwarze 132: *start = ++*end;
133: *sz = 2;
1.26 schwarze 134: break;
135: case ('['):
1.34 schwarze 136: *start = ++*end;
1.26 schwarze 137: term = ']';
138: break;
139: default:
1.34 schwarze 140: *sz = 1;
1.26 schwarze 141: break;
142: }
143: break;
144:
145: /*
146: * These escapes are of the form \X'Y', where 'X' is the trigger
147: * and 'Y' is any string. These have opaque sub-strings.
148: */
149: case ('A'):
1.13 schwarze 150: /* FALLTHROUGH */
1.26 schwarze 151: case ('b'):
1.1 schwarze 152: /* FALLTHROUGH */
1.16 schwarze 153: case ('D'):
1.1 schwarze 154: /* FALLTHROUGH */
1.26 schwarze 155: case ('o'):
1.1 schwarze 156: /* FALLTHROUGH */
1.26 schwarze 157: case ('R'):
1.1 schwarze 158: /* FALLTHROUGH */
1.26 schwarze 159: case ('X'):
1.1 schwarze 160: /* FALLTHROUGH */
1.26 schwarze 161: case ('Z'):
1.34 schwarze 162: if ('\'' != **start)
1.26 schwarze 163: return(ESCAPE_ERROR);
164: gly = ESCAPE_IGNORE;
1.34 schwarze 165: *start = ++*end;
1.16 schwarze 166: term = '\'';
167: break;
1.26 schwarze 168:
169: /*
170: * These escapes are of the form \X'N', where 'X' is the trigger
171: * and 'N' resolves to a numerical expression.
172: */
173: case ('B'):
174: /* FALLTHROUGH */
1.17 schwarze 175: case ('h'):
176: /* FALLTHROUGH */
1.26 schwarze 177: case ('H'):
178: /* FALLTHROUGH */
179: case ('L'):
180: /* FALLTHROUGH */
181: case ('l'):
1.29 schwarze 182: gly = ESCAPE_NUMBERED;
1.26 schwarze 183: /* FALLTHROUGH */
184: case ('S'):
185: /* FALLTHROUGH */
1.17 schwarze 186: case ('v'):
187: /* FALLTHROUGH */
1.26 schwarze 188: case ('w'):
189: /* FALLTHROUGH */
190: case ('x'):
1.34 schwarze 191: if ('\'' != **start)
192: return(ESCAPE_ERROR);
1.26 schwarze 193: if (ESCAPE_ERROR == gly)
194: gly = ESCAPE_IGNORE;
1.34 schwarze 195: *start = ++*end;
1.33 schwarze 196: term = '\'';
1.26 schwarze 197: break;
1.29 schwarze 198:
199: /*
200: * Special handling for the numbered character escape.
201: * XXX Do any other escapes need similar handling?
202: */
203: case ('N'):
1.34 schwarze 204: if ('\0' == **start)
1.29 schwarze 205: return(ESCAPE_ERROR);
1.34 schwarze 206: (*end)++;
207: if (isdigit((unsigned char)**start)) {
208: *sz = 1;
1.29 schwarze 209: return(ESCAPE_IGNORE);
1.34 schwarze 210: }
211: (*start)++;
1.29 schwarze 212: while (isdigit((unsigned char)**end))
213: (*end)++;
1.34 schwarze 214: *sz = *end - *start;
1.29 schwarze 215: if ('\0' != **end)
216: (*end)++;
217: return(ESCAPE_NUMBERED);
1.26 schwarze 218:
219: /*
220: * Sizes get a special category of their own.
221: */
1.6 schwarze 222: case ('s'):
1.26 schwarze 223: gly = ESCAPE_IGNORE;
1.17 schwarze 224:
1.26 schwarze 225: /* See +/- counts as a sign. */
1.34 schwarze 226: if ('+' == **end || '-' == **end || ASCII_HYPH == **end)
227: (*end)++;
1.6 schwarze 228:
1.34 schwarze 229: switch (**end) {
1.16 schwarze 230: case ('('):
1.34 schwarze 231: *start = ++*end;
232: *sz = 2;
1.16 schwarze 233: break;
234: case ('['):
1.34 schwarze 235: *start = ++*end;
1.33 schwarze 236: term = ']';
1.16 schwarze 237: break;
238: case ('\''):
1.34 schwarze 239: *start = ++*end;
1.33 schwarze 240: term = '\'';
1.16 schwarze 241: break;
242: default:
1.34 schwarze 243: *sz = 1;
1.16 schwarze 244: break;
1.6 schwarze 245: }
246:
1.26 schwarze 247: break;
248:
249: /*
250: * Anything else is assumed to be a glyph.
1.34 schwarze 251: * In this case, pass back the character after the backslash.
1.26 schwarze 252: */
253: default:
254: gly = ESCAPE_SPECIAL;
1.34 schwarze 255: *start = --*end;
256: *sz = 1;
1.26 schwarze 257: break;
258: }
259:
260: assert(ESCAPE_ERROR != gly);
261:
262: /*
1.33 schwarze 263: * Read up to the terminating character,
264: * paying attention to nested escapes.
1.26 schwarze 265: */
266:
267: if ('\0' != term) {
1.33 schwarze 268: while (**end != term) {
269: switch (**end) {
270: case ('\0'):
271: return(ESCAPE_ERROR);
272: case ('\\'):
273: (*end)++;
274: if (ESCAPE_ERROR ==
275: mandoc_escape(end, NULL, NULL))
276: return(ESCAPE_ERROR);
277: break;
278: default:
279: (*end)++;
280: break;
281: }
282: }
1.34 schwarze 283: *sz = (*end)++ - *start;
1.33 schwarze 284: } else {
1.34 schwarze 285: assert(*sz > 0);
286: if ((size_t)*sz > strlen(*start))
1.26 schwarze 287: return(ESCAPE_ERROR);
1.34 schwarze 288: *end += *sz;
1.26 schwarze 289: }
1.19 schwarze 290:
1.26 schwarze 291: /* Run post-processors. */
1.19 schwarze 292:
1.26 schwarze 293: switch (gly) {
294: case (ESCAPE_FONT):
1.37 schwarze 295: if (2 == *sz) {
296: if ('C' == **start) {
297: /*
298: * Treat constant-width font modes
299: * just like regular font modes.
300: */
301: (*start)++;
302: (*sz)--;
303: } else {
304: if ('B' == (*start)[0] && 'I' == (*start)[1])
305: gly = ESCAPE_FONTBI;
306: break;
307: }
1.34 schwarze 308: } else if (1 != *sz)
1.26 schwarze 309: break;
1.30 schwarze 310:
1.34 schwarze 311: switch (**start) {
1.26 schwarze 312: case ('3'):
313: /* FALLTHROUGH */
314: case ('B'):
315: gly = ESCAPE_FONTBOLD;
316: break;
317: case ('2'):
318: /* FALLTHROUGH */
319: case ('I'):
320: gly = ESCAPE_FONTITALIC;
1.16 schwarze 321: break;
1.26 schwarze 322: case ('P'):
323: gly = ESCAPE_FONTPREV;
1.16 schwarze 324: break;
1.26 schwarze 325: case ('1'):
326: /* FALLTHROUGH */
327: case ('R'):
328: gly = ESCAPE_FONTROMAN;
1.1 schwarze 329: break;
330: }
1.16 schwarze 331: break;
1.26 schwarze 332: case (ESCAPE_SPECIAL):
1.34 schwarze 333: if (1 == *sz && 'c' == **start)
1.26 schwarze 334: gly = ESCAPE_NOSPACE;
1.16 schwarze 335: break;
1.1 schwarze 336: default:
1.16 schwarze 337: break;
1.1 schwarze 338: }
339:
1.26 schwarze 340: return(gly);
1.1 schwarze 341: }
342:
1.4 schwarze 343: void *
344: mandoc_calloc(size_t num, size_t size)
345: {
346: void *ptr;
347:
348: ptr = calloc(num, size);
349: if (NULL == ptr) {
350: perror(NULL);
1.20 schwarze 351: exit((int)MANDOCLEVEL_SYSERR);
1.4 schwarze 352: }
353:
354: return(ptr);
355: }
356:
357:
358: void *
359: mandoc_malloc(size_t size)
360: {
361: void *ptr;
362:
363: ptr = malloc(size);
364: if (NULL == ptr) {
365: perror(NULL);
1.20 schwarze 366: exit((int)MANDOCLEVEL_SYSERR);
1.4 schwarze 367: }
368:
369: return(ptr);
370: }
371:
372:
373: void *
374: mandoc_realloc(void *ptr, size_t size)
375: {
376:
377: ptr = realloc(ptr, size);
378: if (NULL == ptr) {
379: perror(NULL);
1.20 schwarze 380: exit((int)MANDOCLEVEL_SYSERR);
1.4 schwarze 381: }
382:
383: return(ptr);
384: }
385:
/*
 * Return a newly allocated, NUL-terminated copy of the first
 * sz bytes at ptr.  Exactly sz bytes are copied with memcpy(3),
 * so the input must provide at least that many bytes.
 * Allocation goes through mandoc_malloc().
 */
char *
mandoc_strndup(const char *ptr, size_t sz)
{
	char	*p;

	p = mandoc_malloc(sz + 1);
	memcpy(p, ptr, sz);
	/* Index with the size_t itself; narrowing to int could truncate. */
	p[sz] = '\0';
	return(p);
}
1.4 schwarze 396:
397: char *
398: mandoc_strdup(const char *ptr)
399: {
400: char *p;
401:
402: p = strdup(ptr);
403: if (NULL == p) {
404: perror(NULL);
1.20 schwarze 405: exit((int)MANDOCLEVEL_SYSERR);
1.4 schwarze 406: }
407:
408: return(p);
1.21 schwarze 409: }
410:
411: /*
412: * Parse a quoted or unquoted roff-style request or macro argument.
413: * Return a pointer to the parsed argument, which is either the original
414: * pointer or advanced by one byte in case the argument is quoted.
415: * Null-terminate the argument in place.
416: * Collapse pairs of quotes inside quoted arguments.
417: * Advance the argument pointer to the next argument,
418: * or to the null byte terminating the argument line.
419: */
420: char *
1.25 schwarze 421: mandoc_getarg(struct mparse *parse, char **cpp, int ln, int *pos)
1.21 schwarze 422: {
423: char *start, *cp;
424: int quoted, pairs, white;
425:
426: /* Quoting can only start with a new word. */
427: start = *cpp;
1.26 schwarze 428: quoted = 0;
1.21 schwarze 429: if ('"' == *start) {
430: quoted = 1;
431: start++;
1.26 schwarze 432: }
1.21 schwarze 433:
434: pairs = 0;
435: white = 0;
436: for (cp = start; '\0' != *cp; cp++) {
1.36 schwarze 437:
438: /*
439: * Move the following text left
440: * after quoted quotes and after "\\" and "\t".
441: */
1.21 schwarze 442: if (pairs)
443: cp[-pairs] = cp[0];
1.36 schwarze 444:
1.21 schwarze 445: if ('\\' == cp[0]) {
1.36 schwarze 446: /*
447: * In copy mode, translate double to single
448: * backslashes and backslash-t to literal tabs.
449: */
450: switch (cp[1]) {
451: case ('t'):
452: cp[0] = '\t';
453: /* FALLTHROUGH */
454: case ('\\'):
1.21 schwarze 455: pairs++;
456: cp++;
1.36 schwarze 457: break;
458: case (' '):
1.21 schwarze 459: /* Skip escaped blanks. */
1.36 schwarze 460: if (0 == quoted)
461: cp++;
462: break;
463: default:
464: break;
465: }
1.21 schwarze 466: } else if (0 == quoted) {
467: if (' ' == cp[0]) {
468: /* Unescaped blanks end unquoted args. */
469: white = 1;
470: break;
471: }
472: } else if ('"' == cp[0]) {
473: if ('"' == cp[1]) {
474: /* Quoted quotes collapse. */
475: pairs++;
476: cp++;
477: } else {
478: /* Unquoted quotes end quoted args. */
479: quoted = 2;
480: break;
481: }
482: }
483: }
484:
485: /* Quoted argument without a closing quote. */
1.25 schwarze 486: if (1 == quoted)
487: mandoc_msg(MANDOCERR_BADQUOTE, parse, ln, *pos, NULL);
1.21 schwarze 488:
489: /* Null-terminate this argument and move to the next one. */
490: if (pairs)
491: cp[-pairs] = '\0';
492: if ('\0' != *cp) {
493: *cp++ = '\0';
494: while (' ' == *cp)
495: cp++;
496: }
1.24 schwarze 497: *pos += (int)(cp - start) + (quoted ? 1 : 0);
1.21 schwarze 498: *cpp = cp;
499:
1.25 schwarze 500: if ('\0' == *cp && (white || ' ' == cp[-1]))
501: mandoc_msg(MANDOCERR_EOLNSPACE, parse, ln, *pos, NULL);
1.21 schwarze 502:
503: return(start);
1.4 schwarze 504: }
1.5 schwarze 505:
/*
 * Parse the string p according to the strptime(3) format fmt.
 * The whole string must be consumed by the format.
 * On success, store the mktime(3) result in *t and return 1;
 * otherwise return 0 and leave *t untouched.
 */
static int
a2time(time_t *t, const char *fmt, const char *p)
{
	struct tm	 parsed;
	const char	*rest;

	memset(&parsed, 0, sizeof(struct tm));

	rest = strptime(p, fmt, &parsed);
	if (NULL == rest || '\0' != *rest)
		return(0);

	*t = mktime(&parsed);
	return(1);
}
522:
/*
 * Format the time t in local time as "Month day, year".
 * Returns a newly allocated string that the caller must free(3),
 * or NULL if the time cannot be converted or formatted.
 */
static char *
time2a(time_t t)
{
	struct tm	*tm;
	char		*buf, *p;
	size_t		 ssz;
	int		 isz;

	/* localtime(3) may fail; do not hand NULL to strftime(3). */
	tm = localtime(&t);
	if (NULL == tm)
		return(NULL);

	/*
	 * Reserve space:
	 * up to 9 characters for the month (September) + blank
	 * up to 2 characters for the day + comma + blank
	 * 4 characters for the year and a terminating '\0'
	 */
	p = buf = mandoc_malloc(10 + 4 + 4 + 1);

	if (0 == (ssz = strftime(p, 10 + 1, "%B ", tm)))
		goto fail;
	p += ssz;

	/*
	 * snprintf(3) returns the length it wanted to write;
	 * treat truncation like an error instead of advancing
	 * the pointer beyond what was actually written.
	 */
	isz = snprintf(p, 4 + 1, "%d, ", tm->tm_mday);
	if (isz < 0 || isz >= 4 + 1)
		goto fail;
	p += isz;

	if (0 == strftime(p, 4 + 1, "%Y", tm))
		goto fail;
	return(buf);

fail:
	free(buf);
	return(NULL);
}
557:
558: char *
1.25 schwarze 559: mandoc_normdate(struct mparse *parse, char *in, int ln, int pos)
1.5 schwarze 560: {
1.22 schwarze 561: char *out;
1.5 schwarze 562: time_t t;
563:
1.22 schwarze 564: if (NULL == in || '\0' == *in ||
565: 0 == strcmp(in, "$" "Mdocdate$")) {
1.25 schwarze 566: mandoc_msg(MANDOCERR_NODATE, parse, ln, pos, NULL);
1.22 schwarze 567: time(&t);
568: }
1.31 schwarze 569: else if (a2time(&t, "%Y-%m-%d", in))
570: t = 0;
1.22 schwarze 571: else if (!a2time(&t, "$" "Mdocdate: %b %d %Y $", in) &&
1.31 schwarze 572: !a2time(&t, "%b %d, %Y", in)) {
1.25 schwarze 573: mandoc_msg(MANDOCERR_BADDATE, parse, ln, pos, NULL);
1.22 schwarze 574: t = 0;
1.5 schwarze 575: }
1.22 schwarze 576: out = t ? time2a(t) : NULL;
1.23 schwarze 577: return(out ? out : mandoc_strdup(in));
1.5 schwarze 578: }
579:
/*
 * Decide whether the sz bytes at p end with end-of-sentence
 * punctuation.  The buffer is scanned backwards from its last byte.
 *
 * "enclosed" is the initial enclosure state; it is forced to 1 when a
 * closing delimiter (double quote, single quote, ']' or ')') is seen
 * before any '.', '!' or '?' has been found.
 *
 * Returns 1 if sentence-ending punctuation was found and either the
 * text is not enclosed or the first other character reached is
 * alphanumeric; returns 0 otherwise, including for sz == 0.
 */
int
mandoc_eos(const char *p, size_t sz, int enclosed)
{
	size_t		 i;
	int		 found;
	unsigned char	 c;

	if (0 == sz)
		return(0);

	/*
	 * End-of-sentence recognition must include situations where
	 * some symbols, such as `)', allow prior EOS punctuation to
	 * propagate outward.
	 */

	found = 0;
	/*
	 * Walk backwards by index rather than by pointer, so that
	 * no pointer before the start of the buffer is ever formed.
	 */
	for (i = sz; i > 0; i--) {
		c = (unsigned char)p[i - 1];
		switch (c) {
		case ('\"'):
			/* FALLTHROUGH */
		case ('\''):
			/* FALLTHROUGH */
		case (']'):
			/* FALLTHROUGH */
		case (')'):
			if (0 == found)
				enclosed = 1;
			break;
		case ('.'):
			/* FALLTHROUGH */
		case ('!'):
			/* FALLTHROUGH */
		case ('?'):
			found = 1;
			break;
		default:
			return(found && (!enclosed || isalnum(c)));
		}
	}

	/* Only delimiters and punctuation all the way to the start. */
	return(found && !enclosed);
}
1.26 schwarze 622:
/*
 * Convert the first sz bytes of p to an int using strtol(3)
 * with the given base.
 * Return -1 if sz exceeds 31, if the input is empty, or if it is
 * not completely consumed by the conversion.
 * Values outside the range of int are clamped to INT_MAX or INT_MIN.
 */
int
mandoc_strntoi(const char *p, size_t sz, int base)
{
	char	 digits[32];
	char	*stop;
	long	 val;

	/* One byte of the buffer is needed for the terminator. */
	if (sz >= sizeof(digits))
		return(-1);

	memcpy(digits, p, sz);
	digits[sz] = '\0';

	errno = 0;
	val = strtol(digits, &stop, base);

	if ('\0' == digits[0] || '\0' != *stop)
		return(-1);

	if (val > INT_MAX)
		return(INT_MAX);
	if (val < INT_MIN)
		return(INT_MIN);

	return((int)val);
}