Annotation of src/usr.bin/mandoc/apropos_db.c, Revision 1.5
1.5 ! schwarze 1: /* $Id: apropos_db.c,v 1.4 2011/11/16 13:23:27 schwarze Exp $ */
1.1 schwarze 2: /*
3: * Copyright (c) 2011 Kristaps Dzonsons <kristaps@bsd.lv>
1.3 schwarze 4: * Copyright (c) 2011 Ingo Schwarze <schwarze@openbsd.org>
1.1 schwarze 5: *
6: * Permission to use, copy, modify, and distribute this software for any
7: * purpose with or without fee is hereby granted, provided that the above
8: * copyright notice and this permission notice appear in all copies.
9: *
10: * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
11: * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
12: * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
13: * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
14: * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
15: * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
16: * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
17: */
18: #include <assert.h>
19: #include <fcntl.h>
20: #include <regex.h>
21: #include <stdarg.h>
22: #include <stdlib.h>
23: #include <string.h>
1.5 ! schwarze 24: #include <unistd.h>
1.1 schwarze 25:
26: #ifdef __linux__
27: # include <db_185.h>
28: #else
29: # include <db.h>
30: #endif
31:
1.2 schwarze 32: #include "mandocdb.h"
1.1 schwarze 33: #include "apropos_db.h"
34: #include "mandoc.h"
35:
1.5 ! schwarze 36: struct rectree {
! 37: struct rec *node;
! 38: int len;
! 39: };
! 40:
1.1 schwarze 41: struct expr {
1.3 schwarze 42: int regex;
1.4 schwarze 43: int index;
1.1 schwarze 44: int mask;
1.4 schwarze 45: int and;
1.1 schwarze 46: char *v;
47: regex_t re;
1.4 schwarze 48: struct expr *next;
1.1 schwarze 49: };
50:
51: struct type {
52: int mask;
53: const char *name;
54: };
55:
56: static const struct type types[] = {
1.2 schwarze 57: { TYPE_An, "An" },
58: { TYPE_Cd, "Cd" },
59: { TYPE_Er, "Er" },
60: { TYPE_Ev, "Ev" },
61: { TYPE_Fn, "Fn" },
62: { TYPE_Fn, "Fo" },
63: { TYPE_In, "In" },
64: { TYPE_Nd, "Nd" },
65: { TYPE_Nm, "Nm" },
66: { TYPE_Pa, "Pa" },
67: { TYPE_St, "St" },
68: { TYPE_Va, "Va" },
69: { TYPE_Va, "Vt" },
70: { TYPE_Xr, "Xr" },
71: { INT_MAX, "any" },
1.1 schwarze 72: { 0, NULL }
73: };
74:
75: static DB *btree_open(void);
76: static int btree_read(const DBT *, const struct mchars *, char **);
1.4 schwarze 77: static int exprexecpre(const struct expr *, const char *, int);
78: static void exprexecpost(const struct expr *,
79: const char *, int, int *, size_t);
80: static struct expr *exprterm(char *, int, int);
1.1 schwarze 81: static DB *index_open(void);
82: static int index_read(const DBT *, const DBT *,
83: const struct mchars *, struct rec *);
84: static void norm_string(const char *,
85: const struct mchars *, char **);
86: static size_t norm_utf8(unsigned int, char[7]);
1.4 schwarze 87: static void recfree(struct rec *);
1.5 ! schwarze 88: static void single_search(struct rectree *, const struct opts *,
! 89: const struct expr *, size_t terms,
! 90: struct mchars *);
1.1 schwarze 91:
92: /*
93: * Open the keyword mandoc-db database.
94: */
95: static DB *
96: btree_open(void)
97: {
98: BTREEINFO info;
99: DB *db;
100:
101: memset(&info, 0, sizeof(BTREEINFO));
102: info.flags = R_DUP;
103:
1.2 schwarze 104: db = dbopen(MANDOC_DB, O_RDONLY, 0, DB_BTREE, &info);
1.1 schwarze 105: if (NULL != db)
106: return(db);
107:
108: return(NULL);
109: }
110:
111: /*
112: * Read a keyword from the database and normalise it.
113: * Return 0 if the database is insane, else 1.
114: */
115: static int
116: btree_read(const DBT *v, const struct mchars *mc, char **buf)
117: {
118:
119: /* Sanity: are we nil-terminated? */
120:
121: assert(v->size > 0);
122: if ('\0' != ((char *)v->data)[(int)v->size - 1])
123: return(0);
124:
125: norm_string((char *)v->data, mc, buf);
126: return(1);
127: }
128:
129: /*
130: * Take a Unicode codepoint and produce its UTF-8 encoding.
131: * This isn't the best way to do this, but it works.
132: * The magic numbers are from the UTF-8 packaging.
133: * They're not as scary as they seem: read the UTF-8 spec for details.
134: */
135: static size_t
136: norm_utf8(unsigned int cp, char out[7])
137: {
138: size_t rc;
139:
140: rc = 0;
141:
142: if (cp <= 0x0000007F) {
143: rc = 1;
144: out[0] = (char)cp;
145: } else if (cp <= 0x000007FF) {
146: rc = 2;
147: out[0] = (cp >> 6 & 31) | 192;
148: out[1] = (cp & 63) | 128;
149: } else if (cp <= 0x0000FFFF) {
150: rc = 3;
151: out[0] = (cp >> 12 & 15) | 224;
152: out[1] = (cp >> 6 & 63) | 128;
153: out[2] = (cp & 63) | 128;
154: } else if (cp <= 0x001FFFFF) {
155: rc = 4;
156: out[0] = (cp >> 18 & 7) | 240;
157: out[1] = (cp >> 12 & 63) | 128;
158: out[2] = (cp >> 6 & 63) | 128;
159: out[3] = (cp & 63) | 128;
160: } else if (cp <= 0x03FFFFFF) {
161: rc = 5;
162: out[0] = (cp >> 24 & 3) | 248;
163: out[1] = (cp >> 18 & 63) | 128;
164: out[2] = (cp >> 12 & 63) | 128;
165: out[3] = (cp >> 6 & 63) | 128;
166: out[4] = (cp & 63) | 128;
167: } else if (cp <= 0x7FFFFFFF) {
168: rc = 6;
169: out[0] = (cp >> 30 & 1) | 252;
170: out[1] = (cp >> 24 & 63) | 128;
171: out[2] = (cp >> 18 & 63) | 128;
172: out[3] = (cp >> 12 & 63) | 128;
173: out[4] = (cp >> 6 & 63) | 128;
174: out[5] = (cp & 63) | 128;
175: } else
176: return(0);
177:
178: out[rc] = '\0';
179: return(rc);
180: }
181:
182: /*
183: * Normalise strings from the index and database.
184: * These strings are escaped as defined by mandoc_char(7) along with
185: * other goop in mandoc.h (e.g., soft hyphens).
186: * This function normalises these into a nice UTF-8 string.
187: * Returns 0 if the database is fucked.
188: */
189: static void
190: norm_string(const char *val, const struct mchars *mc, char **buf)
191: {
192: size_t sz, bsz;
193: char utfbuf[7];
194: const char *seq, *cpp;
195: int len, u, pos;
196: enum mandoc_esc esc;
197: static const char res[] = { '\\', '\t',
198: ASCII_NBRSP, ASCII_HYPH, '\0' };
199:
200: /* Pre-allocate by the length of the input */
201:
202: bsz = strlen(val) + 1;
203: *buf = mandoc_realloc(*buf, bsz);
204: pos = 0;
205:
206: while ('\0' != *val) {
207: /*
208: * Halt on the first escape sequence.
209: * This also halts on the end of string, in which case
210: * we just copy, fallthrough, and exit the loop.
211: */
212: if ((sz = strcspn(val, res)) > 0) {
213: memcpy(&(*buf)[pos], val, sz);
214: pos += (int)sz;
215: val += (int)sz;
216: }
217:
218: if (ASCII_HYPH == *val) {
219: (*buf)[pos++] = '-';
220: val++;
221: continue;
222: } else if ('\t' == *val || ASCII_NBRSP == *val) {
223: (*buf)[pos++] = ' ';
224: val++;
225: continue;
226: } else if ('\\' != *val)
227: break;
228:
229: /* Read past the slash. */
230:
231: val++;
232: u = 0;
233:
234: /*
235: * Parse the escape sequence and see if it's a
236: * predefined character or special character.
237: */
238:
239: esc = mandoc_escape(&val, &seq, &len);
240: if (ESCAPE_ERROR == esc)
241: break;
242:
243: /*
244: * XXX - this just does UTF-8, but we need to know
245: * beforehand whether we should do text substitution.
246: */
247:
248: switch (esc) {
249: case (ESCAPE_SPECIAL):
250: if (0 != (u = mchars_spec2cp(mc, seq, len)))
251: break;
252: /* FALLTHROUGH */
253: default:
254: continue;
255: }
256:
257: /*
258: * If we have a Unicode codepoint, try to convert that
259: * to a UTF-8 byte string.
260: */
261:
262: cpp = utfbuf;
263: if (0 == (sz = norm_utf8(u, utfbuf)))
264: continue;
265:
266: /* Copy the rendered glyph into the stream. */
267:
268: sz = strlen(cpp);
269: bsz += sz;
270:
271: *buf = mandoc_realloc(*buf, bsz);
272:
273: memcpy(&(*buf)[pos], cpp, sz);
274: pos += (int)sz;
275: }
276:
277: (*buf)[pos] = '\0';
278: }
279:
280: /*
281: * Open the filename-index mandoc-db database.
282: * Returns NULL if opening failed.
283: */
284: static DB *
285: index_open(void)
286: {
287: DB *db;
288:
1.2 schwarze 289: db = dbopen(MANDOC_IDX, O_RDONLY, 0, DB_RECNO, NULL);
1.1 schwarze 290: if (NULL != db)
291: return(db);
292:
293: return(NULL);
294: }
295:
296: /*
297: * Safely unpack from an index file record into the structure.
298: * Returns 1 if an entry was unpacked, 0 if the database is insane.
299: */
300: static int
301: index_read(const DBT *key, const DBT *val,
302: const struct mchars *mc, struct rec *rec)
303: {
304: size_t left;
305: char *np, *cp;
306:
307: #define INDEX_BREAD(_dst) \
308: do { \
309: if (NULL == (np = memchr(cp, '\0', left))) \
310: return(0); \
311: norm_string(cp, mc, &(_dst)); \
312: left -= (np - cp) + 1; \
313: cp = np + 1; \
314: } while (/* CONSTCOND */ 0)
315:
316: left = val->size;
317: cp = (char *)val->data;
318:
319: rec->rec = *(recno_t *)key->data;
320:
321: INDEX_BREAD(rec->file);
322: INDEX_BREAD(rec->cat);
323: INDEX_BREAD(rec->title);
324: INDEX_BREAD(rec->arch);
325: INDEX_BREAD(rec->desc);
326: return(1);
327: }
328:
329: /*
330: * Search the mandocdb database for the expression "expr".
331: * Filter out by "opts".
332: * Call "res" with the results, which may be zero.
333: */
334: void
1.5 ! schwarze 335: apropos_search(int argc, char *argv[], const struct opts *opts,
! 336: const struct expr *expr, size_t terms, void *arg,
1.4 schwarze 337: void (*res)(struct rec *, size_t, void *))
1.1 schwarze 338: {
1.5 ! schwarze 339: struct rectree tree;
! 340: struct mchars *mc;
! 341: struct rec *recs;
! 342: int i, mlen;
! 343:
! 344: memset(&tree, 0, sizeof(struct rectree));
! 345:
! 346: /* XXX: error out with bad regexp? */
! 347:
! 348: mc = mchars_alloc();
! 349:
! 350: for (i = 0; i < argc; i++) {
! 351: if (chdir(argv[i]))
! 352: continue;
! 353: single_search(&tree, opts, expr, terms, mc);
! 354: }
! 355:
! 356: /*
! 357: * Count the matching files
! 358: * and feed them to the output handler.
! 359: */
! 360:
! 361: for (mlen = i = 0; i < tree.len; i++)
! 362: if (tree.node[i].matches[0])
! 363: mlen++;
! 364: recs = mandoc_malloc(mlen * sizeof(struct rec));
! 365: for (mlen = i = 0; i < tree.len; i++)
! 366: if (tree.node[i].matches[0])
! 367: memcpy(&recs[mlen++], &tree.node[i],
! 368: sizeof(struct rec));
! 369: (*res)(recs, mlen, arg);
! 370: free(recs);
! 371:
! 372: for (i = 0; i < tree.len; i++)
! 373: recfree(&tree.node[i]);
! 374:
! 375: if (mc)
! 376: mchars_free(mc);
! 377: }
! 378:
! 379: static void
! 380: single_search(struct rectree *tree, const struct opts *opts,
! 381: const struct expr *expr, size_t terms,
! 382: struct mchars *mc)
! 383: {
! 384: int root, leaf, mask;
1.1 schwarze 385: DBT key, val;
386: DB *btree, *idx;
387: int ch;
388: char *buf;
389: recno_t rec;
1.5 ! schwarze 390: struct rec *recs;
1.1 schwarze 391: struct rec srec;
392:
393: root = -1;
394: leaf = -1;
395: btree = NULL;
396: idx = NULL;
397: buf = NULL;
1.5 ! schwarze 398: recs = tree->node;
1.1 schwarze 399:
400: memset(&srec, 0, sizeof(struct rec));
401:
402: /* XXX: return fact that we've errored? */
403:
404: if (NULL == (btree = btree_open()))
405: goto out;
406: if (NULL == (idx = index_open()))
407: goto out;
408:
409: while (0 == (ch = (*btree->seq)(btree, &key, &val, R_NEXT))) {
410: /*
411: * Low-water mark for key and value.
412: * The key must have something in it, and the value must
413: * have the correct tags/recno mix.
414: */
415: if (key.size < 2 || 8 != val.size)
416: break;
417: if ( ! btree_read(&key, mc, &buf))
418: break;
419:
1.4 schwarze 420: mask = *(int *)val.data;
421:
422: /*
423: * See if this keyword record matches any of the
424: * expressions we have stored.
425: */
426: if ( ! exprexecpre(expr, buf, mask))
1.1 schwarze 427: continue;
428:
429: memcpy(&rec, val.data + 4, sizeof(recno_t));
430:
431: /*
432: * O(log n) scan for prior records. Since a record
433: * number is unbounded, this has decent performance over
434: * a complex hash function.
435: */
436:
437: for (leaf = root; leaf >= 0; )
438: if (rec > recs[leaf].rec && recs[leaf].rhs >= 0)
439: leaf = recs[leaf].rhs;
440: else if (rec < recs[leaf].rec && recs[leaf].lhs >= 0)
441: leaf = recs[leaf].lhs;
442: else
443: break;
444:
1.4 schwarze 445: if (leaf >= 0 && recs[leaf].rec == rec) {
446: if (0 == recs[leaf].matches[0])
447: exprexecpost
448: (expr, buf, mask,
449: recs[leaf].matches, terms);
1.1 schwarze 450: continue;
1.4 schwarze 451: }
1.1 schwarze 452:
453: /*
454: * Now we actually extract the manpage's metadata from
455: * the index database.
456: */
457:
458: key.data = &rec;
459: key.size = sizeof(recno_t);
460:
461: if (0 != (*idx->get)(idx, &key, &val, 0))
462: break;
463:
464: srec.lhs = srec.rhs = -1;
465: if ( ! index_read(&key, &val, mc, &srec))
466: break;
467:
468: if (opts->cat && strcasecmp(opts->cat, srec.cat))
469: continue;
470: if (opts->arch && strcasecmp(opts->arch, srec.arch))
471: continue;
472:
1.5 ! schwarze 473: tree->node = recs = mandoc_realloc
! 474: (recs, (tree->len + 1) * sizeof(struct rec));
1.1 schwarze 475:
1.5 ! schwarze 476: memcpy(&recs[tree->len], &srec, sizeof(struct rec));
! 477: recs[tree->len].matches =
1.4 schwarze 478: mandoc_calloc(terms + 1, sizeof(int));
479:
480: exprexecpost
481: (expr, buf, mask,
1.5 ! schwarze 482: recs[tree->len].matches, terms);
1.1 schwarze 483:
484: /* Append to our tree. */
485:
486: if (leaf >= 0) {
487: if (rec > recs[leaf].rec)
1.5 ! schwarze 488: recs[leaf].rhs = tree->len;
1.1 schwarze 489: else
1.5 ! schwarze 490: recs[leaf].lhs = tree->len;
1.1 schwarze 491: } else
1.5 ! schwarze 492: root = tree->len;
1.1 schwarze 493:
494: memset(&srec, 0, sizeof(struct rec));
1.5 ! schwarze 495: tree->len++;
1.1 schwarze 496: }
497:
1.5 ! schwarze 498: /* XXX handle database errors? */
1.1 schwarze 499:
500: out:
1.4 schwarze 501: recfree(&srec);
1.1 schwarze 502:
503: if (btree)
504: (*btree->close)(btree);
505: if (idx)
506: (*idx->close)(idx);
507:
508: free(buf);
509: }
510:
1.4 schwarze 511: static void
512: recfree(struct rec *rec)
513: {
514:
515: free(rec->file);
516: free(rec->matches);
517: free(rec->cat);
518: free(rec->title);
519: free(rec->arch);
520: free(rec->desc);
521: }
522:
1.1 schwarze 523: struct expr *
1.4 schwarze 524: exprcomp(int argc, char *argv[], size_t *tt)
525: {
526: struct expr *e, *first, *next;
527: int pos, log;
528:
529: first = next = NULL;
530: (*tt) = 0;
531:
532: for (pos = 0; pos < argc; pos++) {
533: e = next;
534: log = 0;
535:
536: if (0 == strcmp("-a", argv[pos]))
537: log = 1;
538: else if (0 == strcmp("-o", argv[pos]))
539: log = 2;
540:
541: if (log > 0 && ++pos >= argc)
542: goto err;
543:
544: if (0 == strcmp("-i", argv[pos])) {
545: if (++pos >= argc)
546: goto err;
547: next = exprterm(argv[pos], 1, log == 1);
548: } else
549: next = exprterm(argv[pos], 0, log == 1);
550:
551: if (NULL == next)
552: goto err;
553:
554: next->index = (int)(*tt)++;
555:
556: if (NULL == first) {
557: assert(NULL == e);
558: first = next;
559: } else {
560: assert(NULL != e);
561: e->next = next;
562: }
563: }
564:
565: return(first);
566: err:
567: exprfree(first);
568: return(NULL);
569: }
570:
571: static struct expr *
572: exprterm(char *buf, int cs, int and)
1.1 schwarze 573: {
1.4 schwarze 574: struct expr e;
1.1 schwarze 575: struct expr *p;
1.3 schwarze 576: char *key;
1.4 schwarze 577: int i;
1.1 schwarze 578:
1.4 schwarze 579: memset(&e, 0, sizeof(struct expr));
1.1 schwarze 580:
1.4 schwarze 581: e.and = and;
582:
583: /*
584: * Choose regex or substring match.
1.3 schwarze 585: */
586:
1.4 schwarze 587: if (NULL == (e.v = strpbrk(buf, "=~"))) {
1.3 schwarze 588: e.regex = 0;
1.4 schwarze 589: e.v = buf;
1.3 schwarze 590: } else {
591: e.regex = '~' == *e.v;
592: *e.v++ = '\0';
593: }
1.1 schwarze 594:
1.3 schwarze 595: /*
596: * Determine the record types to search for.
597: */
598:
599: e.mask = 0;
1.4 schwarze 600: if (buf < e.v) {
601: while (NULL != (key = strsep(&buf, ","))) {
1.3 schwarze 602: i = 0;
603: while (types[i].mask &&
604: strcmp(types[i].name, key))
605: i++;
606: e.mask |= types[i].mask;
607: }
608: }
609: if (0 == e.mask)
610: e.mask = TYPE_Nm | TYPE_Nd;
1.1 schwarze 611:
1.4 schwarze 612: if (e.regex) {
613: i = REG_EXTENDED | REG_NOSUB | cs ? REG_ICASE : 0;
614: if (regcomp(&e.re, e.v, i))
615: return(NULL);
616: }
1.1 schwarze 617:
1.3 schwarze 618: e.v = mandoc_strdup(e.v);
1.1 schwarze 619:
620: p = mandoc_calloc(1, sizeof(struct expr));
621: memcpy(p, &e, sizeof(struct expr));
622: return(p);
623: }
624:
625: void
626: exprfree(struct expr *p)
627: {
1.4 schwarze 628: struct expr *pp;
629:
630: while (NULL != p) {
631: if (p->regex)
632: regfree(&p->re);
633: free(p->v);
634: pp = p->next;
635: free(p);
636: p = pp;
637: }
638: }
1.1 schwarze 639:
1.4 schwarze 640: /*
641: * See if this expression evaluates to true for any terms.
642: * Return 1 if any expression evaluates to true, else 0.
643: */
644: static int
645: exprexecpre(const struct expr *p, const char *cp, int mask)
646: {
1.1 schwarze 647:
1.4 schwarze 648: for ( ; NULL != p; p = p->next) {
649: if ( ! (mask & p->mask))
650: continue;
651: if (p->regex) {
652: if (0 == regexec(&p->re, cp, 0, NULL, 0))
653: return(1);
654: } else if (NULL != strcasestr(cp, p->v))
655: return(1);
656: }
657: return(0);
1.1 schwarze 658: }
659:
1.4 schwarze 660: /*
661: * First, update the array of terms for which this expression evaluates
662: * to true.
663: * Second, logically evaluate all terms over the updated array of truth
664: * values.
665: * If this evaluates to true, mark the expression as satisfied.
666: */
667: static void
668: exprexecpost(const struct expr *e, const char *cp,
669: int mask, int *matches, size_t matchsz)
1.1 schwarze 670: {
1.4 schwarze 671: const struct expr *p;
672: int match;
1.1 schwarze 673:
1.4 schwarze 674: assert(0 == matches[0]);
675:
676: for (p = e; p; p = p->next) {
677: if ( ! (mask & p->mask))
678: continue;
679: if (p->regex) {
680: if (regexec(&p->re, cp, 0, NULL, 0))
681: continue;
682: } else if (NULL == strcasestr(cp, p->v))
683: continue;
684:
685: matches[p->index + 1] = 1;
686: }
687:
688: for (match = 0, p = e; p && ! match; p = p->next) {
689: match = matches[p->index + 1];
690: for ( ; p->next && p->next->and; p = p->next)
691: match = match && matches[p->next->index + 1];
692: }
1.1 schwarze 693:
1.4 schwarze 694: matches[0] = match;
1.1 schwarze 695: }