Annotation of src/usr.bin/join/join.c, Revision 1.2
1.1 deraadt 1: /*-
2: * Copyright (c) 1991 The Regents of the University of California.
3: * All rights reserved.
4: *
5: * This code is derived from software contributed to Berkeley by
6: * Steve Hayman of Indiana University, Michiro Hikida and David
7: * Goodenough.
8: *
9: * Redistribution and use in source and binary forms, with or without
10: * modification, are permitted provided that the following conditions
11: * are met:
12: * 1. Redistributions of source code must retain the above copyright
13: * notice, this list of conditions and the following disclaimer.
14: * 2. Redistributions in binary form must reproduce the above copyright
15: * notice, this list of conditions and the following disclaimer in the
16: * documentation and/or other materials provided with the distribution.
17: * 3. All advertising materials mentioning features or use of this software
18: * must display the following acknowledgement:
19: * This product includes software developed by the University of
20: * California, Berkeley and its contributors.
21: * 4. Neither the name of the University nor the names of its contributors
22: * may be used to endorse or promote products derived from this software
23: * without specific prior written permission.
24: *
25: * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
26: * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
27: * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
28: * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
29: * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
30: * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
31: * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
32: * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
33: * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
34: * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
35: * SUCH DAMAGE.
36: */
37:
38: #ifndef lint
39: char copyright[] =
40: "@(#) Copyright (c) 1991 The Regents of the University of California.\n\
41: All rights reserved.\n";
42: #endif /* not lint */
43:
44: #ifndef lint
45: /*static char sccsid[] = "from: @(#)join.c 5.1 (Berkeley) 11/18/91";*/
1.2 ! deraadt 46: static char rcsid[] = "$Id: join.c,v 1.1.1.1 1995/10/18 08:45:26 deraadt Exp $";
1.1 deraadt 47: #endif /* not lint */
48:
49: #include <sys/types.h>
50: #include <stdio.h>
51: #include <stdlib.h>
52: #include <string.h>
53: #include <ctype.h>
54: #include <errno.h>
55:
56: /*
57: * There's a structure per input file which encapsulates the state of the
58: * file. We repeatedly read lines from each file until we've read in all
59: * the consecutive lines from the file with a common join field. Then we
60: * compare the set of lines with an equivalent set from the other file.
61: */
62: typedef struct {
63: char *line; /* line */
64: u_long linealloc; /* line allocated count */
65: char **fields; /* line field(s) */
66: u_long fieldcnt; /* line field(s) count */
67: u_long fieldalloc; /* line field(s) allocated count */
68: } LINE;
69:
70: typedef struct {
71: FILE *fp; /* file descriptor */
72: u_long joinf; /* join field (-1, -2, -j) */
73: int unpair; /* output unpairable lines (-a) */
74: int number; /* 1 for file 1, 2 for file 2 */
75:
76: LINE *set; /* set of lines with same field */
77: u_long pushback; /* line on the stack */
78: u_long setcnt; /* set count */
79: u_long setalloc; /* set allocated count */
80: } INPUT;
81: INPUT input1 = { NULL, 0, 0, 1, NULL, -1, 0, 0, },
82: input2 = { NULL, 0, 0, 1, NULL, -1, 0, 0, };
83:
84: typedef struct {
85: u_long fileno; /* file number */
86: u_long fieldno; /* field number */
87: } OLIST;
88: OLIST *olist; /* output field list */
89: u_long olistcnt; /* output field list count */
90: u_long olistalloc; /* output field allocated count */
91:
92: int joinout = 1; /* show lines with matched join fields (-v) */
93: int needsep; /* need separator character */
94: int showusage = 1; /* show usage for usage err() calls */
95: int spans = 1; /* span multiple delimiters (-t) */
96: char *empty; /* empty field replacement string (-e) */
97: char *tabchar = " \t"; /* delimiter characters (-t) */
98:
99: int cmp __P((LINE *, u_long, LINE *, u_long));
100: void enomem __P((void));
101: void err __P((const char *, ...));
102: void fieldarg __P((char *));
103: void joinlines __P((INPUT *, INPUT *));
104: void obsolete __P((char **));
105: void outfield __P((LINE *, u_long));
106: void outoneline __P((INPUT *, LINE *));
107: void outtwoline __P((INPUT *, LINE *, INPUT *, LINE *));
108: void slurp __P((INPUT *));
109: void usage __P((void));
110:
111: int
112: main(argc, argv)
113: int argc;
114: char *argv[];
115: {
116: register INPUT *F1, *F2;
117: int aflag, ch, cval, vflag;
118: char *end;
119:
120: F1 = &input1;
121: F2 = &input2;
122:
123: aflag = vflag = 0;
124: obsolete(argv);
125: while ((ch = getopt(argc, argv, "\01a:e:j:1:2:o:t:v:")) != EOF) {
126: switch (ch) {
127: case '\01':
128: aflag = 1;
129: F1->unpair = F2->unpair = 1;
130: break;
131: case '1':
132: if ((F1->joinf = strtol(optarg, &end, 10)) < 1)
133: err("-1 option field number less than 1");
134: if (*end)
135: err("illegal field number -- %s", optarg);
136: --F1->joinf;
137: break;
138: case '2':
139: if ((F2->joinf = strtol(optarg, &end, 10)) < 1)
140: err("-2 option field number less than 1");
141: if (*end)
142: err("illegal field number -- %s", optarg);
143: --F2->joinf;
144: break;
145: case 'a':
146: aflag = 1;
147: switch(strtol(optarg, &end, 10)) {
148: case 1:
149: F1->unpair = 1;
150: break;
151: case 2:
152: F2->unpair = 1;
153: break;
154: default:
155: err("-a option file number not 1 or 2");
156: break;
157: }
158: if (*end)
159: err("illegal file number -- %s", optarg);
160: break;
161: case 'e':
162: empty = optarg;
163: break;
164: case 'j':
165: if ((F1->joinf = F2->joinf =
166: strtol(optarg, &end, 10)) < 1)
167: err("-j option field number less than 1");
168: if (*end)
169: err("illegal field number -- %s", optarg);
170: --F1->joinf;
171: --F2->joinf;
172: break;
173: case 'o':
174: fieldarg(optarg);
175: break;
176: case 't':
177: spans = 0;
178: if (strlen(tabchar = optarg) != 1)
179: err("illegal tab character specification");
180: break;
181: case 'v':
182: vflag = 1;
183: joinout = 0;
184: switch(strtol(optarg, &end, 10)) {
185: case 1:
186: F1->unpair = 1;
187: break;
188: case 2:
189: F2->unpair = 1;
190: break;
191: default:
192: err("-v option file number not 1 or 2");
193: break;
194: }
195: if (*end)
196: err("illegal file number -- %s", optarg);
197: break;
198: case '?':
199: default:
200: usage();
201: }
202: }
203: argc -= optind;
204: argv += optind;
205:
206: if (aflag && vflag)
207: err("-a and -v options mutually exclusive");
208:
209: if (argc != 2)
210: usage();
211: showusage = 0;
212:
213: /* Open the files; "-" means stdin. */
214: if (!strcmp(*argv, "-"))
215: F1->fp = stdin;
216: else if ((F1->fp = fopen(*argv, "r")) == NULL)
217: err("%s: %s", *argv, strerror(errno));
218: ++argv;
219: if (!strcmp(*argv, "-"))
220: F2->fp = stdin;
221: else if ((F2->fp = fopen(*argv, "r")) == NULL)
222: err("%s: %s", *argv, strerror(errno));
223: if (F1->fp == stdin && F2->fp == stdin)
224: err("only one input file may be stdin");
225:
226: slurp(F1);
227: slurp(F2);
228: while (F1->setcnt && F2->setcnt) {
229: cval = cmp(F1->set, F1->joinf, F2->set, F2->joinf);
230: if (cval == 0) {
231: /* Oh joy, oh rapture, oh beauty divine! */
232: if (joinout)
233: joinlines(F1, F2);
234: slurp(F1);
235: slurp(F2);
236: } else if (cval < 0) {
237: /* File 1 takes the lead... */
238: if (F1->unpair)
239: joinlines(F1, NULL);
240: slurp(F1);
241: } else {
242: /* File 2 takes the lead... */
243: if (F2->unpair)
244: joinlines(F2, NULL);
245: slurp(F2);
246: }
247: }
248:
249: /*
250: * Now that one of the files is used up, optionally output any
251: * remaining lines from the other file.
252: */
253: if (F1->unpair)
254: while (F1->setcnt) {
255: joinlines(F1, NULL);
256: slurp(F1);
257: }
258: if (F2->unpair)
259: while (F2->setcnt) {
260: joinlines(F2, NULL);
261: slurp(F2);
262: }
263: exit(0);
264: }
265:
266: void
267: slurp(F)
268: INPUT *F;
269: {
270: register LINE *lp, *lastlp;
271: LINE tmp;
272: size_t len;
273: int cnt;
274: char *bp, *fieldp;
275:
276: /*
277: * Read all of the lines from an input file that have the same
278: * join field.
279: */
280: F->setcnt = 0;
281: for (lastlp = NULL;; ++F->setcnt, lastlp = lp) {
282: /*
283: * If we're out of space to hold line structures, allocate
284: * more. Initialize the structure so that we know that this
285: * is new space.
286: */
287: if (F->setcnt == F->setalloc) {
288: cnt = F->setalloc;
289: F->setalloc += 100;
290: if ((F->set = realloc(F->set,
291: F->setalloc * sizeof(LINE))) == NULL)
292: enomem();
293: bzero(F->set + cnt, 100 * sizeof(LINE *));
294: }
295:
296: /*
297: * Get any pushed back line, else get the next line. Allocate
298: * space as necessary. If taking the line from the stack swap
299: * the two structures so that we don't lose the allocated space.
300: * This could be avoided by doing another level of indirection,
301: * but it's probably okay as is.
302: */
303: lp = &F->set[F->setcnt];
304: if (F->pushback != -1) {
305: tmp = F->set[F->setcnt];
306: F->set[F->setcnt] = F->set[F->pushback];
307: F->set[F->pushback] = tmp;
308: F->pushback = -1;
309: continue;
310: }
311: if ((bp = fgetln(F->fp, &len)) == NULL)
312: return;
313: if (lp->linealloc <= len + 1) {
314: if (lp->linealloc == 0)
315: lp->linealloc = 128;
316: while (lp->linealloc <= len + 1)
317: lp->linealloc *= 2;
318:
319: if ((lp->line = realloc(lp->line,
320: lp->linealloc * sizeof(char))) == NULL)
321: enomem();
322: }
323: bcopy(bp, lp->line, len+1);
324:
325: /* Replace trailing newline, if it exists. */
326: if (bp[len - 1] == '\n')
327: lp->line[len - 1] = '\0';
328: else
329: lp->line[len] = '\0';
330: bp = lp->line;
331:
332: /* Split the line into fields, allocate space as necessary. */
333: lp->fieldcnt = 0;
334: while ((fieldp = strsep(&bp, tabchar)) != NULL) {
335: if (spans && *fieldp == '\0')
336: continue;
337: if (lp->fieldcnt == lp->fieldalloc) {
338: lp->fieldalloc += 100;
339: if ((lp->fields = realloc(lp->fields,
340: lp->fieldalloc * sizeof(char *))) == NULL)
341: enomem();
342: }
343: lp->fields[lp->fieldcnt++] = fieldp;
344: }
345:
346: /* See if the join field value has changed. */
347: if (lastlp != NULL && cmp(lp, F->joinf, lastlp, F->joinf)) {
348: F->pushback = F->setcnt;
349: break;
350: }
351: }
352: }
353:
354: int
355: cmp(lp1, fieldno1, lp2, fieldno2)
356: LINE *lp1, *lp2;
357: u_long fieldno1, fieldno2;
358: {
1.2 ! deraadt 359: if (fieldno1 >= lp1->fieldcnt)
1.1 deraadt 360: return (lp2->fieldcnt < fieldno2 ? 0 : 1);
1.2 ! deraadt 361: if (fieldno2 >= lp2->fieldcnt)
1.1 deraadt 362: return (-1);
363: return (strcmp(lp1->fields[fieldno1], lp2->fields[fieldno2]));
364: }
365:
366: void
367: joinlines(F1, F2)
368: register INPUT *F1, *F2;
369: {
370: register int cnt1, cnt2;
371:
372: /*
373: * Output the results of a join comparison. The output may be from
374: * either file 1 or file 2 (in which case the first argument is the
375: * file from which to output) or from both.
376: */
377: if (F2 == NULL) {
378: for (cnt1 = 0; cnt1 < F1->setcnt; ++cnt1)
379: outoneline(F1, &F1->set[cnt1]);
380: return;
381: }
382: for (cnt1 = 0; cnt1 < F1->setcnt; ++cnt1)
383: for (cnt2 = 0; cnt2 < F2->setcnt; ++cnt2)
384: outtwoline(F1, &F1->set[cnt1], F2, &F2->set[cnt2]);
385: }
386:
387: void
388: outoneline(F, lp)
389: INPUT *F;
390: register LINE *lp;
391: {
392: register int cnt;
393:
394: /*
395: * Output a single line from one of the files, according to the
396: * join rules. This happens when we are writing unmatched single
397: * lines. Output empty fields in the right places.
398: */
399: if (olist)
400: for (cnt = 0; cnt < olistcnt; ++cnt) {
401: if (olist[cnt].fileno == F->number)
402: outfield(lp, olist[cnt].fieldno);
403: }
404: else
405: for (cnt = 0; cnt < lp->fieldcnt; ++cnt)
406: outfield(lp, cnt);
407: (void)printf("\n");
408: if (ferror(stdout))
409: err("stdout: %s", strerror(errno));
410: needsep = 0;
411: }
412:
413: void
414: outtwoline(F1, lp1, F2, lp2)
415: register INPUT *F1, *F2;
416: register LINE *lp1, *lp2;
417: {
418: register int cnt;
419:
420: /* Output a pair of lines according to the join list (if any). */
421: if (olist)
422: for (cnt = 0; cnt < olistcnt; ++cnt)
423: if (olist[cnt].fileno == 1)
424: outfield(lp1, olist[cnt].fieldno);
425: else /* if (olist[cnt].fileno == 2) */
426: outfield(lp2, olist[cnt].fieldno);
427: else {
428: /*
429: * Output the join field, then the remaining fields from F1
430: * and F2.
431: */
432: outfield(lp1, F1->joinf);
433: for (cnt = 0; cnt < lp1->fieldcnt; ++cnt)
434: if (F1->joinf != cnt)
435: outfield(lp1, cnt);
436: for (cnt = 0; cnt < lp2->fieldcnt; ++cnt)
437: if (F2->joinf != cnt)
438: outfield(lp2, cnt);
439: }
440: (void)printf("\n");
441: if (ferror(stdout))
442: err("stdout: %s", strerror(errno));
443: needsep = 0;
444: }
445:
446: void
447: outfield(lp, fieldno)
448: LINE *lp;
449: u_long fieldno;
450: {
451: if (needsep++)
452: (void)printf("%c", *tabchar);
453: if (!ferror(stdout))
454: if (lp->fieldcnt < fieldno) {
455: if (empty != NULL)
456: (void)printf("%s", empty);
457: } else {
458: if (*lp->fields[fieldno] == '\0')
459: return;
460: (void)printf("%s", lp->fields[fieldno]);
461: }
462: if (ferror(stdout))
463: err("stdout: %s", strerror(errno));
464: }
465:
466: /*
467: * Convert an output list argument "2.1, 1.3, 2.4" into an array of output
468: * fields.
469: */
470: void
471: fieldarg(option)
472: char *option;
473: {
474: u_long fieldno;
475: char *end, *token;
476:
477: while ((token = strsep(&option, " \t")) != NULL) {
478: if (*token == '\0')
479: continue;
480: if (token[0] != '1' && token[0] != '2' || token[1] != '.')
481: err("malformed -o option field");
482: fieldno = strtol(token + 2, &end, 10);
483: if (*end)
484: err("malformed -o option field");
485: if (fieldno == 0)
486: err("field numbers are 1 based");
487: if (olistcnt == olistalloc) {
488: olistalloc += 50;
489: if ((olist = realloc(olist,
490: olistalloc * sizeof(OLIST))) == NULL)
491: enomem();
492: }
493: olist[olistcnt].fileno = token[0] - '0';
494: olist[olistcnt].fieldno = fieldno - 1;
495: ++olistcnt;
496: }
497: }
498:
/*
 * Rewrite historic option syntax in argv, in place, so that getopt(3) can
 * parse it: a bare "-a", "-j1 arg"/"-j2 arg" and "-o arg arg ...".
 * Scanning stops at "--".  Only arguments that start with '-' are
 * examined; anything else (file names, option arguments) is left alone.
 */
void
obsolete(argv)
	char **argv;
{
	size_t len;
	char **p, *ap, *t;

	while ((ap = *++argv) != NULL) {
		/* Return if "--". */
		if (ap[0] == '-' && ap[1] == '-')
			return;
		/* Skip anything that is not an option (including ""). */
		if (ap[0] != '-')
			continue;
		switch (ap[1]) {
		case 'a':
			/*
			 * The original join allowed "-a", which meant the
			 * same as -a1 plus -a2.  POSIX 1003.2, Draft 11.2
			 * only specifies this as "-a 1" and "-a 2", so we
			 * have to use another option flag, one that is
			 * unlikely to ever be used or accidentally entered
			 * on the command line.  (Well, we could reallocate
			 * the argv array, but that hardly seems worthwhile.)
			 */
			if (ap[2] == '\0')
				ap[1] = '\01';
			break;
		case 'j':
			/*
			 * The original join allowed "-j[12] arg" and "-j arg".
			 * Convert the former to "-[12] arg".  Don't convert
			 * the latter since getopt(3) can handle it.
			 */
			switch(ap[2]) {
			case '1':
				if (ap[3] != '\0')
					goto jbad;
				ap[1] = '1';
				ap[2] = '\0';
				break;
			case '2':
				if (ap[3] != '\0')
					goto jbad;
				ap[1] = '2';
				ap[2] = '\0';
				break;
			case '\0':
				break;
			default:
jbad:				err("illegal option -- %s", ap);
				usage();
			}
			break;
		case 'o':
			/*
			 * The original join allowed "-o arg arg".  Convert to
			 * "-o arg -o arg".
			 */
			if (ap[2] != '\0')
				break;
			/*
			 * "-o" as the last argument: there is nothing to
			 * convert, and argv + 2 would lie beyond the
			 * terminating NULL.
			 */
			if (argv[1] == NULL)
				break;
			for (p = argv + 2; *p; ++p) {
				if ((p[0][0] != '1' && p[0][0] != '2') ||
				    p[0][1] != '.')
					break;
				len = strlen(*p);
				if (len - 2 != strspn(*p + 2, "0123456789"))
					break;
				/* "-o" + argument + terminating NUL. */
				if ((t = malloc(len + 3)) == NULL)
					enomem();
				t[0] = '-';
				t[1] = 'o';
				memcpy(t + 2, *p, len + 1);
				*p = t;
			}
			/* Resume scanning at the first unconverted argument. */
			argv = p - 1;
			break;
		}
	}
}
576:
577: void
578: enomem()
579: {
580: showusage = 0;
581: err("%s", strerror(errno));
582: }
583:
/* Print the usage message on stderr and exit with status 1. */
void
usage()
{
	char *head, *tail;

	head = "usage: join [-a fileno | -v fileno ] [-e string] [-1 field] ";
	tail = "[-2 field]\n [-o list] [-t char] file1 file2";
	(void)fprintf(stderr, "%s%s\n", head, tail);
	exit(1);
}
592:
593: #if __STDC__
594: #include <stdarg.h>
595: #else
596: #include <varargs.h>
597: #endif
598:
599: void
600: #if __STDC__
601: err(const char *fmt, ...)
602: #else
603: err(fmt, va_alist)
604: char *fmt;
605: va_dcl
606: #endif
607: {
608: va_list ap;
609: #if __STDC__
610: va_start(ap, fmt);
611: #else
612: va_start(ap);
613: #endif
614: (void)fprintf(stderr, "join: ");
615: (void)vfprintf(stderr, fmt, ap);
616: va_end(ap);
617: (void)fprintf(stderr, "\n");
618: if (showusage)
619: usage();
620: exit(1);
621: /* NOTREACHED */
622: }