Annotation of src/usr.bin/sort/fsort.c, Revision 1.14
1.14 ! naddy 1: /* $OpenBSD: fsort.c,v 1.13 2006/10/18 23:30:43 millert Exp $ */
1.1 millert 2:
3: /*-
4: * Copyright (c) 1993
5: * The Regents of the University of California. All rights reserved.
6: *
7: * This code is derived from software contributed to Berkeley by
8: * Peter McIlroy.
9: *
10: * Redistribution and use in source and binary forms, with or without
11: * modification, are permitted provided that the following conditions
12: * are met:
13: * 1. Redistributions of source code must retain the above copyright
14: * notice, this list of conditions and the following disclaimer.
15: * 2. Redistributions in binary form must reproduce the above copyright
16: * notice, this list of conditions and the following disclaimer in the
17: * documentation and/or other materials provided with the distribution.
1.9 millert 18: * 3. Neither the name of the University nor the names of its contributors
1.1 millert 19: * may be used to endorse or promote products derived from this software
20: * without specific prior written permission.
21: *
22: * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
23: * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
24: * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
25: * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
26: * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
27: * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
28: * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
29: * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
30: * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
31: * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
32: * SUCH DAMAGE.
33: */
34:
35: #ifndef lint
36: #if 0
37: static char sccsid[] = "@(#)fsort.c 8.1 (Berkeley) 6/6/93";
38: #else
1.14 ! naddy 39: static char rcsid[] = "$OpenBSD: fsort.c,v 1.13 2006/10/18 23:30:43 millert Exp $";
1.1 millert 40: #endif
41: #endif /* not lint */
42:
43: /*
44: * Read in the next bin. If it fits in one segment sort it;
45: * otherwise refine it by segment deeper by one character,
46: * and try again on smaller bins. Sort the final bin at this level
47: * of recursion to keep the head of fstack at 0.
48: * After PANIC passes, abort to merge sort.
1.4 millert 49: */
1.1 millert 50: #include "sort.h"
51: #include "fsort.h"
52:
53: #include <stdlib.h>
54: #include <string.h>
55:
1.14 ! naddy 56: u_char **keylist = 0, *buffer = 0, *linebuf = 0;
! 57: size_t bufsize, linebuf_size;
1.1 millert 58: struct tempfile fstack[MAXFCT];
1.12 deraadt 59: extern char toutpath[];
1.1 millert 60: #define FSORTMAX 4
61: int PANIC = FSORTMAX;
62:
63: void
1.11 deraadt 64: fsort(int binno, int depth, union f_handle infiles, int nfiles, FILE *outfp,
65: struct field *ftbl)
1.1 millert 66: {
1.14 ! naddy 67: u_char *bufend, **keypos, *tmpbuf;
! 68: u_char *weights;
1.1 millert 69: int ntfiles, mfct = 0, total, i, maxb, lastb, panic = 0;
1.5 millert 70: int c, nelem;
1.8 ericj 71: long sizes[NBINS+1];
1.1 millert 72: union f_handle tfiles, mstart = {MAXFCT-16};
1.8 ericj 73: int (*get)(int, union f_handle, int, RECHEADER *,
1.1 millert 74: u_char *, struct field *);
1.8 ericj 75: RECHEADER *crec;
1.1 millert 76: struct field tfield[2];
77: FILE *prevfp, *tailfp[FSORTMAX+1];
78:
79: memset(tailfp, 0, sizeof(tailfp));
80: prevfp = outfp;
81: memset(tfield, 0, sizeof(tfield));
82: if (ftbl[0].flags & R)
83: tfield[0].weights = Rascii;
84: else
85: tfield[0].weights = ascii;
86: tfield[0].icol.num = 1;
87: weights = ftbl[0].weights;
1.14 ! naddy 88: if (!buffer) {
! 89: bufsize = BUFSIZE;
! 90: if ((buffer = malloc(bufsize + 1)) == NULL ||
! 91: (keylist = calloc(MAXNUM, sizeof(u_char *))) == NULL)
! 92: errx(2, "cannot allocate memory");
! 93: if (!SINGL_FLD) {
! 94: linebuf_size = MAXLLEN;
! 95: if ((linebuf = malloc(linebuf_size)) == NULL)
! 96: errx(2, "cannot allocate memory");
! 97: }
! 98: }
1.13 millert 99: bufend = buffer + bufsize;
1.1 millert 100: if (binno >= 0) {
101: tfiles.top = infiles.top + nfiles;
102: get = getnext;
103: } else {
104: tfiles.top = 0;
105: if (SINGL_FLD)
106: get = makeline;
107: else
108: get = makekey;
109: }
110: for (;;) {
111: memset(sizes, 0, sizeof(sizes));
112: c = ntfiles = 0;
113: if (binno == weights[REC_D] &&
114: !(SINGL_FLD && ftbl[0].flags & F)) { /* pop */
115: rd_append(weights[REC_D],
116: infiles, nfiles, prevfp, buffer, bufend);
117: break;
118: } else if (binno == weights[REC_D]) {
119: depth = 0; /* start over on flat weights */
120: ftbl = tfield;
121: weights = ftbl[0].weights;
122: }
123: while (c != EOF) {
124: keypos = keylist;
125: nelem = 0;
126: crec = (RECHEADER *) buffer;
127: while((c = get(binno, infiles, nfiles, crec, bufend,
128: ftbl)) == 0) {
129: *keypos++ = crec->data + depth;
130: if (++nelem == MAXNUM) {
131: c = BUFFEND;
132: break;
133: }
134: crec =(RECHEADER *) ((char *) crec +
135: SALIGN(crec->length) + sizeof(TRECHEADER));
136: }
1.8 ericj 137: /*
138: * buffer was too small for data, allocate
139: * a bigger buffer.
140: */
141: if (c == BUFFEND && nelem == 0) {
142: bufsize *= 2;
1.14 ! naddy 143: buffer = realloc(buffer, bufsize);
! 144: if (!buffer)
1.8 ericj 145: err(2, "failed to realloc buffer");
146: bufend = buffer + bufsize;
147: continue;
148: }
1.1 millert 149: if (c == BUFFEND || ntfiles || mfct) { /* push */
150: if (panic >= PANIC) {
151: fstack[MAXFCT-16+mfct].fp = ftmp();
152: if (radixsort((const u_char **)keylist,
153: nelem, weights, REC_D))
154: err(2, NULL);
155: append(keylist, nelem, depth, fstack[
156: MAXFCT-16+mfct].fp, putrec, ftbl);
157: mfct++;
158: /* reduce number of open files */
159: if (mfct == 16 ||(c == EOF && ntfiles)) {
1.14 ! naddy 160: /*
! 161: * Only copy extra incomplete
! 162: * crec data if there is any.
! 163: */
! 164: int nodata = (bufend
! 165: >= (u_char *)crec
! 166: && bufend <= crec->data);
! 167: size_t sz = 0;
! 168:
! 169: if (!nodata) {
! 170: sz = bufend
! 171: - crec->data;
! 172: tmpbuf = malloc(sz);
! 173: if (tmpbuf == NULL)
! 174: errx(2, "cannot"
! 175: " allocate"
! 176: " memory");
! 177: memmove(tmpbuf,
! 178: crec->data, sz);
! 179: }
! 180:
1.1 millert 181: fstack[tfiles.top + ntfiles].fp
182: = ftmp();
183: fmerge(0, mstart, mfct, geteasy,
184: fstack[tfiles.top+ntfiles].fp,
185: putrec, ftbl);
1.4 millert 186: ntfiles++;
1.1 millert 187: mfct = 0;
1.14 ! naddy 188:
! 189: if (!nodata) {
! 190: memmove(crec->data,
! 191: tmpbuf, sz);
! 192: free(tmpbuf);
! 193: }
1.1 millert 194: }
195: } else {
196: fstack[tfiles.top + ntfiles].fp= ftmp();
197: onepass(keylist, depth, nelem, sizes,
198: weights, fstack[tfiles.top+ntfiles].fp);
1.4 millert 199: ntfiles++;
1.1 millert 200: }
201: }
202: }
203: get = getnext;
204: if (!ntfiles && !mfct) { /* everything in memory--pop */
205: if (nelem > 1 && radixsort((const u_char **)keylist,
206: nelem, weights, REC_D))
207: err(2, NULL);
208: append(keylist, nelem, depth, outfp, putline, ftbl);
209: break; /* pop */
210: }
211: if (panic >= PANIC) {
212: if (!ntfiles)
213: fmerge(0, mstart, mfct, geteasy,
214: outfp, putline, ftbl);
215: else
216: fmerge(0, tfiles, ntfiles, geteasy,
217: outfp, putline, ftbl);
218: break;
219:
220: }
221: total = maxb = lastb = 0; /* find if one bin dominates */
222: for (i = 0; i < NBINS; i++)
223: if (sizes[i]) {
224: if (sizes[i] > sizes[maxb])
225: maxb = i;
226: lastb = i;
227: total += sizes[i];
228: }
229: if (sizes[maxb] < max((total / 2) , BUFSIZE))
230: maxb = lastb; /* otherwise pop after last bin */
231: fstack[tfiles.top].lastb = lastb;
232: fstack[tfiles.top].maxb = maxb;
233:
234: /* start refining next level. */
235: get(-1, tfiles, ntfiles, crec, bufend, 0); /* rewind */
236: for (i = 0; i < maxb; i++) {
237: if (!sizes[i]) /* bin empty; step ahead file offset */
238: get(i, tfiles, ntfiles, crec, bufend, 0);
239: else
240: fsort(i, depth+1, tfiles, ntfiles, outfp, ftbl);
241: }
242: if (lastb != maxb) {
243: if (prevfp != outfp)
244: tailfp[panic] = prevfp;
245: prevfp = ftmp();
246: for (i = maxb+1; i <= lastb; i++)
247: if (!sizes[i])
248: get(i, tfiles, ntfiles, crec, bufend,0);
249: else
250: fsort(i, depth+1, tfiles, ntfiles,
251: prevfp, ftbl);
252: }
253:
254: /* sort biggest (or last) bin at this level */
255: depth++;
256: panic++;
257: binno = maxb;
258: infiles.top = tfiles.top; /* getnext will free tfiles, */
259: nfiles = ntfiles; /* so overwrite them */
260: }
261: if (prevfp != outfp) {
262: concat(outfp, prevfp);
263: fclose(prevfp);
264: }
265: for (i = panic; i >= 0; --i)
266: if (tailfp[i]) {
267: concat(outfp, tailfp[i]);
268: fclose(tailfp[i]);
269: }
270: }
271:
272: /*
1.4 millert 273: * This is one pass of radix exchange, dumping the bins to disk.
1.1 millert 274: */
275: #define swap(a, b, t) t = a, a = b, b = t
276: void
1.11 deraadt 277: onepass(u_char **a, int depth, long n, long sizes[], u_char *tr, FILE *fp)
1.1 millert 278: {
1.8 ericj 279: size_t tsizes[NBINS+1];
1.1 millert 280: u_char **bin[257], **top[256], ***bp, ***bpmax, ***tp;
1.7 millert 281: static int histo[256];
1.1 millert 282: int *hp;
1.8 ericj 283: int c;
1.1 millert 284: u_char **an, *t, **aj;
1.8 ericj 285: u_char **ak, *r;
1.1 millert 286:
287: memset(tsizes, 0, sizeof(tsizes));
288: depth += sizeof(TRECHEADER);
1.8 ericj 289: an = &a[n];
1.1 millert 290: for (ak = a; ak < an; ak++) {
291: histo[c = tr[**ak]]++;
292: tsizes[c] += ((RECHEADER *) (*ak -= depth))->length;
293: }
294:
295: bin[0] = a;
296: bpmax = bin + 256;
297: tp = top, hp = histo;
298: for (bp = bin; bp < bpmax; bp++) {
299: *tp++ = *(bp+1) = *bp + (c = *hp);
300: *hp++ = 0;
301: if (c <= 1)
302: continue;
303: }
1.4 millert 304: for (aj = a; aj < an; *aj = r, aj = bin[c+1])
1.1 millert 305: for(r = *aj; aj < (ak = --top[c = tr[r[depth]]]) ;)
306: swap(*ak, r, t);
307:
308: for (ak = a, c = 0; c < 256; c++) {
309: an = bin[c+1];
310: n = an - ak;
311: tsizes[c] += n * sizeof(TRECHEADER);
312: /* tell getnext how many elements in this bin, this segment. */
1.8 ericj 313: EWRITE(&tsizes[c], sizeof(size_t), 1, fp);
1.1 millert 314: sizes[c] += tsizes[c];
315: for (; ak < an; ++ak)
316: putrec((RECHEADER *) *ak, fp);
317: }
318: }