Annotation of src/usr.bin/sort/fsort.c, Revision 1.10
1.10 ! sturm 1: /* $OpenBSD: fsort.c,v 1.9 2003/06/03 02:56:16 millert Exp $ */
1.1 millert 2:
3: /*-
4: * Copyright (c) 1993
5: * The Regents of the University of California. All rights reserved.
6: *
7: * This code is derived from software contributed to Berkeley by
8: * Peter McIlroy.
9: *
10: * Redistribution and use in source and binary forms, with or without
11: * modification, are permitted provided that the following conditions
12: * are met:
13: * 1. Redistributions of source code must retain the above copyright
14: * notice, this list of conditions and the following disclaimer.
15: * 2. Redistributions in binary form must reproduce the above copyright
16: * notice, this list of conditions and the following disclaimer in the
17: * documentation and/or other materials provided with the distribution.
1.9 millert 18: * 3. Neither the name of the University nor the names of its contributors
1.1 millert 19: * may be used to endorse or promote products derived from this software
20: * without specific prior written permission.
21: *
22: * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
23: * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
24: * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
25: * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
26: * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
27: * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
28: * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
29: * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
30: * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
31: * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
32: * SUCH DAMAGE.
33: */
34:
35: #ifndef lint
36: #if 0
37: static char sccsid[] = "@(#)fsort.c 8.1 (Berkeley) 6/6/93";
38: #else
1.10 ! sturm 39: static char rcsid[] = "$OpenBSD: fsort.c,v 1.9 2003/06/03 02:56:16 millert Exp $";
1.1 millert 40: #endif
41: #endif /* not lint */
42:
43: /*
44: * Read in the next bin. If it fits in one segment sort it;
45: * otherwise refine it by segment deeper by one character,
46: * and try again on smaller bins. Sort the final bin at this level
47: * of recursion to keep the head of fstack at 0.
48: * After PANIC passes, abort to merge sort.
1.4 millert 49: */
1.1 millert 50: #include "sort.h"
51: #include "fsort.h"
52:
53: #include <stdlib.h>
54: #include <string.h>
55:
56: u_char **keylist = 0, *buffer = 0, *linebuf = 0;
1.8 ericj 57: size_t bufsize, linebuf_size;
1.1 millert 58: struct tempfile fstack[MAXFCT];
59: extern char *toutpath;
60: #define FSORTMAX 4
61: int PANIC = FSORTMAX;
62:
63: void
64: fsort(binno, depth, infiles, nfiles, outfp, ftbl)
1.8 ericj 65: int binno, depth;
66: union f_handle infiles;
67: int nfiles;
1.1 millert 68: FILE *outfp;
1.8 ericj 69: struct field *ftbl;
1.1 millert 70: {
1.8 ericj 71: u_char *bufend, **keypos, *tmpbuf;
1.1 millert 72: u_char *weights;
73: int ntfiles, mfct = 0, total, i, maxb, lastb, panic = 0;
1.5 millert 74: int c, nelem;
1.8 ericj 75: long sizes[NBINS+1];
1.1 millert 76: union f_handle tfiles, mstart = {MAXFCT-16};
1.8 ericj 77: int (*get)(int, union f_handle, int, RECHEADER *,
1.1 millert 78: u_char *, struct field *);
1.8 ericj 79: RECHEADER *crec;
1.1 millert 80: struct field tfield[2];
81: FILE *prevfp, *tailfp[FSORTMAX+1];
82:
83: memset(tailfp, 0, sizeof(tailfp));
84: prevfp = outfp;
85: memset(tfield, 0, sizeof(tfield));
86: if (ftbl[0].flags & R)
87: tfield[0].weights = Rascii;
88: else
89: tfield[0].weights = ascii;
90: tfield[0].icol.num = 1;
91: weights = ftbl[0].weights;
92: if (!buffer) {
1.8 ericj 93: bufsize = BUFSIZE;
94: if ((buffer = malloc(bufsize + 1)) == NULL ||
1.7 millert 95: (keylist = calloc(MAXNUM, sizeof(u_char *))) == NULL)
96: errx(2, "cannot allocate memory");
97: if (!SINGL_FLD) {
1.8 ericj 98: linebuf_size = MAXLLEN;
99: if ((linebuf = malloc(linebuf_size)) == NULL)
1.7 millert 100: errx(2, "cannot allocate memory");
101: }
1.1 millert 102: }
1.8 ericj 103: bufend = buffer + bufsize;
1.1 millert 104: if (binno >= 0) {
105: tfiles.top = infiles.top + nfiles;
106: get = getnext;
107: } else {
108: tfiles.top = 0;
109: if (SINGL_FLD)
110: get = makeline;
111: else
112: get = makekey;
113: }
114: for (;;) {
115: memset(sizes, 0, sizeof(sizes));
116: c = ntfiles = 0;
117: if (binno == weights[REC_D] &&
118: !(SINGL_FLD && ftbl[0].flags & F)) { /* pop */
119: rd_append(weights[REC_D],
120: infiles, nfiles, prevfp, buffer, bufend);
121: break;
122: } else if (binno == weights[REC_D]) {
123: depth = 0; /* start over on flat weights */
124: ftbl = tfield;
125: weights = ftbl[0].weights;
126: }
127: while (c != EOF) {
128: keypos = keylist;
129: nelem = 0;
130: crec = (RECHEADER *) buffer;
131: while((c = get(binno, infiles, nfiles, crec, bufend,
132: ftbl)) == 0) {
133: *keypos++ = crec->data + depth;
134: if (++nelem == MAXNUM) {
135: c = BUFFEND;
136: break;
137: }
138: crec =(RECHEADER *) ((char *) crec +
139: SALIGN(crec->length) + sizeof(TRECHEADER));
140: }
1.8 ericj 141: /*
142: * buffer was too small for data, allocate
143: * a bigger buffer.
144: */
145: if (c == BUFFEND && nelem == 0) {
146: bufsize *= 2;
147: buffer = realloc(buffer, bufsize);
148: if (!buffer)
149: err(2, "failed to realloc buffer");
150: bufend = buffer + bufsize;
151: continue;
152: }
1.1 millert 153: if (c == BUFFEND || ntfiles || mfct) { /* push */
154: if (panic >= PANIC) {
155: fstack[MAXFCT-16+mfct].fp = ftmp();
156: if (radixsort((const u_char **)keylist,
157: nelem, weights, REC_D))
158: err(2, NULL);
159: append(keylist, nelem, depth, fstack[
160: MAXFCT-16+mfct].fp, putrec, ftbl);
161: mfct++;
162: /* reduce number of open files */
163: if (mfct == 16 ||(c == EOF && ntfiles)) {
1.10 ! sturm 164: /*
! 165: * Only copy extra incomplete
! 166: * crec data if there is any.
! 167: */
! 168: int nodata = (bufend
! 169: >= (u_char *)crec
! 170: && bufend <= crec->data);
! 171: size_t sz = 0;
! 172:
! 173: if (!nodata) {
! 174: sz = bufend
! 175: - crec->data;
! 176: tmpbuf = malloc(sz);
! 177: if (tmpbuf == NULL)
! 178: errx(2, "cannot"
! 179: " allocate"
! 180: " memory");
! 181: memmove(tmpbuf,
! 182: crec->data, sz);
! 183: }
! 184:
1.1 millert 185: fstack[tfiles.top + ntfiles].fp
186: = ftmp();
187: fmerge(0, mstart, mfct, geteasy,
188: fstack[tfiles.top+ntfiles].fp,
189: putrec, ftbl);
1.4 millert 190: ntfiles++;
1.1 millert 191: mfct = 0;
1.10 ! sturm 192:
! 193: if (!nodata) {
! 194: memmove(crec->data,
! 195: tmpbuf, sz);
! 196: free(tmpbuf);
! 197: }
1.1 millert 198: }
199: } else {
200: fstack[tfiles.top + ntfiles].fp= ftmp();
201: onepass(keylist, depth, nelem, sizes,
202: weights, fstack[tfiles.top+ntfiles].fp);
1.4 millert 203: ntfiles++;
1.1 millert 204: }
205: }
206: }
207: get = getnext;
208: if (!ntfiles && !mfct) { /* everything in memory--pop */
209: if (nelem > 1 && radixsort((const u_char **)keylist,
210: nelem, weights, REC_D))
211: err(2, NULL);
212: append(keylist, nelem, depth, outfp, putline, ftbl);
213: break; /* pop */
214: }
215: if (panic >= PANIC) {
216: if (!ntfiles)
217: fmerge(0, mstart, mfct, geteasy,
218: outfp, putline, ftbl);
219: else
220: fmerge(0, tfiles, ntfiles, geteasy,
221: outfp, putline, ftbl);
222: break;
223:
224: }
225: total = maxb = lastb = 0; /* find if one bin dominates */
226: for (i = 0; i < NBINS; i++)
227: if (sizes[i]) {
228: if (sizes[i] > sizes[maxb])
229: maxb = i;
230: lastb = i;
231: total += sizes[i];
232: }
233: if (sizes[maxb] < max((total / 2) , BUFSIZE))
234: maxb = lastb; /* otherwise pop after last bin */
235: fstack[tfiles.top].lastb = lastb;
236: fstack[tfiles.top].maxb = maxb;
237:
238: /* start refining next level. */
239: get(-1, tfiles, ntfiles, crec, bufend, 0); /* rewind */
240: for (i = 0; i < maxb; i++) {
241: if (!sizes[i]) /* bin empty; step ahead file offset */
242: get(i, tfiles, ntfiles, crec, bufend, 0);
243: else
244: fsort(i, depth+1, tfiles, ntfiles, outfp, ftbl);
245: }
246: if (lastb != maxb) {
247: if (prevfp != outfp)
248: tailfp[panic] = prevfp;
249: prevfp = ftmp();
250: for (i = maxb+1; i <= lastb; i++)
251: if (!sizes[i])
252: get(i, tfiles, ntfiles, crec, bufend,0);
253: else
254: fsort(i, depth+1, tfiles, ntfiles,
255: prevfp, ftbl);
256: }
257:
258: /* sort biggest (or last) bin at this level */
259: depth++;
260: panic++;
261: binno = maxb;
262: infiles.top = tfiles.top; /* getnext will free tfiles, */
263: nfiles = ntfiles; /* so overwrite them */
264: }
265: if (prevfp != outfp) {
266: concat(outfp, prevfp);
267: fclose(prevfp);
268: }
269: for (i = panic; i >= 0; --i)
270: if (tailfp[i]) {
271: concat(outfp, tailfp[i]);
272: fclose(tailfp[i]);
273: }
274: }
275:
276: /*
1.4 millert 277: * This is one pass of radix exchange, dumping the bins to disk.
1.1 millert 278: */
279: #define swap(a, b, t) t = a, a = b, b = t
280: void
281: onepass(a, depth, n, sizes, tr, fp)
282: u_char **a;
283: int depth;
1.8 ericj 284: long n;
285: long sizes[];
1.1 millert 286: u_char *tr;
287: FILE *fp;
288: {
1.8 ericj 289: size_t tsizes[NBINS+1];
1.1 millert 290: u_char **bin[257], **top[256], ***bp, ***bpmax, ***tp;
1.7 millert 291: static int histo[256];
1.1 millert 292: int *hp;
1.8 ericj 293: int c;
1.1 millert 294: u_char **an, *t, **aj;
1.8 ericj 295: u_char **ak, *r;
1.1 millert 296:
297: memset(tsizes, 0, sizeof(tsizes));
298: depth += sizeof(TRECHEADER);
1.8 ericj 299: an = &a[n];
1.1 millert 300: for (ak = a; ak < an; ak++) {
301: histo[c = tr[**ak]]++;
302: tsizes[c] += ((RECHEADER *) (*ak -= depth))->length;
303: }
304:
305: bin[0] = a;
306: bpmax = bin + 256;
307: tp = top, hp = histo;
308: for (bp = bin; bp < bpmax; bp++) {
309: *tp++ = *(bp+1) = *bp + (c = *hp);
310: *hp++ = 0;
311: if (c <= 1)
312: continue;
313: }
1.4 millert 314: for (aj = a; aj < an; *aj = r, aj = bin[c+1])
1.1 millert 315: for(r = *aj; aj < (ak = --top[c = tr[r[depth]]]) ;)
316: swap(*ak, r, t);
317:
318: for (ak = a, c = 0; c < 256; c++) {
319: an = bin[c+1];
320: n = an - ak;
321: tsizes[c] += n * sizeof(TRECHEADER);
322: /* tell getnext how many elements in this bin, this segment. */
1.8 ericj 323: EWRITE(&tsizes[c], sizeof(size_t), 1, fp);
1.1 millert 324: sizes[c] += tsizes[c];
325: for (; ak < an; ++ak)
326: putrec((RECHEADER *) *ak, fp);
327: }
328: }