Annotation of src/usr.bin/sort/file.c, Revision 1.3
1.3 ! millert 1: /* $OpenBSD: file.c,v 1.2 2015/03/17 17:49:27 millert Exp $ */
1.1 millert 2:
3: /*-
4: * Copyright (C) 2009 Gabor Kovesdan <gabor@FreeBSD.org>
5: * Copyright (C) 2012 Oleg Moskalenko <mom040267@gmail.com>
6: * All rights reserved.
7: *
8: * Redistribution and use in source and binary forms, with or without
9: * modification, are permitted provided that the following conditions
10: * are met:
11: * 1. Redistributions of source code must retain the above copyright
12: * notice, this list of conditions and the following disclaimer.
13: * 2. Redistributions in binary form must reproduce the above copyright
14: * notice, this list of conditions and the following disclaimer in the
15: * documentation and/or other materials provided with the distribution.
16: *
17: * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
18: * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
19: * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
20: * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
21: * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
22: * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
23: * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
24: * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
25: * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
26: * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
27: * SUCH DAMAGE.
28: */
29:
30: #include <sys/mman.h>
31: #include <sys/stat.h>
32: #include <sys/types.h>
33: #include <sys/queue.h>
34:
35: #include <err.h>
36: #include <fcntl.h>
37: #include <stdio.h>
38: #include <stdlib.h>
39: #include <string.h>
40: #include <unistd.h>
41: #include <wchar.h>
42: #include <wctype.h>
43:
44: #include "coll.h"
45: #include "file.h"
46: #include "radixsort.h"
47:
48: unsigned long long free_memory = 1000000;
49: unsigned long long available_free_memory = 1000000;
50:
51: bool use_mmap;
52:
53: const char *tmpdir = "/var/tmp";
54: const char *compress_program;
55:
56: size_t max_open_files = 16;
57:
58: /*
59: * How much space we read from file at once
60: */
61: #define READ_CHUNK 4096
62:
63: /*
64: * File reader structure
65: */
66: struct file_reader {
67: struct reader_buffer rb;
68: FILE *file;
69: char *fname;
70: unsigned char *buffer;
71: unsigned char *mmapaddr;
72: unsigned char *mmapptr;
73: size_t bsz;
74: size_t cbsz;
75: size_t mmapsize;
76: size_t strbeg;
77: int fd;
78: char elsymb;
79: };
80:
81: /*
82: * Structure to be used in file merge process.
83: */
84: struct file_header {
85: struct file_reader *fr;
86: struct sort_list_item *si; /* current top line */
87: size_t file_pos;
88: };
89:
90: /*
91: * List elements of "cleanable" files list.
92: */
93: struct CLEANABLE_FILE {
94: char *fn;
95: LIST_ENTRY(CLEANABLE_FILE) files;
96: };
97:
98: /*
99: * List header of "cleanable" files list.
100: */
101: static LIST_HEAD(CLEANABLE_FILES, CLEANABLE_FILE) tmp_files;
102:
103: /*
104: * Init tmp files list
105: */
106: void
107: init_tmp_files(void)
108: {
109:
110: LIST_INIT(&tmp_files);
111: }
112:
113: /*
114: * Save name of a tmp file for signal cleanup
115: */
116: void
117: tmp_file_atexit(const char *tmp_file)
118: {
119:
120: if (tmp_file) {
121: struct CLEANABLE_FILE *item =
122: sort_malloc(sizeof(struct CLEANABLE_FILE));
123: item->fn = sort_strdup(tmp_file);
124: LIST_INSERT_HEAD(&tmp_files, item, files);
125: }
126: }
127:
128: /*
129: * Clear tmp files
130: */
131: void
132: clear_tmp_files(void)
133: {
134: struct CLEANABLE_FILE *item;
135:
136: LIST_FOREACH(item, &tmp_files, files) {
137: if (item != NULL && item->fn != NULL)
138: unlink(item->fn);
139: }
140: }
141:
142: /*
143: * Check whether a file is a temporary file
144: */
145: static bool
146: file_is_tmp(const char *fn)
147: {
148: struct CLEANABLE_FILE *item;
149: bool ret = false;
150:
151: if (fn) {
152: LIST_FOREACH(item, &tmp_files, files) {
153: if (item != NULL && item->fn != NULL)
154: if (strcmp(item->fn, fn) == 0) {
155: ret = true;
156: break;
157: }
158: }
159: }
160:
161: return ret;
162: }
163:
164: /*
165: * Generate new temporary file name
166: */
167: char *
168: new_tmp_file_name(void)
169: {
170: static size_t tfcounter = 0;
171: static const char *fn = ".bsdsort.";
172: char *ret;
173: int len;
174:
175: len = asprintf(&ret, "%s/%s%d.%lu", tmpdir, fn, (int)getpid(),
176: (unsigned long)(tfcounter++));
177: if (len == -1)
178: err(2, NULL);
179: tmp_file_atexit(ret);
180: return ret;
181: }
182:
183: /*
184: * Initialize file list
185: */
186: void
187: file_list_init(struct file_list *fl, bool tmp)
188: {
189:
190: if (fl) {
191: fl->count = 0;
192: fl->sz = 0;
193: fl->fns = NULL;
194: fl->tmp = tmp;
195: }
196: }
197:
198: /*
199: * Add a file name to the list
200: */
201: void
202: file_list_add(struct file_list *fl, char *fn, bool allocate)
203: {
204:
205: if (fl && fn) {
206: if (fl->count >= fl->sz || fl->fns == NULL) {
207: fl->sz = fl->sz * 2 + 1;
208: fl->fns = sort_reallocarray(fl->fns, fl->sz,
209: sizeof(char *));
210: }
211: fl->fns[fl->count] = allocate ? sort_strdup(fn) : fn;
212: fl->count += 1;
213: }
214: }
215:
216: /*
217: * Populate file list from array of file names
218: */
219: void
220: file_list_populate(struct file_list *fl, int argc, char **argv, bool allocate)
221: {
222:
223: if (fl && argv) {
224: int i;
225:
226: for (i = 0; i < argc; i++)
227: file_list_add(fl, argv[i], allocate);
228: }
229: }
230:
231: /*
232: * Clean file list data and delete the files,
233: * if this is a list of temporary files
234: */
235: void
236: file_list_clean(struct file_list *fl)
237: {
238:
239: if (fl) {
240: if (fl->fns) {
241: size_t i;
242:
243: for (i = 0; i < fl->count; i++) {
244: if (fl->fns[i]) {
245: if (fl->tmp)
246: unlink(fl->fns[i]);
247: sort_free(fl->fns[i]);
248: fl->fns[i] = 0;
249: }
250: }
251: sort_free(fl->fns);
252: fl->fns = NULL;
253: }
254: fl->sz = 0;
255: fl->count = 0;
256: fl->tmp = false;
257: }
258: }
259:
260: /*
261: * Init sort list
262: */
263: void
264: sort_list_init(struct sort_list *l)
265: {
266:
267: if (l) {
268: l->count = 0;
269: l->size = 0;
270: l->memsize = sizeof(struct sort_list);
271: l->list = NULL;
272: }
273: }
274:
275: /*
276: * Add string to sort list
277: */
278: void
279: sort_list_add(struct sort_list *l, struct bwstring *str)
280: {
281:
282: if (l && str) {
283: size_t indx = l->count;
284:
285: if ((l->list == NULL) || (indx >= l->size)) {
286: size_t newsize = (l->size + 1) + 1024;
287:
288: l->list = sort_reallocarray(l->list, newsize,
289: sizeof(struct sort_list_item *));
290: l->memsize += (newsize - l->size) *
291: sizeof(struct sort_list_item *);
292: l->size = newsize;
293: }
294: l->list[indx] = sort_list_item_alloc();
295: sort_list_item_set(l->list[indx], str);
296: l->memsize += sort_list_item_size(l->list[indx]);
297: l->count += 1;
298: }
299: }
300:
301: /*
302: * Clean sort list data
303: */
304: void
305: sort_list_clean(struct sort_list *l)
306: {
307:
308: if (l) {
309: if (l->list) {
310: size_t i;
311:
312: for (i = 0; i < l->count; i++) {
313: struct sort_list_item *item;
314:
315: item = l->list[i];
316:
317: if (item) {
318: sort_list_item_clean(item);
319: sort_free(item);
320: l->list[i] = NULL;
321: }
322: }
323: sort_free(l->list);
324: l->list = NULL;
325: }
326: l->count = 0;
327: l->size = 0;
328: l->memsize = sizeof(struct sort_list);
329: }
330: }
331:
332: /*
333: * Write sort list to file
334: */
335: void
336: sort_list_dump(struct sort_list *l, const char *fn)
337: {
338:
339: if (l && fn) {
340: FILE *f;
341:
342: f = openfile(fn, "w");
343: if (f == NULL)
344: err(2, "%s", fn);
345:
346: if (l->list) {
347: size_t i;
348: if (!sort_opts_vals.uflag) {
349: for (i = 0; i < l->count; ++i)
350: bwsfwrite(l->list[i]->str, f,
351: sort_opts_vals.zflag);
352: } else {
353: struct sort_list_item *last_printed_item = NULL;
354: struct sort_list_item *item;
355: for (i = 0; i < l->count; ++i) {
356: item = l->list[i];
357: if ((last_printed_item == NULL) ||
358: list_coll(&last_printed_item, &item)) {
359: bwsfwrite(item->str, f, sort_opts_vals.zflag);
360: last_printed_item = item;
361: }
362: }
363: }
364: }
365:
366: closefile(f, fn);
367: }
368: }
369:
370: /*
371: * Checks if the given file is sorted. Stops at the first disorder,
372: * prints the disordered line and returns 1.
373: */
374: int
375: check(const char *fn)
376: {
377: struct bwstring *s1, *s2, *s1disorder, *s2disorder;
378: struct file_reader *fr;
379: struct keys_array *ka1, *ka2;
380: int res;
381: size_t pos, posdisorder;
382:
383: s1 = s2 = s1disorder = s2disorder = NULL;
384: ka1 = ka2 = NULL;
385:
386: fr = file_reader_init(fn);
387:
388: res = 0;
389: pos = 1;
390: posdisorder = 1;
391:
392: if (fr == NULL) {
393: err(2, "%s", fn);
394: goto end;
395: }
396:
397: s1 = file_reader_readline(fr);
398: if (s1 == NULL)
399: goto end;
400:
401: ka1 = keys_array_alloc();
402: preproc(s1, ka1);
403:
404: s2 = file_reader_readline(fr);
405: if (s2 == NULL)
406: goto end;
407:
408: ka2 = keys_array_alloc();
409: preproc(s2, ka2);
410:
411: for (;;) {
412:
413: if (debug_sort) {
414: bwsprintf(stdout, s2, "s1=<", ">");
415: bwsprintf(stdout, s1, "s2=<", ">");
416: }
417: int cmp = key_coll(ka2, ka1, 0);
418: if (debug_sort)
419: printf("; cmp1=%d", cmp);
420:
421: if (!cmp && sort_opts_vals.complex_sort &&
422: !(sort_opts_vals.uflag) && !(sort_opts_vals.sflag)) {
423: cmp = top_level_str_coll(s2, s1);
424: if (debug_sort)
425: printf("; cmp2=%d", cmp);
426: }
427: if (debug_sort)
428: printf("\n");
429:
430: if ((sort_opts_vals.uflag && (cmp <= 0)) || (cmp < 0)) {
431: if (!(sort_opts_vals.csilentflag)) {
432: s2disorder = bwsdup(s2);
433: posdisorder = pos;
434: if (debug_sort)
435: s1disorder = bwsdup(s1);
436: }
437: res = 1;
438: goto end;
439: }
440:
441: pos++;
442:
443: clean_keys_array(s1, ka1);
444: sort_free(ka1);
445: ka1 = ka2;
446: ka2 = NULL;
447:
448: bwsfree(s1);
449: s1 = s2;
450:
451: s2 = file_reader_readline(fr);
452: if (s2 == NULL)
453: goto end;
454:
455: ka2 = keys_array_alloc();
456: preproc(s2, ka2);
457: }
458:
459: end:
460: if (ka1) {
461: clean_keys_array(s1, ka1);
462: sort_free(ka1);
463: }
464:
465: if (s1)
466: bwsfree(s1);
467:
468: if (ka2) {
469: clean_keys_array(s2, ka2);
470: sort_free(ka2);
471: }
472:
473: if (s2)
474: bwsfree(s2);
475:
476: if (fn == NULL || *fn == 0 || strcmp(fn, "-") == 0) {
477: for (;;) {
478: s2 = file_reader_readline(fr);
479: if (s2 == NULL)
480: break;
481: bwsfree(s2);
482: }
483: }
484:
485: file_reader_free(fr);
486:
487: if (s2disorder) {
488: bws_disorder_warnx(s2disorder, fn, posdisorder);
489: if (s1disorder) {
490: bws_disorder_warnx(s1disorder, fn, posdisorder);
491: if (s1disorder != s2disorder)
492: bwsfree(s1disorder);
493: }
494: bwsfree(s2disorder);
495: s1disorder = NULL;
496: s2disorder = NULL;
497: }
498:
499: if (res)
500: exit(res);
501:
502: return 0;
503: }
504:
505: /*
506: * Opens a file. If the given filename is "-", stdout will be
507: * opened.
508: */
509: FILE *
510: openfile(const char *fn, const char *mode)
511: {
512: FILE *file;
513:
514: if (strcmp(fn, "-") == 0) {
515: return (mode && mode[0] == 'r') ? stdin : stdout;
516: } else {
517: mode_t orig_file_mask = 0;
518: int is_tmp = file_is_tmp(fn);
519:
520: if (is_tmp && (mode[0] == 'w'))
521: orig_file_mask = umask(S_IWGRP | S_IWOTH |
522: S_IRGRP | S_IROTH);
523:
524: if (is_tmp && (compress_program != NULL)) {
525: char *cmd;
526: int len;
527:
528: fflush(stdout);
529:
530: if (mode[0] == 'r')
531: len = asprintf(&cmd, "cat %s | %s -d",
532: fn, compress_program);
533: else if (mode[0] == 'w')
534: len = asprintf(&cmd, "%s > %s",
535: compress_program, fn);
536: else
537: err(2, "Wrong file mode");
538:
539: if (len == -1)
540: err(2, NULL);
541:
542: if ((file = popen(cmd, mode)) == NULL)
543: err(2, NULL);
544:
545: sort_free(cmd);
546:
547: } else if ((file = fopen(fn, mode)) == NULL)
548: err(2, "%s", fn);
549:
550: if (is_tmp && (mode[0] == 'w'))
551: umask(orig_file_mask);
552: }
553:
554: return file;
555: }
556:
557: /*
558: * Close file
559: */
560: void
561: closefile(FILE *f, const char *fn)
562: {
563: if (f == NULL) {
564: ;
565: } else if (f == stdin) {
566: ;
567: } else if (f == stdout) {
568: fflush(f);
569: } else {
570: if (file_is_tmp(fn) && compress_program != NULL) {
571: if (pclose(f) < 0)
572: err(2, NULL);
573: } else
574: fclose(f);
575: }
576: }
577:
578: /*
579: * Reads a file into the internal buffer.
580: */
581: struct file_reader *
582: file_reader_init(const char *fsrc)
583: {
584: struct file_reader *ret;
585:
586: if (fsrc == NULL)
587: fsrc = "-";
588:
1.2 millert 589: ret = sort_calloc(1, sizeof(struct file_reader));
1.1 millert 590:
591: ret->elsymb = '\n';
592: if (sort_opts_vals.zflag)
593: ret->elsymb = 0;
594:
595: ret->fname = sort_strdup(fsrc);
596:
597: if (strcmp(fsrc, "-") && (compress_program == NULL) && use_mmap) {
598: struct stat stat_buf;
599: void *addr;
600: size_t sz = 0;
601: int fd;
602:
603: fd = open(fsrc, O_RDONLY);
604: if (fd < 0)
605: err(2, "%s", fsrc);
606:
607: if (fstat(fd, &stat_buf) < 0)
608: err(2, "%s", fsrc);
609: sz = stat_buf.st_size;
610:
611: addr = mmap(NULL, sz, PROT_READ, 0, fd, 0);
612: if (addr == MAP_FAILED) {
613: close(fd);
614: } else {
615: ret->fd = fd;
616: ret->mmapaddr = addr;
617: ret->mmapsize = sz;
618: ret->mmapptr = ret->mmapaddr;
619: posix_madvise(addr, sz, POSIX_MADV_SEQUENTIAL);
620: }
621: }
622:
623: if (ret->mmapaddr == NULL) {
624: ret->file = openfile(fsrc, "r");
625: if (ret->file == NULL)
626: err(2, "%s", fsrc);
627:
628: if (strcmp(fsrc, "-")) {
629: ret->cbsz = READ_CHUNK;
630: ret->buffer = sort_malloc(ret->cbsz);
631: ret->bsz = 0;
632: ret->strbeg = 0;
633:
634: ret->bsz = fread(ret->buffer, 1, ret->cbsz, ret->file);
635: if (ret->bsz == 0) {
636: if (ferror(ret->file))
637: err(2, NULL);
638: }
639: }
640: }
641:
642: return ret;
643: }
644:
645: struct bwstring *
646: file_reader_readline(struct file_reader *fr)
647: {
648: struct bwstring *ret = NULL;
649:
650: if (fr->mmapaddr) {
651: unsigned char *mmapend;
652:
653: mmapend = fr->mmapaddr + fr->mmapsize;
654: if (fr->mmapptr >= mmapend)
655: return NULL;
656: else {
657: unsigned char *strend;
658: size_t sz;
659:
660: sz = mmapend - fr->mmapptr;
661: strend = memchr(fr->mmapptr, fr->elsymb, sz);
662:
663: if (strend == NULL) {
664: ret = bwscsbdup(fr->mmapptr, sz);
665: fr->mmapptr = mmapend;
666: } else {
667: ret = bwscsbdup(fr->mmapptr, strend -
668: fr->mmapptr);
669: fr->mmapptr = strend + 1;
670: }
671: }
672:
673: } else if (fr->file != stdin) {
674: unsigned char *strend;
675: size_t bsz1, remsz, search_start;
676:
677: search_start = 0;
678: remsz = 0;
679: strend = NULL;
680:
681: if (fr->bsz > fr->strbeg)
682: remsz = fr->bsz - fr->strbeg;
683:
684: /* line read cycle */
685: for (;;) {
686: if (remsz > search_start)
687: strend = memchr(fr->buffer + fr->strbeg +
688: search_start, fr->elsymb, remsz -
689: search_start);
690: else
691: strend = NULL;
692:
693: if (strend)
694: break;
695: if (feof(fr->file))
696: break;
697:
698: if (fr->bsz != fr->cbsz)
699: /* NOTREACHED */
700: err(2, "File read software error 1");
701:
702: if (remsz > (READ_CHUNK >> 1)) {
703: search_start = fr->cbsz - fr->strbeg;
704: fr->cbsz += READ_CHUNK;
705: fr->buffer = sort_realloc(fr->buffer,
706: fr->cbsz);
707: bsz1 = fread(fr->buffer + fr->bsz, 1,
708: READ_CHUNK, fr->file);
709: if (bsz1 == 0) {
710: if (ferror(fr->file))
711: err(2, NULL);
712: break;
713: }
714: fr->bsz += bsz1;
715: remsz += bsz1;
716: } else {
717: if (remsz > 0 && fr->strbeg>0)
718: bcopy(fr->buffer + fr->strbeg,
719: fr->buffer, remsz);
720:
721: fr->strbeg = 0;
722: search_start = remsz;
723: bsz1 = fread(fr->buffer + remsz, 1,
724: fr->cbsz - remsz, fr->file);
725: if (bsz1 == 0) {
726: if (ferror(fr->file))
727: err(2, NULL);
728: break;
729: }
730: fr->bsz = remsz + bsz1;
731: remsz = fr->bsz;
732: }
733: }
734:
735: if (strend == NULL)
736: strend = fr->buffer + fr->bsz;
737:
738: if ((fr->buffer + fr->strbeg <= strend) &&
739: (fr->strbeg < fr->bsz) && (remsz>0))
740: ret = bwscsbdup(fr->buffer + fr->strbeg, strend -
741: fr->buffer - fr->strbeg);
742:
743: fr->strbeg = (strend - fr->buffer) + 1;
744:
745: } else {
746: size_t len = 0;
747:
748: ret = bwsfgetln(fr->file, &len, sort_opts_vals.zflag,
749: &(fr->rb));
750: }
751:
752: return ret;
753: }
754:
755: static void
756: file_reader_clean(struct file_reader *fr)
757: {
758:
759: if (fr) {
760: if (fr->mmapaddr)
761: munmap(fr->mmapaddr, fr->mmapsize);
762:
763: if (fr->fd)
764: close(fr->fd);
765:
766: if (fr->buffer)
767: sort_free(fr->buffer);
768:
769: if (fr->file)
770: if (fr->file != stdin)
771: closefile(fr->file, fr->fname);
772:
773: if (fr->fname)
774: sort_free(fr->fname);
775:
776: memset(fr, 0, sizeof(struct file_reader));
777: }
778: }
779:
780: void
781: file_reader_free(struct file_reader *fr)
782: {
783:
784: if (fr) {
785: file_reader_clean(fr);
786: sort_free(fr);
787: }
788: }
789:
790: int
791: procfile(const char *fsrc, struct sort_list *list, struct file_list *fl)
792: {
793: struct file_reader *fr;
794:
795: fr = file_reader_init(fsrc);
796: if (fr == NULL)
797: err(2, "%s", fsrc);
798:
799: /* file browse cycle */
800: for (;;) {
801: struct bwstring *bws;
802:
803: bws = file_reader_readline(fr);
804:
805: if (bws == NULL)
806: break;
807:
808: sort_list_add(list, bws);
809:
810: if (list->memsize >= available_free_memory) {
811: char *fn;
812:
813: fn = new_tmp_file_name();
814: sort_list_to_file(list, fn);
815: file_list_add(fl, fn, false);
816: sort_list_clean(list);
817: }
818: }
819:
820: file_reader_free(fr);
821:
822: return 0;
823: }
824:
825: /*
826: * Compare file headers. Files with EOF always go to the end of the list.
827: */
828: static int
829: file_header_cmp(struct file_header *f1, struct file_header *f2)
830: {
831:
832: if (f1 == f2)
833: return 0;
834: else {
835: if (f1->fr == NULL) {
836: return (f2->fr == NULL) ? 0 : 1;
837: } else if (f2->fr == NULL)
838: return -1;
839: else {
840: int ret;
841:
842: ret = list_coll(&(f1->si), &(f2->si));
843: if (!ret)
844: return (f1->file_pos < f2->file_pos) ? -1 : 1;
845: return ret;
846: }
847: }
848: }
849:
850: /*
851: * Allocate and init file header structure
852: */
853: static void
854: file_header_init(struct file_header **fh, const char *fn, size_t file_pos)
855: {
856:
857: if (fh && fn) {
858: struct bwstring *line;
859:
860: *fh = sort_malloc(sizeof(struct file_header));
861: (*fh)->file_pos = file_pos;
862: (*fh)->fr = file_reader_init(fn);
863: if ((*fh)->fr == NULL) {
864: err(2, "Cannot open %s for reading",
865: strcmp(fn, "-") == 0 ? "stdin" : fn);
866: }
867: line = file_reader_readline((*fh)->fr);
868: if (line == NULL) {
869: file_reader_free((*fh)->fr);
870: (*fh)->fr = NULL;
871: (*fh)->si = NULL;
872: } else {
873: (*fh)->si = sort_list_item_alloc();
874: sort_list_item_set((*fh)->si, line);
875: }
876: }
877: }
878:
879: /*
880: * Close file
881: */
882: static void
883: file_header_close(struct file_header **fh)
884: {
885:
886: if (fh && *fh) {
887: if ((*fh)->fr) {
888: file_reader_free((*fh)->fr);
889: (*fh)->fr = NULL;
890: }
891: if ((*fh)->si) {
892: sort_list_item_clean((*fh)->si);
893: sort_free((*fh)->si);
894: (*fh)->si = NULL;
895: }
896: sort_free(*fh);
897: *fh = NULL;
898: }
899: }
900:
901: /*
902: * Swap two array elements
903: */
904: static void
905: file_header_swap(struct file_header **fh, size_t i1, size_t i2)
906: {
907: struct file_header *tmp;
908:
909: tmp = fh[i1];
910: fh[i1] = fh[i2];
911: fh[i2] = tmp;
912: }
913:
914: /* heap algorithm ==>> */
915:
916: /*
917: * See heap sort algorithm
918: * "Raises" last element to its right place
919: */
920: static void
921: file_header_heap_swim(struct file_header **fh, size_t indx)
922: {
923:
924: if (indx > 0) {
925: size_t parent_index;
926:
927: parent_index = (indx - 1) >> 1;
928:
929: if (file_header_cmp(fh[indx], fh[parent_index]) < 0) {
930: /* swap child and parent and continue */
931: file_header_swap(fh, indx, parent_index);
932: file_header_heap_swim(fh, parent_index);
933: }
934: }
935: }
936:
937: /*
938: * Sink the top element to its correct position
939: */
940: static void
941: file_header_heap_sink(struct file_header **fh, size_t indx, size_t size)
942: {
943: size_t left_child_index;
944: size_t right_child_index;
945:
946: left_child_index = indx + indx + 1;
947: right_child_index = left_child_index + 1;
948:
949: if (left_child_index < size) {
950: size_t min_child_index;
951:
952: min_child_index = left_child_index;
953:
954: if ((right_child_index < size) &&
955: (file_header_cmp(fh[left_child_index],
956: fh[right_child_index]) > 0))
957: min_child_index = right_child_index;
958: if (file_header_cmp(fh[indx], fh[min_child_index]) > 0) {
959: file_header_swap(fh, indx, min_child_index);
960: file_header_heap_sink(fh, min_child_index, size);
961: }
962: }
963: }
964:
965: /* <<== heap algorithm */
966:
967: /*
968: * Adds element to the "left" end
969: */
970: static void
971: file_header_list_rearrange_from_header(struct file_header **fh, size_t size)
972: {
973:
974: file_header_heap_sink(fh, 0, size);
975: }
976:
977: /*
978: * Adds element to the "right" end
979: */
980: static void
981: file_header_list_push(struct file_header *f, struct file_header **fh, size_t size)
982: {
983:
984: fh[size++] = f;
985: file_header_heap_swim(fh, size - 1);
986: }
987:
988: struct last_printed
989: {
990: struct bwstring *str;
991: };
992:
993: /*
994: * Prints the current line of the file
995: */
996: static void
997: file_header_print(struct file_header *fh, FILE *f_out, struct last_printed *lp)
998: {
999:
1000: if (fh && fh->fr && f_out && fh->si && fh->si->str) {
1001: if (sort_opts_vals.uflag) {
1002: if ((lp->str == NULL) || (str_list_coll(lp->str, &(fh->si)))) {
1003: bwsfwrite(fh->si->str, f_out, sort_opts_vals.zflag);
1004: if (lp->str)
1005: bwsfree(lp->str);
1006: lp->str = bwsdup(fh->si->str);
1007: }
1008: } else
1009: bwsfwrite(fh->si->str, f_out, sort_opts_vals.zflag);
1010: }
1011: }
1012:
1013: /*
1014: * Read next line
1015: */
1016: static void
1017: file_header_read_next(struct file_header *fh)
1018: {
1019:
1020: if (fh && fh->fr) {
1021: struct bwstring *tmp;
1022:
1023: tmp = file_reader_readline(fh->fr);
1024: if (tmp == NULL) {
1025: file_reader_free(fh->fr);
1026: fh->fr = NULL;
1027: if (fh->si) {
1028: sort_list_item_clean(fh->si);
1029: sort_free(fh->si);
1030: fh->si = NULL;
1031: }
1032: } else {
1033: if (fh->si == NULL)
1034: fh->si = sort_list_item_alloc();
1035: sort_list_item_set(fh->si, tmp);
1036: }
1037: }
1038: }
1039:
1040: /*
1041: * Merge array of "files headers"
1042: */
1043: static void
1044: file_headers_merge(size_t fnum, struct file_header **fh, FILE *f_out)
1045: {
1046: struct last_printed lp;
1047: size_t i;
1048:
1049: memset(&lp, 0, sizeof(lp));
1050:
1051: /*
1052: * construct the initial sort structure
1053: */
1054: for (i = 0; i < fnum; i++)
1055: file_header_list_push(fh[i], fh, i);
1056:
1057: while (fh[0]->fr) { /* unfinished files are always in front */
1058: /* output the smallest line: */
1059: file_header_print(fh[0], f_out, &lp);
1060: /* read a new line, if possible: */
1061: file_header_read_next(fh[0]);
1062: /* re-arrange the list: */
1063: file_header_list_rearrange_from_header(fh, fnum);
1064: }
1065:
1066: if (lp.str)
1067: bwsfree(lp.str);
1068: }
1069:
1070: /*
1071: * Merges the given files into the output file, which can be
1072: * stdout.
1073: */
1074: static void
1075: merge_files_array(size_t argc, char **argv, const char *fn_out)
1076: {
1077:
1078: if (argv && fn_out) {
1079: struct file_header **fh;
1080: FILE *f_out;
1081: size_t i;
1082:
1083: f_out = openfile(fn_out, "w");
1084:
1085: if (f_out == NULL)
1086: err(2, "%s", fn_out);
1087:
1088: fh = sort_malloc((argc + 1) * sizeof(struct file_header *));
1089:
1090: for (i = 0; i < argc; i++)
1091: file_header_init(fh + i, argv[i], (size_t) i);
1092:
1093: file_headers_merge(argc, fh, f_out);
1094:
1095: for (i = 0; i < argc; i++)
1096: file_header_close(fh + i);
1097:
1098: sort_free(fh);
1099:
1100: closefile(f_out, fn_out);
1101: }
1102: }
1103:
1104: /*
1105: * Shrinks the file list until its size smaller than max number of opened files
1106: */
1107: static int
1108: shrink_file_list(struct file_list *fl)
1109: {
1110:
1111: if (fl == NULL || (size_t)fl->count < max_open_files)
1112: return 0;
1113: else {
1114: struct file_list new_fl;
1115: size_t indx = 0;
1116:
1117: file_list_init(&new_fl, true);
1118: while (indx < fl->count) {
1119: char *fnew;
1120: size_t num;
1121:
1122: num = fl->count - indx;
1123: fnew = new_tmp_file_name();
1124:
1125: if ((size_t) num >= max_open_files)
1126: num = max_open_files - 1;
1127: merge_files_array(num, fl->fns + indx, fnew);
1128: if (fl->tmp) {
1129: size_t i;
1130:
1131: for (i = 0; i < num; i++)
1132: unlink(fl->fns[indx + i]);
1133: }
1134: file_list_add(&new_fl, fnew, false);
1135: indx += num;
1136: }
1137: fl->tmp = false; /* already taken care of */
1138: file_list_clean(fl);
1139:
1140: fl->count = new_fl.count;
1141: fl->fns = new_fl.fns;
1142: fl->sz = new_fl.sz;
1143: fl->tmp = new_fl.tmp;
1144:
1145: return 1;
1146: }
1147: }
1148:
1149: /*
1150: * Merge list of files
1151: */
1152: void
1153: merge_files(struct file_list *fl, const char *fn_out)
1154: {
1155:
1156: if (fl && fn_out) {
1157: while (shrink_file_list(fl));
1158:
1159: merge_files_array(fl->count, fl->fns, fn_out);
1160: }
1161: }
1162:
1163: static const char *
1164: get_sort_method_name(int sm)
1165: {
1166:
1167: if (sm == SORT_MERGESORT)
1168: return "mergesort";
1169: else if (sort_opts_vals.sort_method == SORT_RADIXSORT)
1170: return "radixsort";
1171: else if (sort_opts_vals.sort_method == SORT_HEAPSORT)
1172: return "heapsort";
1173: else
1174: return "quicksort";
1175: }
1176:
1177: /*
1178: * Sort list of lines and writes it to the file
1179: */
1180: void
1181: sort_list_to_file(struct sort_list *list, const char *outfile)
1182: {
1183: struct sort_mods *sm = &(keys[0].sm);
1184:
1185: if (!sm->Mflag && !sm->Rflag && !sm->Vflag &&
1186: !sm->gflag && !sm->hflag && !sm->nflag) {
1187: if ((sort_opts_vals.sort_method == SORT_DEFAULT) && byte_sort)
1188: sort_opts_vals.sort_method = SORT_RADIXSORT;
1189:
1190: } else if (sort_opts_vals.sort_method == SORT_RADIXSORT)
1191: err(2, "Radix sort cannot be used with these sort options");
1192:
1193: /*
1194: * to handle stable sort and the unique cases in the
1195: * right order, we need stable basic algorithm
1196: */
1197: if (sort_opts_vals.sflag) {
1198: switch (sort_opts_vals.sort_method){
1199: case SORT_MERGESORT:
1200: break;
1201: case SORT_RADIXSORT:
1202: break;
1203: case SORT_DEFAULT:
1204: sort_opts_vals.sort_method = SORT_MERGESORT;
1205: break;
1206: default:
1207: errx(2, "The chosen sort method cannot be used with "
1208: "stable and/or unique sort");
1209: };
1210: }
1211:
1212: if (sort_opts_vals.sort_method == SORT_DEFAULT)
1213: sort_opts_vals.sort_method = DEFAULT_SORT_ALGORITHM;
1214:
1215: if (debug_sort)
1216: printf("sort_method=%s\n",
1217: get_sort_method_name(sort_opts_vals.sort_method));
1218:
1219: switch (sort_opts_vals.sort_method){
1220: case SORT_RADIXSORT:
1221: rxsort(list->list, list->count);
1222: break;
1223: case SORT_MERGESORT:
1224: mergesort(list->list, list->count,
1225: sizeof(struct sort_list_item *), list_coll);
1226: break;
1227: case SORT_HEAPSORT:
1228: heapsort(list->list, list->count,
1229: sizeof(struct sort_list_item *), list_coll);
1230: break;
1231: case SORT_QSORT:
1232: qsort(list->list, list->count,
1233: sizeof(struct sort_list_item *), list_coll);
1234: break;
1235: default:
1236: DEFAULT_SORT_FUNC(list->list, list->count,
1237: sizeof(struct sort_list_item *), list_coll);
1238: break;
1239: }
1240: sort_list_dump(list, outfile);
1241: }