NVBIO
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Groups Pages
bgzip.c
Go to the documentation of this file.
1 /* The MIT License
2 
3  Copyright (c) 2008 Broad Institute / Massachusetts Institute of Technology
4 
5  Permission is hereby granted, free of charge, to any person obtaining a copy
6  of this software and associated documentation files (the "Software"), to deal
7  in the Software without restriction, including without limitation the rights
8  to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9  copies of the Software, and to permit persons to whom the Software is
10  furnished to do so, subject to the following conditions:
11 
12  The above copyright notice and this permission notice shall be included in
13  all copies or substantial portions of the Software.
14 
15  THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16  IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17  FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18  AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19  LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20  OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
21  THE SOFTWARE.
22 */
23 
24 #include <stdlib.h>
25 #include <string.h>
26 #include <stdio.h>
27 #include <fcntl.h>
28 #include <unistd.h>
29 #include <errno.h>
30 #include <stdarg.h>
31 #include <getopt.h>
32 #include <sys/select.h>
33 #include <sys/stat.h>
34 #include "htslib/bgzf.h"
35 #include "htslib/hts.h"
36 
/* Size in bytes (64 KiB) of the I/O buffer that main() allocates, and the
 * largest chunk it requests per loop iteration when compressing or
 * decompressing. */
37 static const int WINDOW_SIZE = 64 * 1024;
38 
/*
 * Print a printf-style diagnostic to stderr and terminate the process with
 * EXIT_FAILURE. This function does not return to its caller.
 *
 * format: printf-style format string; the remaining arguments must match it.
 */
static void error(const char *format, ...)
{
    va_list args;

    va_start(args, format);
    vfprintf(stderr, format, args);
    va_end(args);

    exit(EXIT_FAILURE);
}
47 
/*
 * Open `fn` for writing (created with mode 0666, truncated) and return its
 * file descriptor.
 *
 * When `is_forced` is zero the file is first opened exclusively; if it
 * already exists the user is asked on stderr whether to overwrite it, and the
 * process exits with EXIT_FAILURE unless the reply read from stdin starts
 * with 'y' or 'Y'. When `is_forced` is non-zero an existing file is
 * overwritten without asking.
 *
 * The process also exits with EXIT_FAILURE if the file cannot be opened, so
 * the returned descriptor is always non-negative.
 */
static int write_open(const char *fn, int is_forced)
{
    int fd = -1;

    if (!is_forced) {
        /* Exclusive create: succeeds only if the file does not exist yet. */
        fd = open(fn, O_WRONLY | O_CREAT | O_TRUNC | O_EXCL, 0666);
        if (fd < 0 && errno == EEXIST) {
            char reply;

            fprintf(stderr, "[bgzip] %s already exists; do you wish to overwrite (y or n)? ", fn);
            if (scanf("%c", &reply) != 1)
                reply = 'n';
            if (!(reply == 'Y' || reply == 'y')) {
                fprintf(stderr, "[bgzip] not overwritten\n");
                exit(EXIT_FAILURE);
            }
        }
    }

    if (fd >= 0)
        return fd;

    /* Forced, confirmed by the user, or the exclusive open failed for
     * another reason: retry without O_EXCL. */
    fd = open(fn, O_WRONLY | O_CREAT | O_TRUNC, 0666);
    if (fd < 0) {
        fprintf(stderr, "[bgzip] %s: Fail to write\n", fn);
        exit(EXIT_FAILURE);
    }
    return fd;
}
70 
/*
 * Print the version banner, usage line and option summary to stderr.
 *
 * Always returns 1 so that callers can write `return bgzip_main_usage();`
 * to exit with a failure status.
 */
static int bgzip_main_usage(void)
{
    /* One entry per option, printed verbatim in this order. */
    static const char *const option_help[] = {
        " -b, --offset INT decompress at virtual file pointer (0-based uncompressed offset)\n",
        " -c, --stdout write on standard output, keep original files unchanged\n",
        " -d, --decompress decompress\n",
        " -f, --force overwrite files without asking\n",
        " -h, --help give this help\n",
        " -i, --index compress and create BGZF index\n",
        " -I, --index-name FILE name of BGZF index file [file.gz.gzi]\n",
        " -r, --reindex (re)index compressed file\n",
        " -s, --size INT decompress INT bytes (uncompressed size)\n",
    };
    size_t i;

    fputs("\n", stderr);
    fprintf(stderr, "Version: %s\n", hts_version());
    fputs("Usage: bgzip [OPTIONS] [FILE] ...\n", stderr);
    fputs("Options:\n", stderr);
    for (i = 0; i < sizeof option_help / sizeof option_help[0]; i++)
        fputs(option_help[i], stderr);
    fputs("\n", stderr);

    return 1;
}
89 
90 int main(int argc, char **argv)
91 {
92  int c, compress, pstdout, is_forced, index = 0, reindex = 0;
93  BGZF *fp;
94  void *buffer;
95  long start, end, size;
96  char *index_fname = NULL;
97 
98  static struct option loptions[] =
99  {
100  {"help",0,0,'h'},
101  {"offset",1,0,'b'},
102  {"stdout",0,0,'c'},
103  {"decompress",0,0,'d'},
104  {"force",0,0,'f'},
105  {"index",0,0,'i'},
106  {"index-name",1,0,'I'},
107  {"reindex",0,0,'r'},
108  {"size",1,0,'s'},
109  {0,0,0,0}
110  };
111 
112  compress = 1; pstdout = 0; start = 0; size = -1; end = -1; is_forced = 0;
113  while((c = getopt_long(argc, argv, "cdh?fb:s:iI:r",loptions,NULL)) >= 0){
114  switch(c){
115  case 'd': compress = 0; break;
116  case 'c': pstdout = 1; break;
117  case 'b': start = atol(optarg); compress = 0; pstdout = 1; break;
118  case 's': size = atol(optarg); pstdout = 1; break;
119  case 'f': is_forced = 1; break;
120  case 'i': index = 1; break;
121  case 'I': index_fname = optarg; break;
122  case 'r': reindex = 1; compress = 0; break;
123  case 'h':
124  case '?': return bgzip_main_usage();
125  }
126  }
127  if (size >= 0) end = start + size;
128  if (end >= 0 && end < start) {
129  fprintf(stderr, "[bgzip] Illegal region: [%ld, %ld]\n", start, end);
130  return 1;
131  }
132  if (compress == 1) {
133  struct stat sbuf;
134  int f_src = fileno(stdin);
135  int f_dst = fileno(stdout);
136 
137  if ( argc>optind )
138  {
139  if ( stat(argv[optind],&sbuf)<0 )
140  {
141  fprintf(stderr, "[bgzip] %s: %s\n", strerror(errno), argv[optind]);
142  return 1;
143  }
144 
145  if ((f_src = open(argv[optind], O_RDONLY)) < 0) {
146  fprintf(stderr, "[bgzip] %s: %s\n", strerror(errno), argv[optind]);
147  return 1;
148  }
149 
150  if (pstdout)
151  f_dst = fileno(stdout);
152  else
153  {
154  char *name = malloc(strlen(argv[optind]) + 5);
155  strcpy(name, argv[optind]);
156  strcat(name, ".gz");
157  f_dst = write_open(name, is_forced);
158  if (f_dst < 0) return 1;
159  free(name);
160  }
161  }
162  else if (!pstdout && isatty(fileno((FILE *)stdout)) )
163  return bgzip_main_usage();
164  else if ( index && !index_fname )
165  {
166  fprintf(stderr, "[bgzip] Index file name expected when writing to stdout\n");
167  return 1;
168  }
169 
170  fp = bgzf_fdopen(f_dst, "w");
171  if ( index ) bgzf_index_build_init(fp);
172  buffer = malloc(WINDOW_SIZE);
173  while ((c = read(f_src, buffer, WINDOW_SIZE)) > 0)
174  if (bgzf_write(fp, buffer, c) < 0) error("Could not write %d bytes: Error %d\n", c, fp->errcode);
175  // f_dst will be closed here
176  if ( index )
177  {
178  if ( index_fname ) bgzf_index_dump(fp, index_fname, NULL);
179  else bgzf_index_dump(fp, argv[optind], ".gz.gzi");
180  }
181  if (bgzf_close(fp) < 0) error("Close failed: Error %d", fp->errcode);
182  if (argc > optind && !pstdout) unlink(argv[optind]);
183  free(buffer);
184  close(f_src);
185  return 0;
186  }
187  else if ( reindex )
188  {
189  if ( argc>optind )
190  {
191  fp = bgzf_open(argv[optind], "r");
192  if ( !fp ) error("[bgzip] Could not open file: %s\n", argv[optind]);
193  }
194  else
195  {
196  if ( !index_fname ) error("[bgzip] Index file name expected when reading from stdin\n");
197  fp = bgzf_fdopen(fileno(stdin), "r");
198  if ( !fp ) error("[bgzip] Could not read from stdin: %s\n", strerror(errno));
199  }
200 
201  buffer = malloc(BGZF_BLOCK_SIZE);
203  int ret;
204  while ( (ret=bgzf_read(fp, buffer, BGZF_BLOCK_SIZE))>0 ) ;
205  free(buffer);
206  if ( ret<0 ) error("Is the file gzipped or bgzipped? The latter is required for indexing.\n");
207 
208  if ( index_fname )
209  bgzf_index_dump(fp, index_fname, NULL);
210  else
211  bgzf_index_dump(fp, argv[optind], ".gzi");
212 
213  if ( bgzf_close(fp)<0 ) error("Close failed: Error %d\n",fp->errcode);
214  return 0;
215  }
216  else
217  {
218  struct stat sbuf;
219  int f_dst;
220 
221  if ( argc>optind )
222  {
223  if ( stat(argv[optind],&sbuf)<0 )
224  {
225  fprintf(stderr, "[bgzip] %s: %s\n", strerror(errno), argv[optind]);
226  return 1;
227  }
228  char *name;
229  int len = strlen(argv[optind]);
230  if ( strcmp(argv[optind]+len-3,".gz") )
231  {
232  fprintf(stderr, "[bgzip] %s: unknown suffix -- ignored\n", argv[optind]);
233  return 1;
234  }
235  fp = bgzf_open(argv[optind], "r");
236  if (fp == NULL) {
237  fprintf(stderr, "[bgzip] Could not open file: %s\n", argv[optind]);
238  return 1;
239  }
240 
241  if (pstdout) {
242  f_dst = fileno(stdout);
243  }
244  else {
245  name = strdup(argv[optind]);
246  name[strlen(name) - 3] = '\0';
247  f_dst = write_open(name, is_forced);
248  free(name);
249  }
250  }
251  else if (!pstdout && isatty(fileno((FILE *)stdin)) )
252  return bgzip_main_usage();
253  else
254  {
255  f_dst = fileno(stdout);
256  fp = bgzf_fdopen(fileno(stdin), "r");
257  if (fp == NULL) {
258  fprintf(stderr, "[bgzip] Could not read from stdin: %s\n", strerror(errno));
259  return 1;
260  }
261  }
262  buffer = malloc(WINDOW_SIZE);
263  if ( start>0 )
264  {
265  if ( bgzf_index_load(fp, argv[optind], ".gzi") < 0 ) error("Could not load index: %s.gzi\n", argv[optind]);
266  if ( bgzf_useek(fp, start, SEEK_SET) < 0 ) error("Could not seek to %d-th (uncompressd) byte\n", start);
267  }
268  while (1) {
269  if (end < 0) c = bgzf_read(fp, buffer, WINDOW_SIZE);
270  else c = bgzf_read(fp, buffer, (end - start > WINDOW_SIZE)? WINDOW_SIZE:(end - start));
271  if (c == 0) break;
272  if (c < 0) error("Could not read %d bytes: Error %d\n", (end - start > WINDOW_SIZE)? WINDOW_SIZE:(end - start), fp->errcode);
273  start += c;
274  if ( write(f_dst, buffer, c) != c ) error("Could not write %d bytes\n", c);
275  if (end >= 0 && start >= end) break;
276  }
277  free(buffer);
278  if (bgzf_close(fp) < 0) error("Close failed: Error %d\n",fp->errcode);
279  if (!pstdout) unlink(argv[optind]);
280  return 0;
281  }
282  return 0;
283 }