NVBIO
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Groups Pages
tabix.c
Go to the documentation of this file.
1 #include <stdio.h>
2 #include <stdlib.h>
3 #include <unistd.h>
4 #include <string.h>
5 #include <getopt.h>
6 #include <sys/types.h>
7 #include <sys/stat.h>
8 #include <errno.h>
9 #include "htslib/tbx.h"
10 #include "htslib/sam.h"
11 #include "htslib/vcf.h"
12 #include "htslib/kseq.h"
13 #include "htslib/bgzf.h"
14 #include "htslib/hts.h"
15 
16 typedef struct
17 {
18  int min_shift;
19 }
20 args_t;
21 
22 static void error(const char *format, ...)
23 {
24  va_list ap;
25  va_start(ap, format);
26  vfprintf(stderr, format, ap);
27  va_end(ap);
28  exit(EXIT_FAILURE);
29 }
30 
31 
32 #define IS_GFF (1<<0)
33 #define IS_BED (1<<1)
34 #define IS_SAM (1<<2)
35 #define IS_VCF (1<<3)
36 #define IS_BCF (1<<4)
37 #define IS_BAM (1<<5)
38 #define IS_TXT (IS_GFF|IS_BED|IS_SAM|IS_VCF)
39 
40 int file_type(const char *fname)
41 {
42  int l = strlen(fname);
43  int strcasecmp(const char *s1, const char *s2);
44  if (l>=7 && strcasecmp(fname+l-7, ".gff.gz") == 0) return IS_GFF;
45  else if (l>=7 && strcasecmp(fname+l-7, ".bed.gz") == 0) return IS_BED;
46  else if (l>=7 && strcasecmp(fname+l-7, ".sam.gz") == 0) return IS_SAM;
47  else if (l>=7 && strcasecmp(fname+l-7, ".vcf.gz") == 0) return IS_VCF;
48  else if (l>=4 && strcasecmp(fname+l-4, ".bcf") == 0) return IS_BCF;
49  else if (l>=4 && strcasecmp(fname+l-4, ".bam") == 0) return IS_BAM;
50  return 0;
51 }
52 
53 #define PRINT_HEADER 1
54 #define HEADER_ONLY 2
55 static int query_regions(char **argv, int argc, int mode)
56 {
57  char *fname = argv[0];
58  int i, ftype = file_type(fname);
59 
60  if ( ftype & IS_TXT || !ftype )
61  {
62  htsFile *fp = hts_open(fname,"r");
63  if ( !fp ) error("Could not read %s\n", fname);
64  tbx_t *tbx = tbx_index_load(fname);
65  if ( !tbx ) error("Could not load .tbi index of %s\n", fname);
66  kstring_t str = {0,0,0};
67  if ( mode )
68  {
69  while ( hts_getline(fp, KS_SEP_LINE, &str) >= 0 )
70  {
71  if ( !str.l || str.s[0]!=tbx->conf.meta_char ) break;
72  puts(str.s);
73  }
74  }
75  if ( mode!=HEADER_ONLY )
76  {
77  for (i=1; i<argc; i++)
78  {
79  hts_itr_t *itr = tbx_itr_querys(tbx, argv[i]);
80  if ( !itr ) continue;
81  while (tbx_itr_next(fp, tbx, itr, &str) >= 0) puts(str.s);
82  tbx_itr_destroy(itr);
83  }
84  }
85  free(str.s);
86  if ( hts_close(fp) ) error("hts_close returned non-zero status: %s\n", fname);
87  tbx_destroy(tbx);
88  }
89  else if ( ftype==IS_BCF ) // output uncompressed VCF
90  {
91  htsFile *fp = hts_open(fname,"r");
92  if ( !fp ) error("Could not read %s\n", fname);
93  htsFile *out = hts_open("-","w");
94  if ( !out ) error("Could not open stdout\n", fname);
95  hts_idx_t *idx = bcf_index_load(fname);
96  if ( !idx ) error("Could not load .csi index of %s\n", fname);
97  bcf_hdr_t *hdr = bcf_hdr_read(fp);
98  if ( !hdr ) error("Could not read the header: %s\n", fname);
99  if ( mode )
100  {
101  bcf_hdr_write(out,hdr);
102  }
103  if ( mode!=HEADER_ONLY )
104  {
105  bcf1_t *rec = bcf_init();
106  for (i=1; i<argc; i++)
107  {
108  hts_itr_t *itr = bcf_itr_querys(idx,hdr,argv[i]);
109  if ( !itr ) continue;
110  while ( bcf_itr_next(fp, itr, rec) >=0 ) bcf_write(out,hdr,rec);
111  tbx_itr_destroy(itr);
112  }
113  bcf_destroy(rec);
114  }
115  if ( hts_close(fp) ) error("hts_close returned non-zero status: %s\n", fname);
116  if ( hts_close(out) ) error("hts_close returned non-zero status for stdout\n");
117  bcf_hdr_destroy(hdr);
118  hts_idx_destroy(idx);
119  }
120  else if ( ftype==IS_BAM ) // todo: BAM
121  error("Please use \"samtools view\" for querying BAM files.\n");
122  return 0;
123 }
124 static int query_chroms(char *fname)
125 {
126  const char **seq;
127  int i, nseq, ftype = file_type(fname);
128  if ( ftype & IS_TXT || !ftype )
129  {
130  tbx_t *tbx = tbx_index_load(fname);
131  if ( !tbx ) error("Could not load .tbi index of %s\n", fname);
132  seq = tbx_seqnames(tbx, &nseq);
133  for (i=0; i<nseq; i++)
134  printf("%s\n", seq[i]);
135  free(seq);
136  tbx_destroy(tbx);
137  }
138  else if ( ftype==IS_BCF )
139  {
140  htsFile *fp = hts_open(fname,"r");
141  if ( !fp ) error("Could not read %s\n", fname);
142  bcf_hdr_t *hdr = bcf_hdr_read(fp);
143  if ( !hdr ) error("Could not read the header: %s\n", fname);
144  hts_close(fp);
145  hts_idx_t *idx = bcf_index_load(fname);
146  if ( !idx ) error("Could not load .csi index of %s\n", fname);
147  seq = bcf_index_seqnames(idx, hdr, &nseq);
148  for (i=0; i<nseq; i++)
149  printf("%s\n", seq[i]);
150  free(seq);
151  bcf_hdr_destroy(hdr);
152  hts_idx_destroy(idx);
153  }
154  else if ( ftype==IS_BAM ) // todo: BAM
155  error("BAM: todo\n");
156  return 0;
157 }
158 
159 int reheader_file(const char *fname, const char *header, int ftype, tbx_conf_t *conf)
160 {
161  if ( ftype & IS_TXT || !ftype )
162  {
163  BGZF *fp = bgzf_open(fname,"r");
164  if ( !fp || bgzf_read_block(fp) != 0 || !fp->block_length ) return -1;
165 
166  char *buffer = fp->uncompressed_block;
167  int skip_until = 0;
168 
169  // Skip the header: find out the position of the data block
170  if ( buffer[0]==conf->meta_char )
171  {
172  skip_until = 1;
173  while (1)
174  {
175  if ( buffer[skip_until]=='\n' )
176  {
177  skip_until++;
178  if ( skip_until>=fp->block_length )
179  {
180  if ( bgzf_read_block(fp) != 0 || !fp->block_length ) error("FIXME: No body in the file: %s\n", fname);
181  skip_until = 0;
182  }
183  // The header has finished
184  if ( buffer[skip_until]!=conf->meta_char ) break;
185  }
186  skip_until++;
187  if ( skip_until>=fp->block_length )
188  {
189  if (bgzf_read_block(fp) != 0 || !fp->block_length) error("FIXME: No body in the file: %s\n", fname);
190  skip_until = 0;
191  }
192  }
193  }
194 
195  // Output the new header
196  FILE *hdr = fopen(header,"r");
197  if ( !hdr ) error("%s: %s", header,strerror(errno));
198  int page_size = getpagesize();
199  char *buf = valloc(page_size);
200  BGZF *bgzf_out = bgzf_dopen(fileno(stdout), "w");
201  ssize_t nread;
202  while ( (nread=fread(buf,1,page_size-1,hdr))>0 )
203  {
204  if ( nread<page_size-1 && buf[nread-1]!='\n' ) buf[nread++] = '\n';
205  if (bgzf_write(bgzf_out, buf, nread) < 0) error("Error: %d\n",bgzf_out->errcode);
206  }
207  if ( fclose(hdr) ) error("close failed: %s\n", header);
208 
209  // Output all remainig data read with the header block
210  if ( fp->block_length - skip_until > 0 )
211  {
212  if (bgzf_write(bgzf_out, buffer+skip_until, fp->block_length-skip_until) < 0) error("Error: %d\n",fp->errcode);
213  }
214  if (bgzf_flush(bgzf_out) < 0) error("Error: %d\n",bgzf_out->errcode);
215 
216  while (1)
217  {
218  nread = bgzf_raw_read(fp, buf, page_size);
219  if ( nread<=0 ) break;
220 
221  int count = bgzf_raw_write(bgzf_out, buf, nread);
222  if (count != nread) error("Write failed, wrote %d instead of %d bytes.\n", count,(int)nread);
223  }
224  if (bgzf_close(bgzf_out) < 0) error("Error: %d\n",bgzf_out->errcode);
225  if (bgzf_close(fp) < 0) error("Error: %d\n",fp->errcode);
226  }
227  else
228  error("todo: reheader BCF, BAM\n"); // BCF is difficult, records contain pointers to the header.
229  return 0;
230 }
231 
232 static int usage(void)
233 {
234  fprintf(stderr, "\n");
235  fprintf(stderr, "Version: %s\n", hts_version());
236  fprintf(stderr, "Usage: tabix [OPTIONS] [FILE] [REGION [...]]\n");
237  fprintf(stderr, "Options:\n");
238  fprintf(stderr, " -0, --zero-based coordinates are zero-based\n");
239  fprintf(stderr, " -b, --begin INT column number for region start [4]\n");
240  fprintf(stderr, " -c, --comment CHAR skip comment lines starting with CHAR [null]\n");
241  fprintf(stderr, " -e, --end INT column number for region end (if no end, set INT to -b) [5]\n");
242  fprintf(stderr, " -f, --force overwrite existing index without asking\n");
243  fprintf(stderr, " -h, --print-header print also the header lines\n");
244  fprintf(stderr, " -H, --only-header print only the header lines\n");
245  fprintf(stderr, " -l, --list-chroms list chromosome names\n");
246  fprintf(stderr, " -m, --min-shift INT set the minimal interval size to 1<<INT; 0 for the old tabix index [0]\n");
247  fprintf(stderr, " -p, --preset STR gff, bed, sam, vcf, bcf, bam\n");
248  fprintf(stderr, " -r, --reheader FILE replace the header with the content of FILE\n");
249  fprintf(stderr, " -s, --sequence INT column number for sequence names (suppressed by -p) [1]\n");
250  fprintf(stderr, " -S, --skip-lines INT skip first INT lines [0]\n");
251  fprintf(stderr, "\n");
252  return 1;
253 }
254 
255 int main(int argc, char *argv[])
256 {
257  int c, min_shift = -1, is_force = 0, list_chroms = 0, mode = 0;
258  tbx_conf_t conf = tbx_conf_gff, *conf_ptr = NULL;
259  char *reheader = NULL;
260 
261  static struct option loptions[] =
262  {
263  {"help",0,0,'h'},
264  {"zero-based",0,0,'0'},
265  {"print-header",0,0,'h'},
266  {"only-header",0,0,'H'},
267  {"begin",1,0,'b'},
268  {"comment",1,0,'c'},
269  {"end",1,0,'e'},
270  {"force",0,0,'f'},
271  {"preset",1,0,'p'},
272  {"sequence",1,0,'s'},
273  {"skip-lines",1,0,'S'},
274  {"list-chroms",0,0,'l'},
275  {"reheader",1,0,'r'},
276  {0,0,0,0}
277  };
278 
279  while ((c = getopt_long(argc, argv, "hH?0b:c:e:fm:p:s:S:lr:", loptions,NULL)) >= 0)
280  {
281  switch (c)
282  {
283  case 'r': reheader = optarg; break;
284  case 'h': mode = PRINT_HEADER; break;
285  case 'H': mode = HEADER_ONLY; break;
286  case 'l': list_chroms = 1; break;
287  case '0': conf.preset |= TBX_UCSC; break;
288  case 'b': conf.bc = atoi(optarg); break;
289  case 'e': conf.ec = atoi(optarg); break;
290  case 'c': conf.meta_char = *optarg; break;
291  case 'f': is_force = 1; break;
292  case 'm': min_shift = atoi(optarg); break;
293  case 'p':
294  if (strcmp(optarg, "gff") == 0) conf_ptr = &tbx_conf_gff;
295  else if (strcmp(optarg, "bed") == 0) conf_ptr = &tbx_conf_bed;
296  else if (strcmp(optarg, "sam") == 0) conf_ptr = &tbx_conf_sam;
297  else if (strcmp(optarg, "vcf") == 0) conf_ptr = &tbx_conf_vcf;
298  else error("The preset string not recognised: '%s'\n", optarg);
299  break;
300  case 's': conf.sc = atoi(optarg); break;
301  case 'S': conf.line_skip = atoi(optarg); break;
302  default: return usage();
303  }
304  }
305 
306  if ( optind==argc ) return usage();
307 
308  if ( list_chroms )
309  return query_chroms(argv[optind]);
310 
311  if ( argc > optind+1 || mode==HEADER_ONLY )
312  return query_regions(&argv[optind], argc-optind, mode);
313 
314  char *fname = argv[optind];
315  int ftype = file_type(fname);
316  if ( !conf_ptr ) // no preset given
317  {
318  if ( ftype==IS_GFF ) conf_ptr = &tbx_conf_gff;
319  else if ( ftype==IS_BED ) conf_ptr = &tbx_conf_bed;
320  else if ( ftype==IS_SAM ) conf_ptr = &tbx_conf_sam;
321  else if ( ftype==IS_VCF ) conf_ptr = &tbx_conf_vcf;
322  else if ( ftype==IS_BCF )
323  {
324  if ( min_shift <= 0 ) min_shift = 14;
325  }
326  else if ( ftype==IS_BAM )
327  {
328  if ( min_shift <= 0 ) min_shift = 14;
329  }
330  }
331  if ( reheader )
332  return reheader_file(fname, reheader, ftype, conf_ptr);
333 
334  if ( conf_ptr )
335  conf = *conf_ptr;
336 
337  char *suffix = min_shift <= 0 ? ".tbi" : (ftype==IS_BAM ? ".bai" : ".csi");
338  char *idx_fname = calloc(strlen(fname) + 5, 1);
339  strcat(strcpy(idx_fname, fname), suffix);
340 
341  struct stat stat_tbi, stat_file;
342  if ( !is_force && stat(idx_fname, &stat_tbi)==0 )
343  {
344  // Before complaining about existing index, check if the VCF file isn't
345  // newer. This is a common source of errors, people tend not to notice
346  // that tabix failed
347  stat(fname, &stat_file);
348  if ( stat_file.st_mtime <= stat_tbi.st_mtime )
349  error("[tabix] the index file exists. Please use '-f' to overwrite.\n");
350  }
351  free(idx_fname);
352 
353  if ( min_shift > 0 ) // CSI index
354  {
355  if ( ftype==IS_BCF )
356  {
357  if ( bcf_index_build(fname, min_shift)!=0 ) error("bcf_index_build failed: %s\n", fname);
358  return 0;
359  }
360  if ( ftype==IS_BAM )
361  {
362  if ( bam_index_build(fname, min_shift)!=0 ) error("bam_index_build failed: %s\n", fname);
363  return 0;
364  }
365  if ( tbx_index_build(fname, min_shift, &conf)!=0 ) error("tbx_index_build failed: %s\n", fname);
366  return 0;
367  }
368  else
369  {
370  if ( tbx_index_build(fname, min_shift, &conf) ) error("tbx_index_build failed: %s\n", fname);
371  return 0;
372  }
373  return 0;
374 }