12 #define MAX_CSI_COOR 0x7fffffff // maximum indexable coordinate of .csi
27 static void _regions_add(
bcf_sr_regions_t *reg,
const char *chr,
int start,
int end);
31 static int *init_filters(
bcf_hdr_t *hdr,
const char *filters,
int *nfilters)
34 const char *tmp = filters, *prev = filters;
35 int nout = 0, *out = NULL;
38 if ( *tmp==
',' || !*tmp )
40 out = (
int*) realloc(out,
sizeof(
int));
41 if ( tmp-prev==1 && *prev==
'.' )
46 kputsn(prev, tmp-prev, &str);
55 if ( str.
m ) free(str.
s);
65 fprintf(stderr,
"[%s:%d %s] Error: bcf_sr_set_regions() must be called before bcf_sr_add_reader()\n", __FILE__,__LINE__,__FUNCTION__);
69 if ( !readers->
regions )
return -1;
78 if ( !readers->
targets )
return -1;
92 if ( !reader->
file )
return 0;
104 fprintf(stderr,
"[add_reader] Could not load the index of %s\n", fname);
117 fprintf(stderr,
"[add_reader] Could not load the index of %s\n", fname);
123 fprintf(stderr,
"Index required, expected .vcf.gz or .bcf file: %s\n", fname);
139 fprintf(stderr,
"File type not recognised: %s\n", fname);
146 fprintf(stderr,
"[%s:%d %s] Error: %d readers, yet require_index not set\n", __FILE__,__LINE__,__FUNCTION__,files->
nreaders);
151 fprintf(stderr,
"[%s:%d %s] Error: cannot tabix-jump in streaming mode\n", __FILE__,__LINE__,__FUNCTION__);
154 if ( !reader->
header )
return 0;
156 reader->
fname = fname;
168 files->
regions = _regions_init_string(names[i]);
170 _regions_add(files->
regions, names[i], -1, -1);
184 static void bcf_sr_destroy1(
bcf_sr_t *reader)
192 for (j=0; j<reader->
mbuffer; j++)
202 bcf_sr_destroy1(&files->
readers[i]);
216 bcf_sr_destroy1(&files->
readers[i]);
235 int irec,jrec, has_snp=0, has_indel=0, has_any=0;
236 for (irec=1; irec<=reader->
nbuffer; irec++)
242 if ( !has_any ) has_any = 1;
248 if ( !has_snp ) has_snp = 1;
253 if ( !has_indel ) has_indel = 1;
259 while ( irec<=reader->nbuffer && jrec<=reader->nbuffer )
261 if ( reader->
buffer[irec]->
pos != -1 ) { irec++;
continue; }
262 if ( jrec<=irec ) jrec = irec+1;
263 while ( jrec<=reader->nbuffer && reader->
buffer[jrec]->
pos==-1 ) jrec++;
264 if ( jrec<=reader->nbuffer )
275 for (j=0; j<=reader->
nbuffer; j++)
305 for (i=0; i<line->
d.
n_flt; i++)
313 static int _reader_seek(
bcf_sr_t *reader,
const char *seq,
int start,
int end)
317 fprintf(stderr,
"The coordinate is out of csi index limit: %d\n", end+1);
329 if ( tid==-1 )
return -1;
334 int tid = bcf_hdr_name2id(reader->
header, seq);
335 if ( tid==-1 )
return -1;
346 static int _readers_next_region(
bcf_srs_t *files)
409 fprintf(stderr,
"[%s:%d %s] fixme: not ready for this\n", __FILE__,__LINE__,__FUNCTION__);
430 if ( !has_filter(reader, reader->
buffer[reader->
nbuffer+1]) )
continue;
443 collapse_buffer(files, reader);
449 static void _reader_shift_buffer(
bcf_sr_t *reader)
452 for (i=2; i<=reader->
nbuffer; i++)
454 if ( i<=reader->nbuffer )
480 for (i=1; i<=reader->
nbuffer; i++)
496 if ( tmpl->
rlen != line->
rlen )
continue;
506 for (ial=1; ial<tmpl->
n_allele; ial++)
508 for (jal=1; jal<line->
n_allele; jal++)
509 if ( !strcmp(tmpl->
d.
allele[ial], line->
d.
allele[jal]) ) { nmatch++;
break; }
511 if ( nmatch==tmpl->
n_allele ) { irec=i;
break; }
518 for (ial=1; ial<tmpl->
n_allele; ial++)
520 for (jal=1; jal<line->
n_allele; jal++)
521 if ( !strcmp(tmpl->
d.
allele[ial], line->
d.
allele[jal]) ) { irec=i;
break; }
522 if ( irec>=1 )
break;
524 if ( irec>=1 )
break;
526 if ( irec==-1 )
return -1;
542 int i, min_pos = INT_MAX;
548 if ( files->
regions && _readers_next_region(files)<0 )
break;
551 const char *chr = NULL;
554 _reader_fill_buffer(files, &files->
readers[i]);
564 if ( min_pos==INT_MAX )
578 _reader_shift_buffer(&files->
readers[i]);
600 if ( _reader_match_alleles(files, &files->
readers[i], first) < 0 )
continue;
617 if ( !ret )
return ret;
628 if ( !files->
has_line[i] )
continue;
632 if ( i==files->
nreaders )
return ret;
636 static void bcf_sr_seek_start(
bcf_srs_t *readers)
640 for (i=0; i<reg->
nseqs; i++)
651 bcf_sr_seek_start(readers);
666 int i, j, nsmpl, free_smpl = 0;
669 void *exclude = (fname[0]==
'^') ? khash_str2int_init() : NULL;
670 if ( exclude || strcmp(
"-",fname) )
675 fprintf(stderr,
"Could not read the file: \"%s\"\n", fname);
680 for (i=0; i<nsmpl; i++)
681 khash_str2int_inc(exclude, smpl[i]);
693 for (i=0; i<nsmpl; i++)
695 if ( exclude && khash_str2int_has_key(exclude,smpl[i]) )
continue;
705 fprintf(stderr,
"Warning: The sample \"%s\" was not found in %s, skipping\n", smpl[i], files->
readers[n_isec].
fname);
713 if ( exclude ) khash_str2int_destroy(exclude);
716 for (i=0; i<nsmpl; i++) free(smpl[i]);
723 fprintf(stderr,
"No samples in common.\n");
731 for (j=0; j<files->
n_smpl; j++)
739 static void _regions_add(
bcf_sr_regions_t *reg,
const char *chr,
int start,
int end)
741 if ( start==-1 && end==-1 )
751 reg->
seq_hash = khash_str2int_init();
754 if ( khash_str2int_get(reg->
seq_hash, chr, &iseq)<0 )
773 if ( start < creg->regs[i].start )
max = i - 1;
774 else if ( start > creg->
regs[i].
start ) min = i + 1;
781 if ( ++max < creg->nregs )
797 const char *sp = str, *ep = str;
801 while ( *ep && *ep!=
',' && *ep!=
':' ) ep++;
803 kputsn(sp,ep-sp,&tmp);
807 from = strtol(sp,(
char**)&ep,10);
810 fprintf(stderr,
"[%s:%d %s] Could not parse the region(s): %s\n", __FILE__,__LINE__,__FUNCTION__,str);
811 free(reg); free(tmp.
s);
return NULL;
813 if ( !*ep || *ep==
',' )
815 _regions_add(reg, tmp.
s, from, from);
821 fprintf(stderr,
"[%s:%d %s] Could not parse the region(s): %s\n", __FILE__,__LINE__,__FUNCTION__,str);
822 free(reg); free(tmp.
s);
return NULL;
826 to = strtol(sp,(
char**)&ep,10);
827 if ( *ep && *ep!=
',' )
829 fprintf(stderr,
"[%s:%d %s] Could not parse the region(s): %s\n", __FILE__,__LINE__,__FUNCTION__,str);
830 free(reg); free(tmp.
s);
return NULL;
833 _regions_add(reg, tmp.
s, from, to);
839 if ( tmp.
l ) _regions_add(reg, tmp.
s, -1, -1);
850 static int _regions_parse_line(
char *line,
int ichr,
int ifrom,
int ito,
char **chr,
char **chr_end,
int *from,
int *to)
854 if ( line[0]==
'#' )
return 0;
863 char *se = line, *ss = NULL;
865 for (i=0; i<=k && *se; i++)
867 ss = i==0 ? se++ : ++se;
868 while (*se && *se!=
'\t') se++;
870 if ( i<=k )
return -1;
873 *from = *to = strtol(ss, &tmp, 10);
874 if ( tmp==ss )
return -1;
879 *from = strtol(ss, &tmp, 10);
881 *to = strtol(ss, &tmp, 10);
882 if ( ss==tmp )
return -1;
884 for (i=k; i<l && *se; i++)
887 while (*se && *se!=
'\t') se++;
889 if ( i<l )
return -1;
891 *to = strtol(ss, &tmp, 10);
893 *from = strtol(ss, &tmp, 10);
894 if ( ss==tmp )
return -1;
898 for (i=0; i<=ichr && *se; i++)
900 if ( i>0 ) ss = ++se;
901 while (*se && *se!=
'\t') se++;
903 if ( i<=ichr )
return -1;
912 if ( !is_file )
return _regions_init_string(regions);
921 fprintf(stderr,
"[%s:%d %s] Could not open file: %s\n", __FILE__,__LINE__,__FUNCTION__,regions);
929 int len = strlen(regions);
930 int is_bed = strcasecmp(
".bed",regions+len-4) ? 0 : 1;
931 if ( !is_bed && !strcasecmp(
".bed.gz",regions+len-7) ) is_bed = 1;
933 if ( ft_type &
FT_VCF ) ito = 1;
940 ret = _regions_parse_line(reg->
line.
s, ichr,ifrom,abs(ito), &chr,&chr_end,&from,&to);
944 ret = _regions_parse_line(reg->
line.
s, ichr,ifrom,ifrom, &chr,&chr_end,&from,&to);
947 fprintf(stderr,
"[%s:%d] Could not parse the file %s, using the columns %d,%d[,%d]\n", __FILE__,__LINE__,regions,ichr+1,ifrom+1,ito+1);
952 if ( !ret )
continue;
953 if ( is_bed ) from++;
955 _regions_add(reg, chr, from, to);
959 if ( !reg->
nseqs ) { free(reg);
return NULL; }
965 reg->
seq_hash = khash_str2int_init();
967 for (i=0; i<reg->
nseqs; i++)
971 reg->
fname = strdup(regions);
983 if ( reg->
als ) free(reg->
als);
989 for (i=0; i<reg->
nseqs; i++)
997 khash_str2int_destroy(reg->
seq_hash);
1004 if ( khash_str2int_get(reg->
seq_hash, seq, ®->
iseq) < 0 )
return -1;
1007 if ( reg->
regs )
return 0;
1012 if ( reg->
itr )
return 0;
1019 if ( reg->
iseq<0 )
return -1;
1032 if ( reg->
iseq >= reg->
nseqs ) { reg->
iseq = -1;
return -1; }
1040 char *chr, *chr_end;
1041 int ichr = 0, ifrom = 1, ito = 2, is_bed = 0, from, to;
1047 if ( ito<0 ) ito = ifrom;
1058 if ( ret<0 ) { reg->
iseq = -1;
return -1; }
1070 fprintf(stderr,
"[%s:%d %s] Could not open file: %s\n", __FILE__,__LINE__,__FUNCTION__,reg->
fname);
1080 if ( ret<0 ) { reg->
iseq = -1;
return -1; }
1082 ret = _regions_parse_line(reg->
line.
s, ichr,ifrom,ito, &chr,&chr_end,&from,&to);
1085 fprintf(stderr,
"[%s:%d] Could not parse the file %s, using the columns %d,%d,%d\n", __FILE__,__LINE__,reg->
fname,ichr+1,ifrom+1,ito+1);
1089 if ( is_bed ) from++;
1092 if ( khash_str2int_get(reg->
seq_hash, chr, ®->
iseq)<0 )
1094 fprintf(stderr,
"Broken tabix index? The sequence \"%s\" not in dictionary [%s]\n", chr,reg->
line.
s);
1099 reg->
start = from - 1;
1106 int i = 0, max_len = 0;
1109 char *ss = reg->
line.
s;
1110 while ( i<als_idx && *ss )
1112 if ( *ss==
'\t' ) i++;
1117 while ( *se && *se!=
'\t' )
1119 if ( *se==
',' ) reg->
nals++;
1130 if ( *se==
'\t' )
break;
1131 if ( *se!=
',' )
continue;
1133 kputsn(ss,se-ss,®->
als_str);
1140 kputsn(ss,se-ss,®->
als_str);
1147 return type & VCF_INDEL ? 1 : 0;
1154 if ( khash_str2int_get(reg->
seq_hash, seq, &iseq)<0 )
return -1;
1165 if ( reg->
prev_seq==iseq && reg->
iseq!=iseq )
return -2;
1169 while ( iseq==reg->
iseq && reg->
end < start )
1172 if ( reg->
iseq != iseq )
return -1;
1175 if ( reg->
start <= end )
return 0;