31 #define kroundup32(x) (--(x), (x)|=(x)>>1, (x)|=(x)>>2, (x)|=(x)>>4, (x)|=(x)>>8, (x)|=(x)>>16, ++(x))
34 static inline void fai_insert_index(
faidx_t *idx,
const char *name,
int len,
int line_len,
int line_blen,
uint64_t offset)
39 if (idx->
n == idx->
m) {
40 idx->
m = idx->
m? idx->
m<<1 : 16;
41 idx->
name = (
char**)realloc(idx->
name,
sizeof(
char*) * idx->
m);
43 idx->
name[idx->
n] = strdup(name);
54 int line_len, line_blen, state;
62 name = 0; l_name = m_name = 0;
63 len = line_len = line_blen = -1; state = 0; l1 = l2 = -1; offset = 0;
69 }
else if ((state == 0 && len < 0) || state == 2)
continue;
73 fai_insert_index(idx, name, len, line_len, line_blen, offset);
75 while ( (c=
bgzf_getc(bgzf))>=0 && !isspace(c)) {
76 if (m_name < l_name + 2) {
79 name = (
char*)realloc(name, m_name);
85 fprintf(stderr,
"[fai_build_core] the last entry has no sequence\n");
89 if (c !=
'\n')
while ( (c=
bgzf_getc(bgzf))>=0 && c !=
'\n');
94 fprintf(stderr,
"[fai_build_core] inlined empty line is not allowed in sequence '%s'.\n", name);
98 if (state == 2) state = 3;
102 if (isgraph(c)) ++l2;
103 }
while ( (c=
bgzf_getc(bgzf))>=0 && c !=
'\n');
104 if (state == 3 && l2) {
105 fprintf(stderr,
"[fai_build_core] different line length in sequence '%s'.\n", name);
110 if (state == 1) line_len = l1, line_blen = l2, state = 0;
111 else if (state == 0) {
112 if (l1 != line_len || l2 != line_blen) state = 2;
116 fai_insert_index(idx, name, len, line_len, line_blen, offset);
125 for (i = 0; i < fai->
n; ++i) {
141 int len, line_len, line_blen;
149 buf = (
char*)calloc(0x10000, 1);
150 while (!feof(fp) && fgets(buf, 0x10000, fp)) {
151 for (p = buf; *p && isgraph(*p); ++p);
154 sscanf(p,
"%d%ld%d%d", &len, &offset, &line_blen, &line_len);
156 sscanf(p,
"%d%lld%d%d", &len, &offset, &line_blen, &line_len);
158 fai_insert_index(fai, buf, len, line_len, line_blen, offset);
167 for (i = 0; i < fai->
n; ++i) free(fai->
name[i]);
180 str = (
char*)calloc(strlen(fn) + 5, 1);
181 sprintf(str,
"%s.fai", fn);
184 fprintf(stderr,
"[fai_build] fail to open the FASTA file %s\n",fn);
192 fp = fopen(str,
"wb");
194 fprintf(stderr,
"[fai_build] fail to write FASTA index %s\n",str);
208 const int buf_size = 1 * 1024 * 1024;
212 const char *url = fn;
215 for (p = fn + l - 1; p >= fn; --p)
216 if (*p ==
'/')
break;
226 if (fp_remote == 0) {
227 fprintf(stderr,
"[download_from_remote] fail to open remote file %s\n",url);
230 if ((fp = fopen(fn,
"wb")) == 0) {
231 fprintf(stderr,
"[download_from_remote] fail to create file in the working directory %s\n",fn);
235 buf = (
uint8_t*)calloc(buf_size, 1);
236 while ((l =
knet_read(fp_remote, buf, buf_size)) != 0)
237 fwrite(buf, 1, l, fp);
242 return fopen(fn,
"r");
251 str = (
char*)calloc(strlen(fn) + 5, 1);
252 sprintf(str,
"%s.fai", fn);
255 if (strstr(fn,
"ftp://") == fn || strstr(fn,
"http://") == fn)
260 fprintf(stderr,
"[fai_load] failed to open remote FASTA index %s\n", str);
267 fp = fopen(str,
"rb");
269 fprintf(stderr,
"[fai_load] build FASTA index.\n");
271 fp = fopen(str,
"rb");
273 fprintf(stderr,
"[fai_load] fail to open FASTA index.\n");
284 if (fai->
bgzf == 0) {
285 fprintf(stderr,
"[fai_load] fail to open FASTA file.\n");
292 fprintf(stderr,
"[fai_load] failed to load .gzi index: %s[.gzi]\n", fn);
303 int i, l, k, name_end;
311 name_end = l = strlen(str);
312 s = (
char*)malloc(l+1);
314 for (i = k = 0; i < l; ++i)
315 if (!isspace(str[i])) s[k++] = str[i];
318 for (i = l - 1; i >= 0; --i)
if (s[i] ==
':')
break;
319 if (i >= 0) name_end = i;
322 for (i = name_end + 1; i < l; ++i) {
323 if (s[i] ==
'-') ++n_hyphen;
324 else if (!isdigit(s[i]) && s[i] !=
',')
break;
326 if (i < l || n_hyphen > 1) name_end = l;
334 }
else s[name_end] =
':', name_end = l;
336 }
else iter =
kh_get(s, h, str);
338 fprintf(stderr,
"[fai_fetch] Warning - Reference %s not found in FASTA file, returning empty sequence\n", str);
346 for (i = k = name_end + 1; i < l; ++i)
347 if (s[i] !=
',') s[k++] = s[i];
349 beg = atoi(s + name_end + 1);
350 for (i = name_end + 1; i != k; ++i)
if (s[i] ==
'-')
break;
351 end = i < k? atoi(s + i + 1) : val.
len;
353 }
else beg = 0, end = val.
len;
354 if (beg >= val.
len) beg = val.
len;
355 if (end >= val.
len) end = val.
len;
356 if (beg > end) beg = end;
364 fprintf(stderr,
"[fai_fetch] Error: fai_fetch failed. (Seeking in a compressed, .gzi unindexed, file?)\n");
368 s = (
char*)malloc(end - beg + 2);
370 if (isgraph(c)) s[l++] = c;
390 iter =
kh_get(s, fai->hash, c_name);
391 if (iter ==
kh_end(fai->hash))
394 fprintf(stderr,
"[fai_fetch_seq] The sequence \"%s\" not found\n", c_name);
398 if(p_end_i < p_beg_i) p_beg_i = p_end_i;
399 if(p_beg_i < 0) p_beg_i = 0;
400 else if(val.
len <= p_beg_i) p_beg_i = val.
len - 1;
401 if(p_end_i < 0) p_end_i = 0;
402 else if(val.
len <= p_end_i) p_end_i = val.
len - 1;
409 fprintf(stderr,
"[fai_fetch_seq] Error: fai_fetch failed. (Seeking in a compressed, .gzi unindexed, file?)\n");
413 seq = (
char*)malloc(p_end_i - p_beg_i + 2);
414 while ( (c=
bgzf_getc(fai->
bgzf))>=0 && l < p_end_i - p_beg_i + 1)
415 if (isgraph(c)) seq[l++] = c;