42 fp = fopen(file_name,
"wt");
45 log_error(stderr,
"BamOutput: could not open %s for writing\n", file_name);
52 setvbuf(fp, NULL, _IOFBF, 256 * 1024);
80 output[i] = cigar_op_len << 4 |
cigar_op;
83 assert((
unsigned long)(&output[i] - alnd.
cigar) < (
unsigned long) (
sizeof(alnd.
cigar) *
sizeof(alnd.
cigar[0]) - 1));
87 read_len += cigar_entry.
m_len;
98 template <
typename T>
int itoa(
char *buf, T in)
101 bool negative =
false;
113 buf[len] =
"0123456789"[in % 10];
126 for(
int c = 0; c < len / 2; c++)
130 buf[c] = buf[len - c - 1];
131 buf[len - c - 1] = tmp;
141 uint32 BamOutput::generate_md_string(BAM_alignment& alnh, BAM_alignment_data_block& alnd,
const AlignmentData& alignment)
143 if (alignment.mds_vec == NULL)
145 log_warning(stderr,
" BAM: alignment %u from read %u has an empty MD string\n", alignment.aln_id, alignment.read_id);
146 alnd.md_string[0] =
'\0';
149 const uint32 mds_len =
uint32(alignment.mds_vec[0]) | (
uint32(alignment.mds_vec[1]) << 8);
150 char *buffer = alnd.md_string;
162 const uint8 op = alignment.mds_vec[i++];
167 uint8 l = alignment.mds_vec[i++];
170 while (i < mds_len && alignment.mds_vec[i] ==
MDS_MATCH)
171 l += alignment.mds_vec[i++];
173 buffer_len += itoa(buffer + buffer_len, l);
180 const char c =
dna_to_char(alignment.mds_vec[i++]);
181 buffer[buffer_len++] = c;
190 const uint8 l = alignment.mds_vec[i++];
201 const uint8 l = alignment.mds_vec[i++];
202 buffer[buffer_len++] =
'^';
203 for(
uint8 n = 0; n < l; n++)
205 buffer[buffer_len++] =
dna_to_char(alignment.mds_vec[i++]);
208 buffer[buffer_len++] =
'0';
216 }
while(i < mds_len);
218 buffer[buffer_len] =
'\0';
234 uint32 BamOutput::process_one_alignment(AlignmentData& alignment, AlignmentData& mate)
237 struct BAM_alignment alnh;
239 struct BAM_alignment_data_block alnd;
244 memset(&alnh, 0,
sizeof(alnh));
245 memset(&alnd, 0,
sizeof(alnd));
256 alnd.name = alignment.read_name;
257 alnh.bin_mq_nl = (
uint8)(strlen(alnd.name) + 1);
262 for(
uint32 i = 0; i < alignment.read_len; i += 2)
267 if (alignment.aln->m_rc)
271 s = complement(alignment.read_data[i]);
275 if (i + 1 < alignment.read_len)
277 s = complement(alignment.read_data[(i + 1)]);
282 s = alignment.read_data[alignment.read_len - i - 1];
286 if (i + 1 < alignment.read_len)
288 s = alignment.read_data[alignment.read_len - (i + 1) - 1];
294 alnd.seq[i / 2] = out_bp;
297 alnh.l_seq = alignment.read_len;
301 for(
uint32 i = 0; i < alignment.read_len; i++)
305 if (alignment.aln->m_rc)
306 q = alignment.qual[i];
308 q = alignment.qual[alignment.read_len - i - 1];
314 mapq = alignment.mapq;
317 if (alignment.aln->is_aligned() ==
false || mapq <
mapq_filter)
322 alnh.next_refID = -1;
325 alnd.md_string[0] =
'\0';
328 output_alignment(alnh, alnd);
334 if (alignment.aln->m_rc)
341 if (mate.aln->is_concordant())
344 if (!mate.aln->is_aligned())
347 if (mate.aln->is_rc())
360 alnh.next_refID = -1;
362 alnd.md_string[0] =
'\0';
364 output_alignment(alnh, alnd);
369 alnh.refID = seq_index;
373 alnh.bin_mq_nl |= (mapq << 8);
378 uint32 computed_cigar_len = generate_cigar(alnh, alnd, alignment);
380 if (computed_cigar_len != alignment.read_len)
382 log_error(stderr,
"BAM output : cigar length doesn't match read %u (%u != %u)\n",
384 computed_cigar_len, alignment.read_len);
390 if (mate.aln->is_aligned())
400 alnh.next_refID =
uint32(o_seq_index - seq_index);
405 if (o_seq_index != seq_index)
409 alnh.tlen =
nvbio::max(mate.cigar_pos + o_ref_cigar_len,
410 alignment.cigar_pos + ref_cigar_len) -
411 nvbio::min(mate.cigar_pos, alignment.cigar_pos);
413 if (mate.cigar_pos < alignment.cigar_pos)
414 alnh.tlen = -alnh.tlen;
423 alnh.next_refID = alnh.refID;
429 alnh.next_refID = -1;
435 alnd.ed = alignment.aln->ed();
436 alnd.score = alignment.aln->score();
438 alnd.second_score_valid =
false;
440 generate_md_string(alnh, alnd, alignment);
443 output_alignment(alnh, alnd);
448 void BamOutput::output_tag_uint32(DataBuffer& out,
const char *tag,
uint32 val)
450 out.append_data(tag, 2);
451 out.append_uint8(
'i');
452 out.append_uint32(val);
455 void BamOutput::output_tag_uint8(DataBuffer& out,
const char *tag,
uint8 val)
457 out.append_data(tag, 2);
458 out.append_uint8(
'c');
459 out.append_uint8(val);
462 void BamOutput::output_tag_string(DataBuffer& out,
const char *tag,
const char *val)
464 out.append_data(tag, 2);
465 out.append_uint8(
'Z');
466 out.append_string(val);
467 out.append_uint8(
'\0');
470 void BamOutput::output_alignment(BAM_alignment& alnh, BAM_alignment_data_block& alnd)
472 DataBuffer& out = data_buffers[ buffer_id ];
475 uint32 off_block_size = out.get_pos();
476 out.skip_ahead(
sizeof(alnh.block_size));
478 out.append_int32(alnh.refID);
479 out.append_int32(alnh.pos);
480 out.append_int32(alnh.bin_mq_nl);
481 out.append_int32(alnh.flag_nc);
482 out.append_int32(alnh.l_seq);
483 out.append_int32(alnh.next_refID);
484 out.append_int32(alnh.next_pos);
485 out.append_int32(alnh.tlen);
487 out.append_string(alnd.name);
488 out.append_uint8(
'\0');
491 const uint32 cigar_len = (alnh.flag_nc & 0xffff) *
sizeof(
uint32);
492 out.append_data(alnd.cigar, cigar_len);
494 out.append_data(alnd.seq, (alnh.l_seq + 1) / 2);
495 out.append_data(alnd.qual, alnh.l_seq);
500 output_tag_uint8(out,
"NM", alnd.ed);
501 output_tag_uint32(out,
"AS", alnd.score);
503 if (alnd.second_score_valid)
504 output_tag_uint32(out,
"XS", alnd.second_score);
506 output_tag_uint8(out,
"XM", alnd.mm);
507 output_tag_uint8(out,
"XO", alnd.gapo);
508 output_tag_uint8(out,
"XG", alnd.gape);
509 if (alnd.md_string[0])
510 output_tag_string(out,
"MD", alnd.md_string);
514 const int32 aln_len = out.get_pos() - off_block_size -
sizeof(alnh.block_size);
515 out.poke_int32(off_block_size, aln_len);
535 process_one_alignment(alignment, mate);
560 process_one_alignment(alignment, mate);
561 process_one_alignment(mate, alignment);
572 void BamOutput::write_block()
577 if (buffer_id == BUFFERS)
581 void BamOutput::flush_blocks()
583 #pragma omp parallel for
584 for (
int32 i = 0; i < buffer_id; ++i)
587 bgzf[i].
compress( compressed_buffers[i], data_buffers[i] );
588 bgzf[i].
end_block( compressed_buffers[i] );
594 for (
int32 i = 0; i < buffer_id; ++i)
597 fwrite( compressed_buffers[i].get_base_ptr(), compressed_buffers[i].get_pos(), 1, fp );
600 compressed_buffers[i].
rewind();
607 void BamOutput::output_header(
void)
609 int pos_l_text, pos_start_header, header_len;
612 DataBuffer& data_buffer = data_buffers[ buffer_id ];
617 pos_l_text = data_buffer.get_pos();
618 data_buffer.skip_ahead(
sizeof(
uint32));
620 pos_start_header = data_buffer.get_pos();
623 data_buffer.append_string(
"@HD\t");
624 data_buffer.append_string(
"VN:1.3\n");
628 data_buffer.append_string(
"@RG");
629 data_buffer.append_string(
"\tID:");
630 data_buffer.append_string(
rg_id.c_str());
634 data_buffer.append_string(
rg_string.c_str() );
636 data_buffer.append_string(
"\n");
638 data_buffer.append_string(
"@PG");
639 data_buffer.append_string(
"\tID:");
640 data_buffer.append_string(
pg_id.c_str());
641 data_buffer.append_string(
"\tPN:");
642 data_buffer.append_string(
pg_name.c_str());
644 data_buffer.append_string(
"\tVN:");
645 data_buffer.append_string(
pg_version.c_str());
646 data_buffer.append_string(
"\n");
651 header_len = data_buffer.get_pos() - pos_start_header;
652 data_buffer.poke_int32(pos_l_text, header_len);
662 data_buffer.append_int32(
int32( strlen(name) + 1 ) );
664 data_buffer.append_string(name);
665 data_buffer.append_int8(0);
681 if (data_buffers[ buffer_id ].get_pos())
689 static const unsigned char magic[28] = { 0037, 0213, 0010, 0004, 0000, 0000, 0000, 0000, 0000,
690 0377, 0006, 0000, 0102, 0103, 0002, 0000, 0033, 0000,
691 0003, 0000, 0000, 0000, 0000, 0000, 0000, 0000, 0000, 0000 };
693 fwrite(magic,
sizeof(magic), 1, fp);