forked from samtools/hts-specs
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathSAMv1.tex
965 lines (892 loc) · 55.3 KB
/
SAMv1.tex
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
\documentclass[10pt]{article}
\usepackage{color}
\definecolor{gray}{rgb}{0.7,0.7,0.7}
\usepackage{framed}
\usepackage{enumitem}
\usepackage{longtable}
\usepackage[pdfborder={0 0 0}]{hyperref}
\addtolength{\textwidth}{3.4cm}
\addtolength{\hoffset}{-1.7cm}
\addtolength{\textheight}{4cm}
\addtolength{\voffset}{-2cm}
\makeindex
\begin{document}
\input{SAMv1.ver}
\title{Sequence Alignment/Map Format Specification}
\author{The SAM/BAM Format Specification Working Group}
\date{\headdate}
\maketitle
\begin{quote}\small
The master version of this document can be found at
\url{https://github.com/samtools/hts-specs}.
This printing is version~\commitdesc\ from that repository,
last modified on the date shown above.
\end{quote}
\vspace*{1em}
\section{The SAM Format Specification}
SAM stands for Sequence Alignment/Map format. It is a TAB-delimited text
format consisting of a header section, which is optional, and an
alignment section. If present, the header must be prior to the
alignments. Header lines start with `{\tt @}', while alignment lines do
not. Each alignment line has 11 mandatory fields for essential alignment
information such as mapping position, and a variable number of optional
fields for flexible or aligner specific information.
\subsection{An example}\label{sec:example}
Suppose we have the following alignment with bases in lower case
clipped from the alignment. Reads {\tt r001/1} and {\tt r001/2}
constitute a read pair; {\tt r003} is a chimeric read; {\tt r004}
represents a split alignment.
\begin{framed}\small
\begin{verbatim}
Coor 12345678901234 5678901234567890123456789012345
ref AGCATGTTAGATAA**GATAGCTGTGCTAGTAGGCAGTCAGCGCCAT
+r001/1 TTAGATAAAGGATA*CTG
+r002 aaaAGATAA*GGATA
+r003 gcctaAGCTAA
+r004 ATAGCT..............TCAGC
-r003 ttagctTAGGC
-r001/2 CAGCGGCAT
\end{verbatim}
\end{framed}
The corresponding SAM format is:
\begin{framed}\small
\begin{verbatim}
@HD VN:1.5 SO:coordinate
@SQ SN:ref LN:45
r001 163 ref 7 30 8M2I4M1D3M = 37 39 TTAGATAAAGGATACTG *
r002 0 ref 9 30 3S6M1P1I4M * 0 0 AAAAGATAAGGATA *
r003 0 ref 9 30 5S6M * 0 0 GCCTAAGCTAA * SA:Z:ref,29,-,6H5M,17,0;
r004 0 ref 16 30 6M14N5M * 0 0 ATAGCTTCAGC *
r003 2064 ref 29 17 6H5M * 0 0 TAGGC * SA:Z:ref,9,+,5S6M,30,1;
r001 83 ref 37 30 9M = 7 -39 CAGCGGCAT * NM:i:1
\end{verbatim}
\end{framed}
\pagebreak
\subsection{Terminologies and Concepts}
\begin{description}
\item[Template] A DNA/RNA sequence part of which is sequenced on a
sequencing machine or assembled from raw sequences.
\item[Segment] A contiguous sequence or subsequence.
\item[Read] A raw sequence that comes off a sequencing machine. A read
may consist of multiple segments. For sequencing data, reads are indexed by
the order in which they are sequenced.
\item[Linear alignment]
An alignment of a read to a single reference sequence that may include
insertions, deletions, skips and clipping, but may not include direction
changes (i.e. one portion of the alignment on forward strand and another
portion of alignment on reverse strand). A linear alignment can be
represented in a single SAM record.
% A alignment of a read to a single locus on the reference, where the
% alignment may have short insertions/deletions but does not have long gaps
% (e.g. due to introns or structural variation).
\item[Chimeric alignment]
An alignment of a read that cannot be represented as a linear alignment. A
chimeric alignment is represented as a set of linear alignments that do not
have large overlaps. Typically, one of the linear alignments in a chimeric
alignment is considered the ``representative'' alignment, and the others are
called ``supplementary'' and are distinguished by the supplementary alignment
flag. All the SAM records in a chimeric alignment have the same {\sf QNAME}
and the same values for 0x40 and 0x80 flags (see Section 1.4). The decision
regarding which linear alignment is representative is arbitrary.
\item[Read alignment]
A linear alignment or a chimeric alignment that is the complete
representation of the alignment of the read.
\item[Multiple mapping]
The correct placement of a read may be ambiguous, e.g. due to repeats. In
this case, there may be multiple read alignments for the same read. One of
these alignments is considered primary. All the other alignments have the
secondary alignment flag set in the SAM records that represent them. All the
SAM records have the same {\sf QNAME} and the same values for 0x40 and 0x80
flags. Typically the alignment designated primary is the best alignment, but
the decision may be arbitrary.\footnotemark[1]
\item[1-based coordinate system] A coordinate system where the first
base of a sequence is one. In this coordinate system, a region is
specified by a closed interval. For example, the region between the
3rd and the 7th bases inclusive is $[3,7]$. The SAM, VCF, GFF and Wiggle
formats use the 1-based coordinate system.
\item[0-based coordinate system] A coordinate system where the first
base of a sequence is zero. In this coordinate system, a region is
specified by a half-closed-half-open interval. For example, the region
between the 3rd and the 7th bases inclusive is $[2,7)$. The BAM, BCFv2, BED,
and PSL formats use the 0-based coordinate system.
\item[Phred scale] Given a probability $0<p\le 1$, the phred scale of $p$
equals $-10\log_{10}p$, rounded to the closest integer.
\end{description}
\footnotetext[1]{A chimeric alignment is primarily caused by structural
variations, gene fusions, misassemblies, RNA-seq or experimental protocols. It is more frequent given longer
reads. For a chimeric alignment, the linear alignments constituting the alignment are largely
non-overlapping; each linear alignment may have high mapping quality and is
informative in SNP/INDEL calling. In contrast, multiple mappings are caused primarily by repeats. They are less frequent
given longer reads. If a read has multiple mappings, all these mappings are
almost entirely overlapping with each other; except the single-best optimal
mapping, all the other mappings get mapping quality $<$Q3
and are ignored by most SNP/INDEL callers.}
\subsection{The header section}
Each header line begins with character `{\tt @}' followed by a
two-letter record type code. In the header, each line is TAB-delimited
and except the {\tt @CO} lines, each data field follows a format `{\tt TAG:VALUE}' where {\tt TAG}
is a two-letter string that defines the content and the format of {\tt
VALUE}. Each header line should match: {\tt
/\char94@[A-Za-z][A-Za-z](\char92t[A-Za-z][A-Za-z0-9]:[
-\char126]+)+\$/} or {\tt /\char94@CO\char92t.*/}. Tags containing lowercase letters are reserved for
end users.
The following table gives the defined record types and tags. Tags with
`*' are required when the record type is present.
\begin{center}
\small
\begin{longtable}{|l|l|p{13.5cm}|}
\cline{1-3}
\multicolumn{2}{|l|}{\bf Tag} & {\bf Description} \\
\cline{1-3}
\multicolumn{2}{|l}{\tt @HD} & The header line. The first line if present. \\\cline{2-3}
& {\tt VN}* & Format version. \emph{Accepted format}: {\tt /\char94[0-9]+\char92.[0-9]+\$/}.\\\cline{2-3}
& {\tt SO} & Sorting order of alignments. \emph{Valid values}: {\tt unknown} (default), {\tt
unsorted}, {\tt queryname} and {\tt coordinate}. For coordinate sort, the major sort
key is the {\sf RNAME} field, with order defined by the order of {\tt @SQ} lines in the header. The
minor sort key is the {\sf POS} field. For alignments with equal {\sf RNAME} and {\sf POS}, order is
arbitrary. All alignments with `{\tt *}' in {\sf RNAME} field follow alignments with some other
value but otherwise are in arbitrary order.\\\cline{1-3}
\multicolumn{2}{|l}{\tt @SQ} & Reference sequence dictionary. The order of {\tt @SQ} lines defines the alignment sorting order.\\\cline{2-3}
& {\tt SN}* & Reference sequence name. Each {\tt @SQ} line must have a unique {\tt SN} tag. The value of this
field is used in the
alignment records in RNAME and RNEXT fields. Regular expression: {\tt [!-)+-\char60\char62-\char126][!-\char126]*}\\\cline{2-3}
& {\tt LN}* & Reference sequence length. \emph{Range}: {\tt [1,2$^{31}$-1]}\\\cline{2-3}
& {\tt AS} & Genome assembly identifier. \\\cline{2-3}
& {\tt M5} & MD5 checksum of the sequence in uppercase, excluding spaces but including pads (as `*'s).\\\cline{2-3}
& {\tt SP} & Species.\\\cline{2-3}
& {\tt UR} & URI of the sequence. This value may start with one of the standard
protocols, e.g. http: or ftp:. If it does not start with one of these protocols, it is assumed to be a file-system path.\\\cline{1-3}
\multicolumn{2}{|l}{\tt @RG} & Read group. Unordered multiple {\tt @RG} lines are allowed.\\\cline{2-3}
& {\tt ID}* & Read group identifier. Each {\tt @RG} line must have a unique {\tt ID}. The value of {\tt ID}
is used in the RG tags of alignment records. Must be unique among all read groups in header section. Read group IDs may be modified when merging SAM files in order to handle collisions.\\\cline{2-3}
& {\tt CN} & Name of sequencing center producing the read.\\\cline{2-3}
& {\tt DS} & Description.\\\cline{2-3}
& {\tt DT} & Date the run was produced (ISO8601 date or date/time).\\\cline{2-3}
& {\tt FO} & Flow order. The array of nucleotide bases that correspond to the nucleotides used for each flow of each read.
Multi-base flows are encoded in IUPAC format, and non-nucleotide flows by various other characters. \emph{Format}: {\tt /\char92*|[ACMGRSVTWYHKDBN]+/}\\\cline{2-3}
& {\tt KS} & The array of nucleotide bases that correspond to the key sequence of each read.\\\cline{2-3}
& {\tt LB} & Library.\\\cline{2-3}
& {\tt PG} & Programs used for processing the read group.\\\cline{2-3}
& {\tt PI} & Predicted median insert size.\\\cline{2-3}
& {\tt PL} & Platform/technology used to produce the reads. \emph{Valid values}:
{\tt CAPILLARY}, {\tt LS454}, {\tt ILLUMINA}, {\tt SOLID}, {\tt HELICOS}, {\tt IONTORRENT} and {\tt PACBIO}.\\\cline{2-3}
& {\tt PU} & Platform unit (e.g. flowcell-barcode.lane for Illumina or slide for SOLiD). Unique identifier.\\\cline{2-3}
& {\tt SM} & Sample. Use pool name where a pool is being sequenced.\\\cline{1-3}
\multicolumn{2}{|l}{\tt @PG} & Program. \\\cline{2-3}
& {\tt ID}* & Program record identifier. Each {\tt @PG} line must have a unique {\tt ID}.
The value of {\tt ID} is used in the alignment {\tt PG} tag and {\tt PP} tags of other {\tt @PG} lines.
{\tt PG} IDs may be modified when merging SAM files in order to handle collisions.\\\cline{2-3}
& {\tt PN} & Program name \\\cline{2-3}
& {\tt CL} & Command line \\\cline{2-3}
& {\tt PP} & Previous {\tt @PG-ID}. Must match another {\tt @PG} header's {\tt ID} tag.
{\tt @PG} records may be chained using {\tt PP} tag, with the last record in the chain
having no {\tt PP} tag. This chain defines the order of programs that have been applied to the alignment.
{\tt PP} values may be modified when merging SAM files in order to handle collisions of {\tt PG} {\tt ID}s.
The first {\tt PG} record in a chain (i.e. the one referred to by the {\tt PG} tag in a SAM record)
describes the most recent program that operated on the SAM record.
The next {\tt PG} record in the chain describes the next most recent program that
operated on the SAM record. The {\tt PG} {\tt ID} on a SAM record is not required
to refer to the newest {\tt PG} record in a chain. It may refer to any {\tt PG}
record in a chain, implying that the SAM record has been operated on by the
program in that {\tt PG} record, and the program(s) referred to via the {\tt PP} tag. \\\cline{2-3}
& {\tt VN} & Program version \\\cline{1-3}
\multicolumn{2}{|l}{\tt @CO} & One-line text comment. Unordered multiple {\tt @CO} lines are allowed.\\
\cline{1-3}
\end{longtable}
\end{center}
\subsection{The alignment section: mandatory fields}
In the SAM format, each alignment line typically represents the linear
alignment of a segment. Each line has 11
mandatory fields. These fields always appear in the same order and must be
present, but their values can be `0' or `*' (depending on the field) if the
corresponding information is unavailable. The following table gives an overview
of the mandatory fields in the SAM format:
\begin{center}
\small
\begin{tabular}{rllll}
\hline
{\bf Col} & {\bf Field} & {\bf Type} & {\bf Regexp/Range} & {\bf Brief description} \\
\hline
1 & {\sf QNAME} & String & {\tt [!-?A-\char126]\{1,255\}} & Query template NAME\\
2 & {\sf FLAG} & Int & {\tt [0,2$^{16}$-1]} & bitwise FLAG \\
3 & {\sf RNAME} & String & {\tt \char92*|[!-()+-\char60\char62-\char126][!-\char126]*} & Reference sequence NAME\\
4 & {\sf POS} & Int & {\tt [0,2$^{31}$-1]} & 1-based leftmost mapping POSition \\
5 & {\sf MAPQ} & Int & {\tt [0,2$^8$-1]} & MAPping Quality \\
6 & {\sf CIGAR} & String & {\tt \char92*|([0-9]+[MIDNSHPX=])+} & CIGAR string \\
7 & {\sf RNEXT} & String & {\tt \char92*|=|[!-()+-\char60\char62-\char126][!-\char126]*} & Ref. name of the mate/next read\\
8 & {\sf PNEXT} & Int & {\tt [0,2$^{31}$-1]} & Position of the mate/next read \\
9 & {\sf TLEN} & Int & {\tt [-2$^{31}$+1,2$^{31}$-1]} & observed Template LENgth \\
10 & {\sf SEQ} & String & {\tt \char92*|[A-Za-z=.]+} & segment SEQuence\\
11 & {\sf QUAL} & String & {\tt [!-\char126]+} & ASCII of Phred-scaled base QUALity+33 \\
\hline
\end{tabular}
\end{center}
\begin{enumerate}
\item {\sf QNAME}: Query template NAME. Reads/segments having identical {\sf QNAME}
are regarded to come from the same template. A {\sf QNAME} `{\tt *}'
indicates the information is unavailable. In a SAM file, a read may occupy
multiple alignment lines, when its alignment is chimeric or when multiple
mappings are given.
\item {\sf FLAG}: bitwise FLAG. Each bit is explained in the following
table:
\begin{center}\small
\begin{tabular}{rl}
\hline
Bit & Description\\
\hline
0x1 & template having multiple segments in sequencing \\
0x2 & each segment properly aligned according to the aligner \\
0x4 & segment unmapped \\
0x8 & next segment in the template unmapped \\
0x10 & {\sf SEQ} being reverse complemented \\
0x20 & {\sf SEQ} of the next segment in the template being reverse complemented \\
0x40 & the first segment in the template \\
0x80 & the last segment in the template \\
0x100 & secondary alignment \\
0x200 & not passing quality controls \\
0x400 & PCR or optical duplicate \\
0x800 & supplementary alignment \\
\hline
\end{tabular}
\end{center}
\begin{itemize}
\item For each read/contig in a SAM file, it is required that one and only
one line associated with the read satisfies \mbox{`{\sf FLAG} {\tt \& 0x900
== 0}'}. This line is called the \emph{primary line} of the read.
\item Bit 0x100 marks the alignment not to be used in certain analyses
when the tools in use are aware of this bit. It is typically used to
flag alternative mappings when multiple mappings are presented in a SAM.
\item Bit 0x800 indicates that the corresponding alignment line is part of
a chimeric alignment. A line flagged with 0x800 is called a \emph{supplementary line}.
\item Bit 0x4 is the only reliable place to tell whether the read
is unmapped. If 0x4 is set, no assumptions can be made about {\sf
RNAME}, {\sf POS}, {\sf CIGAR}, {\sf MAPQ}, bits 0x2, 0x10, 0x100
and 0x800, and the bit 0x20 of the previous read in the template.
\item If 0x40 and 0x80 are both set, the read is part of a linear
template, but it is neither the first nor the last read. If both
0x40 and 0x80 are unset, the index of the read in the template
is unknown. This may happen for a non-linear template or the index
is lost in data processing.
\item If 0x1 is unset, no assumptions can be made about 0x2, 0x8,
0x20, 0x40 and 0x80.
\end{itemize}
\item {\sf RNAME}: Reference sequence NAME of the alignment. If {\tt
@SQ} header lines are present, {\sf RNAME} (if not `*') must be
present in one of the {\tt SQ-SN} tags. An unmapped segment without
coordinate has a `*' at this field. However, an unmapped segment may
also have an ordinary coordinate such that it can be placed at a
desired position after sorting. If {\sf RNAME} is `*', no assumptions
can be made about {\sf POS} and {\sf CIGAR}.
\item {\sf POS}: 1-based leftmost mapping POSition of the first matching
base. The first base in a reference sequence has coordinate 1. {\sf
POS} is set as 0 for an unmapped read without coordinate. If {\sf
POS} is 0, no assumptions can be made about {\sf RNAME} and {\sf
CIGAR}.
\item {\sf MAPQ}: MAPping Quality. It equals
$-10\log_{10}\Pr\{\mbox{mapping position is wrong}\}$, rounded to the
nearest integer. A value 255 indicates that the mapping quality is not
available.
\item {\sf CIGAR}: CIGAR string. The CIGAR operations are given in the
following table (set `*' if unavailable):
\begin{center}\small
\begin{tabular}{ccl}
\hline
Op & BAM & Description\\
\hline
{\tt M} & 0 & alignment match (can be a sequence match or mismatch)\\
{\tt I} & 1 & insertion to the reference \\
{\tt D} & 2 & deletion from the reference \\
{\tt N} & 3 & skipped region from the reference \\
{\tt S} & 4 & soft clipping (clipped sequences present in {\sf SEQ})\\
{\tt H} & 5 & hard clipping (clipped sequences NOT present in {\sf SEQ})\\
{\tt P} & 6 & padding (silent deletion from padded reference)\\
{\tt =} & 7 & sequence match \\
{\tt X} & 8 & sequence mismatch \\
\hline
\end{tabular}
\end{center}
\begin{itemize}
\item {\tt H} can only be present as the first and/or last operation.
\item {\tt S} may only have {\tt H} operations between them and the
ends of the {\sf CIGAR} string.
\item For mRNA-to-genome alignment, an {\tt N} operation represents an
intron. For other types of alignments, the interpretation of {\tt N}
is not defined.
\item Sum of lengths of the {\tt M/I/S/=/X} operations shall equal
the length of {\sf SEQ}.
\end{itemize}
\item {\sf RNEXT}: Reference sequence name of the primary alignment of the NEXT read in the
template. For the last read, the next read is the first
read in the template. If {\tt @SQ} header lines are present, {\sf
RNEXT} (if not `*' or `=') must be present in one of the {\tt SQ-SN}
tags. This field is set as `*' when the information is unavailable, and
set as `=' if {\sf RNEXT} is identical to {\sf RNAME}. If not `=' and the
next read in the template has one primary mapping (see also bit
0x100 in {\sf FLAG}), this field is identical to {\sf RNAME} at the primary line of the
next read. If {\sf
RNEXT} is `*', no assumptions can be made on {\sf PNEXT} and bit
0x20.
\item {\sf PNEXT}: Position of the primary alignment of the NEXT read in the template. Set as
0 when the information is unavailable. This field equals {\sf POS} at the primary line of
the next read. If {\sf PNEXT} is 0, no assumptions can be made on
{\sf RNEXT} and bit 0x20.
\item {\sf TLEN}: signed observed Template LENgth. If all segments are
mapped to the same reference, the unsigned observed template length
equals the number of bases from the leftmost mapped base to the
rightmost mapped base. The leftmost segment has a plus sign and the
rightmost has a minus sign. The sign of segments in the middle is
undefined. It is set as 0 for single-segment template or when the
information is unavailable.
\item {\sf SEQ}: segment SEQuence. This field can be a `*' when the
sequence is not stored. If not a `*', the length of the sequence must
equal the sum of lengths of {\tt M/I/S/=/X} operations in {\sf CIGAR}.
An `=' denotes the base is identical to the reference base. No
assumptions can be made on the letter cases.
\item {\sf QUAL}: ASCII of base QUALity plus 33 (same as the quality
string in the Sanger FASTQ format). A base quality is the phred-scaled
base error probability which equals $-10\log_{10}\Pr\{\mbox{base is
wrong}\}$. This field can be a `*' when quality is not stored. If
not a `*', {\sf SEQ} must not be a `*' and the length of the quality string
ought to equal the length of {\sf SEQ}.
\end{enumerate}
\subsection{The alignment section: optional fields}
All optional fields follow the {\tt TAG:TYPE:VALUE} format
where {\tt TAG} is a two-character string that matches {\tt /[A-Za-z][A-Za-z0-9]/}.
Each {\tt TAG} can only appear once in one alignment line. A {\tt TAG}
containing lowercase letters is reserved for end users.
In an optional field, {\tt TYPE} is a single case-sensitive letter which
defines the format of {\tt VALUE}:
\begin{center}\small
\begin{tabular}{cll}
\hline
{\bf Type} & {\bf Regexp matching {\tt VALUE}} & {\bf Description} \\
\hline
A & {\tt [!-\char126]} & Printable character \\
i & {\tt [-+]?[0-9]+} & Signed 32-bit integer \\
f & {\tt [-+]?[0-9]*\char92.?[0-9]+([eE][-+]?[0-9]+)?} & Single-precision floating number \\
Z & {\tt [\,\,\,!-\char126]+} & Printable string, including space\\
H & {\tt [0-9A-F]+} & Byte array in the Hex format\footnotemark[2]\\
B & {\tt [cCsSiIf](,[-+]?[0-9]*\char92.?[0-9]+([eE][-+]?[0-9]+)?)+} & Integer or numeric array\\
\hline
\end{tabular}
\footnotetext[2]{For example, a byte array {\tt \{0x1a,0xe3,0x1\}} corresponds to a Hex string `{\tt 1AE301}'.}
\end{center}
For an integer or numeric array (type `{\tt B}'), the first letter indicates the type of numbers
in the following comma separated array. The letter can be one of `{\tt cCsSiIf}', corresponding to
{\tt int8\_t} (signed 8-bit integer), {\tt uint8\_t} (unsigned 8-bit integer), {\tt int16\_t}, {\tt uint16\_t}, {\tt int32\_t}, {\tt uint32\_t}
and {\tt float}, respectively\footnotemark[3]. During import/export, the element type
may be changed if the new type is also compatible with the array.
\footnotetext[3]{Explicit typing eases format parsing and helps to reduce the file size when SAM is converted to BAM.}
{Predefined tags are shown in the following table. You can
freely add new tags, and if a new tag may be of general interest, you
can email {\tt samtools-devel@lists.sourceforge.net} to add the new tag
to the specification. Note that tags starting with `{\tt X}', `{\tt Y}'
and `{\tt Z}' or tags containing lowercase letters in either position
are reserved for local use and will not be formally
defined in any future version of this specification.}
\begin{center}\small
\begin{longtable}{ccp{12.5cm}}
\hline
{\bf Tag\footnotemark[4]} & {\bf Type} & {\bf Description} \\
\hline
{\tt X?} & ? & Reserved fields for end users (together with {\tt Y?} and {\tt Z?}) \\
{\tt AM} & i & The smallest template-independent mapping quality of segments in the rest \\
{\tt AS} & i & Alignment score generated by aligner \\
{\tt BC} & Z & Barcode sequence, with any quality scores stored in the {\tt QT} tag. \\
{\tt BQ} & Z & Offset to base alignment quality (BAQ), of the same length as the read sequence.
At the $i$-th read base, ${\rm BAQ}_i=Q_i-({\rm BQ}_i-64)$ where $Q_i$ is the $i$-th base quality. \\
{\tt CC} & Z & Reference name of the next hit; `{\tt =}' for the same chromosome \\
{\tt CM} & i & Edit distance between the color sequence and the color reference (see also {\tt NM})\\
{\tt CO} & Z & Free-text comments \\
{\tt CP} & i & Leftmost coordinate of the next hit \\
{\tt CQ} & Z & Color read quality on the original strand of the read. Same encoding as {\sf QUAL}; same length as {\tt CS}.\\
{\tt CS} & Z & Color read sequence on the original strand of the read. The primer base must be included.\\
{\tt CT} & Z & Complete read annotation tag, used for consensus annotation dummy features\footnotemark[6].\\
{\tt E2} & Z & The 2nd most likely base calls. Same encoding and same length as {\sf QUAL}.\\
{\tt FI} & i & The index of segment in the template.\\
{\tt FS} & Z & Segment suffix.\\
{\tt FZ} & B,S & Flow signal intensities on the original strand of the read, stored as {\tt (uint16\_t) round(value * 100.0)}. \\
{\tt LB} & Z & Library. Value to be consistent with the header {\tt RG-LB} tag if {\tt @RG} is present.\\
{\tt H0} & i & Number of perfect hits\\
{\tt H1} & i & Number of 1-difference hits (see also {\tt NM})\\
{\tt H2} & i & Number of 2-difference hits \\
{\tt HI} & i & Query hit index, indicating the alignment record is the i-th one stored in SAM\\
{\tt IH} & i & Number of stored alignments in SAM that contains the query in the current record\\
{\tt MD} & Z & String for mismatching positions. \emph{Regex}: {\tt [0-9]+(([A-Z]|\char92\char94[A-Z]+)[0-9]+)*}\footnotemark[5]\\
{\tt MQ} & i & Mapping quality of the mate/next segment \\
{\tt NH} & i & Number of reported alignments that contains the query in the current record\\
{\tt NM} & i & Edit distance to the reference, including ambiguous bases but excluding clipping\\
{\tt OQ} & Z & Original base quality (usually before recalibration). Same encoding as {\sf QUAL}.\\
{\tt OP} & i & Original mapping position (usually before realignment) \\
{\tt OC} & Z & Original CIGAR (usually before realignment) \\
{\tt PG} & Z & Program. Value matches the header {\tt PG-ID} tag if {\tt @PG} is present. \\
{\tt PQ} & i & Phred likelihood of the template, conditional on both the mapping being correct \\
{\tt PT} & Z & Read annotations for parts of the padded read sequence\footnotemark[7]\\
{\tt PU} & Z & Platform unit. Value to be consistent with the header {\tt RG-PU} tag if {\tt @RG} is present.\\
{\tt QT} & Z & Phred quality of the barcode sequence in the {\tt BC} (or {\tt RT}) tag. Same encoding as {\sf QUAL}. \\
{\tt Q2} & Z & Phred quality of the mate/next segment sequence in the {\tt R2} tag. Same encoding as {\sf QUAL}.\\
{\tt R2} & Z & Sequence of the mate/next segment in the template. \\
{\tt RG} & Z & Read group. Value matches the header {\tt RG-ID} tag if {\tt @RG} is present in the header. \\
{\tt RT} & Z & Deprecated alternative to {\tt BC} tag originally used at Sanger. \\
{\tt SA} & Z & Other canonical alignments in a chimeric alignment, in the format of: (\emph{rname},\emph{pos},\emph{strand},\emph{CIGAR},\emph{mapQ},\emph{NM};)+.
Each element in the semi-colon delimited list represents a part of the chimeric alignment. Conventionally, at a supplementary line,
the first element points to the primary line.\\
{\tt SM} & i & Template-independent mapping quality \\
{\tt TC} & i & The number of segments in the template.\\
{\tt U2} & Z & Phred probability of the 2nd call being wrong conditional on the best being wrong. The same encoding as {\sf QUAL}. \\
{\tt UQ} & i & Phred likelihood of the segment, conditional on the mapping being correct \\
\hline
\end{longtable}
\end{center}
\footnotetext[4]{The {\tt GS}, {\tt GC}, {\tt GQ}, {\tt MF}, {\tt S2}
and {\tt SQ} are reserved for backward compatibility.}
\footnotetext[5]{The MD field aims to achieve SNP/indel calling without looking at
the reference. For example, a string `{\tt 10A5\char94AC6}' means from
the leftmost reference base in the alignment, there are 10 matches
followed by an A on the reference which is different from the aligned
read base; the next 5 reference bases are matches followed by a 2bp
deletion from the reference; the deleted sequence is AC; the last 6
bases are matches. The {\tt MD} field ought to match the {\sf CIGAR}
string.}
\footnotetext[6]{The {\tt CT} tag is intended primarily for annotation
dummy reads, and consists of a \emph{strand}, \emph{type} and zero or
more \emph{key}=\emph{value} pairs, each separated with semicolons.
The \emph{strand} field has four values as in GFF3, and supplements FLAG
bit 0x10 to allow unstranded (`{\tt .}'), and stranded but unknown strand
(`{\tt ?}') annotation. For these and annotation on the forward strand
(\emph{strand} set to `{\tt +}'), do not set FLAG bit 0x10. For
annotation on the reverse strand, set the \emph{strand} to `{\tt -}'
and set FLAG bit 0x10. The \emph{type} and any \emph{keys} and their
optional \emph{values} are all percent encoded according to
RFC3986 to escape meta-characters `{\tt =}', `{\tt \%}', `{\tt ;}',
`{\tt |}' or non-printable characters not matched by the isprint()
macro (with the C locale). For example a percent sign becomes
`{\tt \%25}'. The CT record matches:
``{\tt \emph{strand};\emph{type}(;\emph{key}(=\emph{value}))*}''.
%NOTE - This leaves open the possibility of allowing multiple such
%entries for a single CT tag to be combined with | as in the PT tag.
}%End of CT tag footnote
\footnotetext[7]{The {\tt PT} tag value has the format of a series of
tags separated by {\tt |}, each annotating a sub-region of the read.
Each tag consists of \emph{start}, \emph{end}, \emph{strand},
\emph{type} and zero or more \emph{key}=\emph{value} pairs, each
separated with semicolons. \emph{Start} and \emph{end} are 1-based
positions between one and the sum of the {\tt M/I/D/P/S/=/X}
{\sf CIGAR} operators, i.e. {\sf SEQ} length plus any pads. Note
any editing of the CIGAR string may require updating the `{\tt PT}'
tag coordinates, or even invalidate them.
As in GFF3, \emph{strand} is one of `{\tt +}' for forward strand tags,
`{\tt -}' for reverse strand, `{\tt .}' for unstranded or `{\tt ?}'
for stranded but unknown strand.
The \emph{type} and any \emph{keys} and their optional \emph{values}
are all percent encoded as in the {\tt CT} tag.
Formally the entire PT record matches:
``{\tt \emph{start};\emph{end};\emph{strand};\emph{type}(;\emph{key}(=\emph{value}))*(\char92|\emph{start};\emph{end};\emph{strand};\emph{type}(;\emph{key}(=\emph{value}))*)*}''.
}%End of PT tag footnote
\pagebreak
\section{Recommended Practice for the SAM Format}
\label{sec-recommended-practice}
This section describes best practices for representing data in the
SAM format. These practices are not required in general, but may be
required by a specific software package for it to function properly.
\begin{enumerate}
\item The header section
\begin{enumerate}[label=\arabic*]
\item The {\tt @HD} line should be present with the {\tt SO} tag specified.
\item The {\tt @SQ} lines should be present if reads have been mapped.
\item When a {\tt RG} tag appears anywhere in the alignment section,
there should be a single corresponding {\tt @RG} line with matching
{\tt ID} tag in the header.
\item When a {\tt PG} tag appears anywhere in the alignment section,
there should be a single corresponding {\tt @PG} line with matching
{\tt ID} tag in the header.
\end{enumerate}
\item Adjacent CIGAR operations should be different.
\item No alignments should be assigned mapping quality 255.
\item Unmapped reads
\begin{enumerate}[label=\arabic*]
\item For an unmapped paired-end or mate-pair read whose mate is
mapped, the unmapped read should have {\sf RNAME} and {\sf POS}
identical to its mate.
\item If all segments in a template are unmapped, their {\sf RNAME}
should be set as `*' and {\sf POS} as 0.
\item If {\sf POS} plus the sum of lengths of {\tt M/=/X/D/N}
operations in {\sf CIGAR} exceeds the length specified in the {\tt
LN} field of the {\tt @SQ} header line (if exists) with an SN
equal to {\sf RNAME}, the alignment should be unmapped.
\end{enumerate}
\item Multiple mapping
\begin{enumerate}[label=\arabic*]
\item When one segment is present in multiple lines to represent a multiple
mapping of the segment, only one of these records should have the secondary
alignment flag bit (0x100) unset. {\sf RNEXT} and {\sf PNEXT} point to the
primary line of the next read in the template.
\item {\sf SEQ} and {\sf QUAL} of secondary alignments should be set
to `*' to reduce the file size.
\end{enumerate}
%\item There should be no overlap between segments of a read\footnote{Few/no
% existing aligners follow this practice.}.
\item Optional tags:
\begin{enumerate}[label=\arabic*]
\item If the template has more than 2 segments, the {\tt TC} tag
should be present.
\item The {\tt NM} tag should be present.
\end{enumerate}
\item Annotation dummy reads:
These have {\sf SEQ} set to {\tt *}, {\sf FLAG} bits 0x100 and 0x200
set (secondary and filtered), and a {\tt CT} tag.
\begin{enumerate}[label=\arabic*]
%Repeating what is in the tag's footnote:
% \item If the {\tt CT} tag's \emph{strand} is {\tt -}, FLAG bit 0x10
% (reverse complemented) should be set, and otherwise not set.
\item If you wish to store free text in a {\tt CT} tag, use the
\emph{key} value {\tt Note} (uppercase N) to match GFF3.
\item Multi-segment annotation (e.g. a gene with introns) should be
described with multiple lines in SAM (like a multi-segment read).
Where there is a clear biological direction (e.g. a gene), the first
segment ({\sf FLAG} bit 0x40) is used for the first section (e.g. the
$5'$ end of the gene). Thus a GenBank entry location like
{\tt complement(join(85052..85354,} {\tt 85441..85621,} {\tt 86097..86284))}
would have three lines in SAM with a common {\sf QNAME}: %And three lines in GFF3 too.
\begin{enumerate}
\item The $5'$ fragment {\sf FLAG} 883, {\sf POS} 86097, {\sf CIGAR} {\tt 188M}, and tags {\tt FI:i:1} and {\tt TC:i:3} %FLAG = 0x1 + 0x2 + 0x10 + 0x20 + 0x40 + 0x100 + 0x200
\item Middle fragment {\sf FLAG} 819, {\sf POS} 85441, {\sf CIGAR} {\tt 181M}, and tags {\tt FI:i:2} and {\tt TC:i:3} %FLAG = 0x1 + 0x2 + 0x10 + 0x20 + 0x100 + 0x200
\item The $3'$ fragment {\sf FLAG} 947, {\sf POS} 85052, {\sf CIGAR} {\tt 303M}, and tags {\tt FI:i:3} and {\tt TC:i:3} %FLAG = 0x1 + 0x2 + 0x10 + 0x20 + 0x80 + 0x100 + 0x200
\end{enumerate}
\item If converting GFF3 to SAM, store any \emph{key}, \emph{values}
from column 9 in the {\tt CT} tag, except for the unique ID which
is used for the QNAME. GFF3 columns 1 (seqid), 4 (start) and 5 (end)
are encoded using SAM columns RNAME, POS and CIGAR to hold the length.
GFF3 columns 3 (type) and 7 (strand) are stored explicitly in the
{\tt CT} tag. Remaining GFF3 columns 2 (source), 6 (score), and
8 (phase) are stored in the {\tt CT} tag using \emph{key} values
{\tt FSource}, {\tt FScore} and {\tt FPhase} (uppercase keys are
restricted in GFF3, so these names avoid clashes). Split location
features are described with multiple lines in GFF3, and similarly
become multi-segment dummy reads in SAM, with the {\sf RNEXT} and
{\sf PNEXT} columns filled in appropriately. In the absence of a
convention in SAM/BAM for reads wrapping the origin of a circular
genome, any GFF3 feature line wrapping the origin must be split into
two segments in SAM.
\end{enumerate}
\end{enumerate}
\pagebreak
\section{Guide for Describing Assembly Sequences in SAM}
\subsection{Unpadded versus padded representation}
To describe alignments, we can regard the reference sequence with no respect to
other alignments against it. Such a reference sequence is called an
\emph{unpadded reference}. A position on an unpadded reference, referred to
as an \emph{unpadded position}, is not affected by any alignments. When we use
unpadded references and positions to describe alignments, we say we are using
the \emph{unpadded representation}.
Alternatively, to describe the same alignments, we can modify the reference
sequence to contain pads that make room for sequences inserted relative to the
reference. A pad is effectively a gap and conventionally represented by an
asterisk `*'. A reference sequence containing pads is called a \emph{padded
reference}. A position which counts the *'s is referred to as a \emph{padded
position}. A padded reference sequence may be affected by the query alignments
and because of gap insertions is typically longer than the unpadded reference.
The padded position of one query alignment may be affected by other query
alignments.
Unpadded and padded are different representations of the same alignments. They
are convertible to each other with no loss of any information. The unpadded
representation is more common due to the convenience of a fixed coordinate system,
while the padded representation has the advantage that alignments can be simply
described by the start and end coordinates without using complex CIGAR strings.
SAM traditionally uses the unpadded representation; the padded representation is typically used for {\it de novo} assembly.
The ACE assembly format uses the padded representation exclusively.
\subsection{Padded SAM}
The SAM format is typically used to describe alignments against an unpadded
reference sequence, but it is also able to describe alignments against a padded
reference. In the latter case, we say we are using a \emph{padded SAM}. A padded
SAM is a valid SAM, but with the difference that the reference and positions in
use are padded. There may be more than one way to describe the padded
representation. We recommend the following.
In a padded SAM, alignments and coordinates are described with respect to the
padded reference sequence. Unlike traditional padded representations like
the ACE file format where pads/gaps are recorded in reads using *'s, we do
not write *'s in the {\sf SEQ} field of the SAM format\footnote{Writing
pads/gaps as *'s in the {\sf SEQ} field might have been more convenient, but
this caused concerns for backward compatibility.}. Instead, we describe pads
in the query sequences as deletions from the padded reference using the
{\sf CIGAR} `{\tt D}' operation. In a padded SAM, the insertion and padding
CIGAR operations (`{\tt I}' and `{\tt P}') are not used because the padded
reference already considers all the insertions.
The following shows the padded SAM for the example alignment in Section~\ref{sec:example}.
Notably, the length of {\tt ref} is 47 instead of 45. {\sf POS} of the last
three alignments are all shifted by 2. {\sf CIGAR} of alignments bridging the
2bp insertion are also changed.
\begin{framed}\small
\begin{verbatim}
@HD VN:1.3 SO:coordinate
@SQ SN:ref LN:47
ref 516 ref 1 0 14M2D31M * 0 0 AGCATGTTAGATAAGATAGCTGTGCTAGTAGGCAGTCAGCGCCAT *
r001 163 ref 7 30 14M1D3M = 39 41 TTAGATAAAGGATACTG *
* 768 ref 8 30 1M * 0 0 * * CT:Z:.;Warning;Note=Ref wrong?
r002 0 ref 9 30 3S6M1D5M * 0 0 AAAAGATAAGGATA * PT:Z:1;4;+;homopolymer
r003 0 ref 9 30 5H6M * 0 0 AGCTAA * NM:i:1
r004 0 ref 18 30 6M14N5M * 0 0 ATAGCTTCAGC *
r003 16 ref 31 30 6H5M * 0 0 TAGGC * NM:i:0
r001 83 ref 39 30 9M = 7 -41 CAGCGCCAT *
\end{verbatim}
\end{framed}
Here we also exemplify the recommended practice for storing the reference
sequence and the reference annotations in SAM when necessary. For a reference
sequence in SAM, {\sf QNAME} should be identical to {\sf RNAME}, {\sf POS} set
to 1 and {\sf FLAG} to 516 (filtered and unmapped); for an annotation, {\sf
FLAG} should be set to 768 (filtered and secondary) with no restriction to {\sf
QNAME}. Dummy reads for annotation would typically have a `{\tt CT}' tag to
hold the annotation information, see Section~\ref{sec-recommended-practice}.
\pagebreak
\section{The BAM Format Specification}
\subsection{The BGZF compression format}
BGZF is block compression implemented on top of the standard gzip file
format. The goal of BGZF is to provide good compression while allowing
efficient random access to the BAM file for indexed queries. The BGZF
format is `gunzip compatible', in the sense that a compliant gunzip
utility can decompress a BGZF compressed file\footnote{It is worth noting that there is a known bug in the Java {\sf
GZIPInputStream} class that concatenated gzip archives cannot be
successfully decompressed by this class. BGZF files can be created and
manipulated using the built-in Java {\sf util.zip} package, but naive
use of {\sf GZIPInputStream} on a BGZF file will not work due to this
bug.}.
A BGZF archive is a series of concatenated BGZF blocks. Each BGZF block
is itself a spec-compliant gzip archive which contains an ``extra field''
in the format described in RFC1952. The gzip file format allows the
inclusion of application-specific extra fields and these are ignored by
compliant decompression implementations. The gzip specification also
allows gzip files to be concatenated. The result of decompressing
concatenated gzip files is the concatenation of the uncompressed data.
Each BGZF block contains a standard gzip file header with the following
standard-compliant extensions:
\begin{enumerate}
\item The {\sf F.EXTRA} bit in the header is set to indicate that extra
fields are present.
\item The extra field used by BGZF uses the two subfield ID values 66 and 67 (ascii `BC').
\item The length of the BGZF extra field payload (field {\sf LEN} in the
gzip specification) is 2 (two bytes of payload).
\item The payload of the BGZF extra field is a 16-bit unsigned integer
in little endian format. This integer gives the size of the containing
BGZF block minus one.
\end{enumerate}
On disk, a full BGZF file is (all integers are little endian as is
required by RFC1952):
\begin{table}[ht]
\centering
{\small
\begin{tabular}{|l|l|l|l|l|r|}
\cline{1-6}
\multicolumn{3}{|c|}{\bf Field} & \multicolumn{1}{c|}{\bf Description} & \multicolumn{1}{c|}{\bf Type} & \multicolumn{1}{c|}{\bf Value} \\\cline{1-6}
\multicolumn{6}{|c|}{\textcolor{gray}{\it List of compression blocks (until the end of the file)}} \\\cline{2-6}
& \multicolumn{2}{l|}{\sf ID1} & gzip IDentifier1 & {\tt uint8\_t} & 31 \\\cline{2-6}
& \multicolumn{2}{l|}{\sf ID2} & gzip IDentifier2 & {\tt uint8\_t} & 139 \\\cline{2-6}
& \multicolumn{2}{l|}{\sf CM} & gzip Compression Method & {\tt uint8\_t} & 8 \\\cline{2-6}
& \multicolumn{2}{l|}{\sf FLG} & gzip FLaGs & {\tt uint8\_t} & 4 \\\cline{2-6}
& \multicolumn{2}{l|}{\sf MTIME} & gzip Modification TIME & {\tt uint32\_t} & \\\cline{2-6}
& \multicolumn{2}{l|}{\sf XFL} & gzip eXtra FLags & {\tt uint8\_t} & \\\cline{2-6}
& \multicolumn{2}{l|}{\sf OS} & gzip Operating System & {\tt uint8\_t} & \\\cline{2-6}
& \multicolumn{2}{l|}{\sf XLEN} & gzip eXtra LENgth & {\tt uint16\_t} & \\\cline{2-6}
& \multicolumn{5}{c|}{\textcolor{gray}{\it Extra subfield(s) (total size=XLEN)}} \\\cline{3-6}
& & \multicolumn{4}{c|}{\textcolor{gray}{\it Additional RFC1952 extra subfields if present}} \\\cline{3-6}
& & {\sf SI1} & Subfield Identifier1 & {\tt uint8\_t} & 66 \\\cline{3-6}
& & {\sf SI2} & Subfield Identifier2 & {\tt uint8\_t} & 67 \\\cline{3-6}
& & {\sf SLEN} & Subfield LENgth & {\tt uint16\_t} & 2 \\\cline{3-6}
& & {\sf BSIZE} & total Block SIZE minus 1 & {\tt uint16\_t} & \\\cline{3-6}
& & \multicolumn{4}{c|}{\textcolor{gray}{\it Additional RFC1952 extra subfields if present}} \\\cline{2-6}
& \multicolumn{2}{l|}{\sf CDATA} & Compressed DATA by {\sf zlib::deflate()} & {\tt uint8\_t[{\sf BSIZE-XLEN-19}]} & \\\cline{2-6}
& \multicolumn{2}{l|}{\sf CRC32} & CRC-32 & {\tt uint32\_t} & \\\cline{2-6}
& \multicolumn{2}{l|}{\sf ISIZE} & Input SIZE (length of uncompressed data) & {\tt uint32\_t} & \\
\cline{1-6}
\end{tabular}}
\end{table}
BGZF files support random access through the BAM file index. To achieve
this, the BAM file index uses \emph{virtual file offsets} into the BGZF
file. Each virtual file offset is an unsigned 64-bit integer, defined as: {\tt
coffset\char60\char60 16\char124uoffset}, where {\tt coffset} is an
unsigned byte offset into the BGZF file to the beginning of a BGZF
block, and {\tt uoffset} is an unsigned byte offset into the
uncompressed data stream represented by that BGZF block. Virtual file
offsets can be compared, but subtraction between virtual file offsets
and addition between a virtual offset and an integer are both
disallowed.
\subsection{The BAM format}
BAM is compressed in the BGZF format. All multi-byte numbers in BAM are
little-endian, regardless of the machine endianness. The format is
formally described in the following table where values in brackets are
the default when the corresponding information is not available; an
underlined word in uppercase denotes a field in the SAM format.
\begin{table}[ht]
\centering
{\small
\begin{tabular}{|l|l|l|p{8.15cm}|l|r|}
\cline{1-6}
\multicolumn{3}{|c|}{\bf Field} & \multicolumn{1}{c|}{\bf Description} & \multicolumn{1}{c|}{\bf Type} & \multicolumn{1}{c|}{\bf Value} \\\cline{1-6}
\multicolumn{3}{|l|}{\sf magic} & BAM magic string & {\tt char[4]} & {\tt BAM\char92 1}\\\cline{1-6}
\multicolumn{3}{|l|}{\sf l\_text} & Length of the header text, including any {\tt NULL} padding & {\tt int32\_t} & \\\cline{1-6}
\multicolumn{3}{|l|}{\sf text} & Plain header text in SAM; not necessarily {\tt NULL} terminated & {\tt char[{\sf l\_text}]} & \\\cline{1-6}
\multicolumn{3}{|l|}{\sf n\_ref} & \# reference sequences & {\tt int32\_t} & \\\cline{1-6}
\multicolumn{6}{|c|}{\textcolor{gray}{\it List of reference information (n=n\_ref)}} \\\cline{2-6}
& \multicolumn{2}{l|}{\sf l\_name} & Length of the reference name plus 1 (including {\tt NULL}) & {\tt int32\_t} & \\\cline{2-6}
& \multicolumn{2}{l|}{\sf name} & Reference sequence name; {\tt NULL} terminated & {\tt char[{\sf l\_name}]} & \\\cline{2-6}
& \multicolumn{2}{l|}{\sf l\_ref} & Length of the reference sequence & {\tt int32\_t} & \\\cline{1-6}
\multicolumn{6}{|c|}{\textcolor{gray}{\it List of alignments (until the end of the file)}} \\\cline{2-6}
& \multicolumn{2}{l|}{\sf block\_size} & Length of the remainder of the alignment record & {\tt int32\_t} & \\\cline{2-6}
& \multicolumn{2}{l|}{\sf refID} & Reference sequence ID, $-1\leq{\sf refID}<{\sf n\_ref}$; -1 for a read without a mapping position. & {\tt int32\_t} & [-1] \\\cline{2-6}
& \multicolumn{2}{l|}{\sf pos} & 0-based leftmost coordinate ($=\underline{\sf POS}-1$)& {\tt int32\_t} & [-1]\\\cline{2-6}
& \multicolumn{2}{l|}{\sf bin\_mq\_nl} & {\tt{\sf bin}\char60\char60 16\char124\underline{\sf MAPQ}\char60\char60 8\char124{\sf l\_read\_name}}; {\sf bin} is computed by the {\sf reg2bin()} function in Section~\ref{sec:code}; {\sf l\_read\_name} is the length of {\sf read\_name} below ($={\sf length}(\underline{\sf QNAME})+1$). & {\tt uint32\_t} & \\\cline{2-6}
& \multicolumn{2}{l|}{\sf flag\_nc} & {\tt \underline{\sf FLAG}\char60\char60 16\char124{\sf n\_cigar\_op}}; {\sf n\_cigar\_op} is the number of operations in \underline{\sf CIGAR}. & {\tt uint32\_t} & \\\cline{2-6}
& \multicolumn{2}{l|}{\sf l\_seq} & Length of \underline{\sf SEQ} & {\tt int32\_t} & \\\cline{2-6}
& \multicolumn{2}{l|}{\sf next\_refID} & Ref-ID of the next segment ($-1\le{\sf next\_refID}<{\sf n\_ref}$) & {\tt int32\_t} & [-1] \\\cline{2-6}
& \multicolumn{2}{l|}{\sf next\_pos} & 0-based leftmost pos of the next segment ($=\underline{\sf PNEXT}-1$) & {\tt int32\_t} & [-1] \\\cline{2-6}
& \multicolumn{2}{l|}{\sf tlen} & Template length ($=\underline{\sf TLEN}$) & {\tt int32\_t} & [0] \\\cline{2-6}
& \multicolumn{2}{l|}{\sf read\_name} & Read name\footnotemark[1], {\tt NULL} terminated (\underline{\sf QNAME} plus a trailing `{\tt \char92 0}') & {\tt char[{\sf l\_read\_name}]} & \\\cline{2-6}
& \multicolumn{2}{l|}{\sf cigar} & CIGAR: {\tt {\sf op\_len}\char60\char60 4\char124{\sf op}}. `{\tt MIDNSHP\char61X}'$\to$`012345678' & {\tt uint32\_t[{\sf n\_cigar\_op}]} & \\\cline{2-6}
& \multicolumn{2}{l|}{\sf seq} & 4-bit encoded read: `{\tt =ACMGRSVTWYHKDBN}'$\to[0,15]$; other characters mapped to `{\tt N}'; high nybble first (1st base in the highest 4-bit of the 1st byte) & {\tt uint8\_t[({\sf l\_seq}+1)/2]} & \\\cline{2-6}
& \multicolumn{2}{l|}{\sf qual} & Phred base quality (a sequence of {\tt 0xFF} if absent) & {\tt char[{\sf l\_seq}]} & \\\cline{2-6}
& \multicolumn{5}{c|}{\textcolor{gray}{\it List of auxiliary data (until the end of the alignment block)}} \\\cline{3-6}
& & {\sf tag} & Two-character tag & {\tt char[2]} & \\\cline{3-6}
& & {\sf val\_type} & Value type: {\tt AcCsSiIfZHB}\footnotemark[2]$^,$\footnotemark[3] & {\tt char} & \\\cline{3-6}
& & {\sf value} & Tag value & (by {\sf val\_type}) &\\
\cline{1-6}
\end{tabular}}
\end{table}
\footnotetext[1]{For backward compatibility, a {\sf QNAME} `{\tt *}' is stored as a C string {\tt "*\char92 0"}.}
\footnotetext[2]{An integer may be stored as one of `{\tt cCsSiI}' in BAM, representing {\tt int8\_t}, {\tt uint8\_t},
{\tt int16\_t}, {\tt uint16\_t}, {\tt int32\_t} and {\tt uint32\_t}, respectively. In SAM, all single integer types are mapped to {\tt int32\_t}.}
\footnotetext[3]{A `{\tt B}'-typed (array) tag--value pair is stored as follows. The first two bytes keep the two-character tag. The 3rd byte is always `{\tt B}'.
The 4th byte, matching {\tt /\char94[cCsSiIf]\$/}, indicates the type of an element in the array.
Bytes from 5 to 8 encode a little-endian 32-bit integer which gives the number of elements in the array.
Bytes starting from the 9th store the array in the little-endian byte order; the number of these
bytes is determined by the type and the length of the array.}
\pagebreak
\section{Indexing BAM}
Indexing aims to achieve fast retrieval of alignments overlapping a
specified region without going through the whole alignments. BAM must be
sorted by the reference ID and then the leftmost coordinate before
indexing.
\subsection{Algorithm}
\subsubsection{Basic binning index}
The UCSC binning scheme was suggested by Richard Durbin and Lincoln
Stein and is explained by Kent et al. (2002). In this scheme, each bin
represents a contiguous genomic region which is either fully contained
in or non-overlapping with another bin; each alignment is associated
with a bin which represents the smallest region containing the entire
alignment. The binning scheme is essentially a representation of an
R-tree. A distinct bin uniquely corresponds to a distinct internal node
in an R-tree. Bin A is a child of Bin B if the region represented by A is
contained in B.
To find the alignments that overlap a specified region, we need to get
the bins that overlap the region, and then test each alignment in the
bins to check overlap. To quickly find alignments associated with a
specified bin, we can keep in the index the start file offsets of chunks
of alignments which all have the bin. As alignments are sorted by the
leftmost coordinates, alignments having the same bin tend to be
clustered together on the disk and therefore usually a bin is only
associated with a few chunks. Traversing all the alignments having the
same bin usually needs a few seek calls. Given the set of bins that
overlap the specified region, we can visit alignments in the order of
their leftmost coordinates and stop seeking the rest when an alignment
falls outside the required region. This strategy saves half of the seek
calls on average.
In BAM, each bin may span $2^{29}$, $2^{26}$, $2^{23}$, $2^{20}$,
$2^{17}$ or $2^{14}$ bp\footnote{Due to a limitation in the current indexing
scheme, a chromosome sequence longer than $2^{29}-1$ is not supported during
indexing.}. Bin 0 spans a 512Mbp region, bins 1--8 span 64Mbp, 9--72 8Mbp,
73--584 1Mbp, 585--4680 128Kbp and bins 4681--37448
span 16Kbp regions.
\subsubsection{Reducing small chunks}
Around the boundary of two adjacent bins, we may see many small chunks
with some having a shorter bin while the rest having a larger bin. To
reduce the number of seek calls, we may join two chunks having the same
bin if they are close to each other. After this process, a joined chunk
will contain alignments with different bins. We need to keep in the
index the file offset of the end of each chunk to identify its
boundaries.
\subsubsection{Combining with linear index}
For an alignment starting beyond 64Mbp, we always need to seek to some
chunks in bin 0, which can be avoided by using a linear index. In the
linear index, for each tiling 16384bp window on the reference, we record
the smallest file offset of the alignments that start in the
window. Given a region [rbeg,rend), we only need to visit a chunk whose
end file offset is larger than the file offset of the 16kbp window
containing rbeg.
With both binning and linear indices, we can retrieve alignments in most
regions with just one seek call.
\subsubsection{A conceptual example}
Suppose we have a genome shorter than 144kbp. We can design a binning
scheme which consists of three types of bins: bin 0 spans 0--144kbp, bins
1, 2 and 3 span 48kbp each and bins from 4 to 12 span 16kbp each:
\begin{table}[ht]
\centering
{\small\begin{tabular}{|c|c|c|c|c|c|c|c|c|}
\cline{1-9}
\multicolumn{9}{|c|}{0 (0--144kbp)}\\\cline{1-9}
\multicolumn{3}{|c|}{1 (0--48kbp)} & \multicolumn{3}{c|}{2 (48--96kbp)} & \multicolumn{3}{c|}{3 (96--144kbp)} \\\cline{1-9}
4 (0--16k) & 5 (16--32k) & 6 (32--48k) & 7 (48--64k) & 8 (64--80k) & 9 (80--96k) & 10 & 11 & 12 \\
\cline{1-9}
\end{tabular}}
\end{table}
An alignment starting at 65kbp and ending at 67kbp would have a bin
number 8, which is the smallest bin containing the alignment. Similarly,
an alignment starting at 51kbp and ending at 70kbp would go to bin 2,
while an alignment between [40k,49k] to bin 0. Suppose we want to find
all the alignments overlapping region [65k,71k). We first calculate that
bin 0, 2 and 8 overlap with this region and then traverse the alignments
in these bins to find the required alignments. With a binning index
alone, we need to visit the alignment at [40k,49k] as it belongs to bin
0. But with a linear index, we know that such an alignment stops before
64kbp and cannot overlap the specified region. A seek call can thus be
saved.
\subsection{The BAM indexing format}
\begin{table}[ht]
{\small
\begin{tabular}{|l|l|l|l|l|l|r|}
\cline{1-7}
\multicolumn{4}{|c|}{\bf Field} & \multicolumn{1}{c|}{\bf Description} & \multicolumn{1}{c|}{\bf Type} & \multicolumn{1}{c|}{\bf Value} \\\cline{1-7}
\multicolumn{4}{|l|}{\sf magic} & Magic string & {\tt char[4]} & {\tt BAI\char92 1}\\\cline{1-7}
\multicolumn{4}{|l|}{\sf n\_ref} & \# reference sequences & {\tt int32\_t} & \\\cline{1-7}
\multicolumn{7}{|c|}{\textcolor{gray}{\it List of indices (n=n\_ref)}} \\\cline{2-7}
& \multicolumn{3}{l|}{\sf n\_bin} & \# distinct bins (for the binning index) & {\tt int32\_t} & \\\cline{2-7}
& \multicolumn{6}{c|}{\textcolor{gray}{\it List of distinct bins (n=n\_bin)}} \\\cline{3-7}
& & \multicolumn{2}{l|}{\sf bin} & Distinct bin & {\tt uint32\_t} & \\\cline{3-7}
& & \multicolumn{2}{l|}{\sf n\_chunk} & \# chunks & {\tt int32\_t} & \\\cline{3-7}
& & \multicolumn{5}{c|}{\textcolor{gray}{\it List of chunks (n=n\_chunk)}} \\\cline{4-7}
& & & {\sf chunk\_beg} & (Virtual) file offset of the start of the chunk & {\tt uint64\_t} & \\\cline{4-7}
& & & {\sf chunk\_end} & (Virtual) file offset of the end of the chunk & {\tt uint64\_t} & \\\cline{2-7}
& \multicolumn{3}{l|}{\sf n\_intv} & \# 16kbp intervals (for the linear index) & {\tt int32\_t} & \\\cline{2-7}
& \multicolumn{6}{c|}{\textcolor{gray}{\it List of intervals (n=n\_intv)}} \\\cline{3-7}
& & \multicolumn{2}{l|}{\sf ioffset} & (Virtual) file offset of the first alignment in the interval & {\tt uint64\_t} & \\
\cline{1-7}
\end{tabular}}
\end{table}
\subsection{C source code for computing bin number and overlapping bins}\label{sec:code}
{\small
\begin{verbatim}
/* Return the smallest bin that fully contains an alignment covering
 * [beg,end) (zero-based, half-closed-half-open).  Levels are probed
 * from the finest (16Kbp, shift 14) to the coarsest (64Mbp, shift 26);
 * bin 0 is returned when no finer level holds the whole alignment. */
int reg2bin(int beg, int end)
{
  int last = end - 1;  /* last covered position, inclusive */
  int shift;
  for (shift = 14; shift <= 26; shift += 3) {
    if ((beg >> shift) == (last >> shift)) {
      /* first bin id of this level: 4681, 585, 73, 9, 1 */
      int level_start = ((1 << (29 - shift)) - 1) / 7;
      return level_start + (beg >> shift);
    }
  }
  return 0;
}
/* Fill list with every bin that may overlap region [beg,end)
 * (zero-based) and return how many bins were written.  Bin 0 comes
 * first, followed by the candidate bins of each level from the
 * coarsest (64Mbp) down to the finest (16Kbp). */
#define MAX_BIN (((1<<18)-1)/7)
int reg2bins(int beg, int end, uint16_t list[MAX_BIN])
{
  int count = 0, shift, bin, level_start;
  int last = end - 1;  /* last covered position, inclusive */
  list[count++] = 0;
  for (shift = 26; shift >= 14; shift -= 3) {
    /* first bin id of this level: 1, 9, 73, 585, 4681 */
    level_start = ((1 << (29 - shift)) - 1) / 7;
    for (bin = level_start + (beg >> shift);
         bin <= level_start + (last >> shift); ++bin)
      list[count++] = bin;
  }
  return count;
}
\end{verbatim}
}
\end{document}