% SAMv1.tex

\documentclass[10pt]{article}
\usepackage{color}
\definecolor{gray}{rgb}{0.7,0.7,0.7}
\usepackage{framed}
\usepackage{enumitem}
\usepackage{longtable}
\usepackage[pdfborder={0 0 0}]{hyperref}

\addtolength{\textwidth}{3.4cm}
\addtolength{\hoffset}{-1.7cm}
\addtolength{\textheight}{4cm}
\addtolength{\voffset}{-2cm}

\makeindex

\begin{document}

\input{SAMv1.ver}
\title{Sequence Alignment/Map Format Specification}
\author{The SAM/BAM Format Specification Working Group}
\date{\headdate}
\maketitle
\begin{quote}\small
The master version of this document can be found at
\url{https://github.com/samtools/hts-specs}.
This printing is version~\commitdesc\ from that repository,
last modified on the date shown above.
\end{quote}
\vspace*{1em}


\section{The SAM Format Specification}
SAM stands for Sequence Alignment/Map format. It is a TAB-delimited text
format consisting of a header section, which is optional, and an
alignment section. If present, the header must be prior to the
alignments. Header lines start with `{\tt @}', while alignment lines do
not. Each alignment line has 11 mandatory fields for essential alignment
information such as mapping position, and a variable number of optional
fields for flexible or aligner specific information.

\subsection{An example}\label{sec:example}
Suppose we have the following alignment with bases in lower cases
clipped from the alignment. Read {\tt r001/1} and {\tt r001/2}
constitute a read pair; {\tt r003} is a chimeric read; {\tt r004}
represents a split alignment.

\begin{framed}\small
\begin{verbatim}
Coor     12345678901234  5678901234567890123456789012345
ref      AGCATGTTAGATAA**GATAGCTGTGCTAGTAGGCAGTCAGCGCCAT

+r001/1        TTAGATAAAGGATA*CTG
+r002         aaaAGATAA*GGATA
+r003       gcctaAGCTAA
+r004                     ATAGCT..............TCAGC
-r003                            ttagctTAGGC
-r001/2                                        CAGCGGCAT
\end{verbatim}
\end{framed}
The corresponding SAM format is:
\begin{framed}\small
\begin{verbatim}
@HD VN:1.5 SO:coordinate
@SQ SN:ref LN:45
r001  163 ref  7 30 8M2I4M1D3M = 37  39 TTAGATAAAGGATACTG *
r002    0 ref  9 30 3S6M1P1I4M *  0   0 AAAAGATAAGGATA    *
r003    0 ref  9 30 5S6M       *  0   0 GCCTAAGCTAA       * SA:Z:ref,29,-,6H5M,17,0;
r004    0 ref 16 30 6M14N5M    *  0   0 ATAGCTTCAGC       *
r003 2064 ref 29 17 6H5M       *  0   0 TAGGC             * SA:Z:ref,9,+,5S6M,30,1;
r001   83 ref 37 30 9M         =  7 -39 CAGCGGCAT         * NM:i:1
\end{verbatim}
\end{framed}

\pagebreak

\subsection{Terminologies and Concepts}

\begin{description}
\item[Template] A DNA/RNA sequence part of which is sequenced on a
  sequencing machine or assembled from raw sequences.

\item[Segment] A contiguous sequence or subsequence.

\item[Read] A raw sequence that comes off a sequencing machine. A read
  may consist of multiple segments. For sequencing data, reads are indexed by
  the order in which they are sequenced.

\item[Linear alignment]
  An alignment of a read to a single reference sequence that may include
  insertions, deletions, skips and clipping, but may not include direction
  changes (i.e. one portion of the alignment on forward strand and another
  portion of alignment on reverse strand). A linear alignment can be
  represented in a single SAM record.
%  A alignment of a read to a single locus on the reference, where the
%  alignment may have short insertions/deletions but does not have long gaps
%  (e.g. due to introns or structural variation).

\item[Chimeric alignment]
  An alignment of a read that cannot be represented as a linear alignment. A
  chimeric alignment is represented as a set of linear alignments that do not
  have large overlaps.  Typically, one of the linear alignments in a chimeric
  alignment is considered the ``representative'' alignment, and the others are
  called ``supplementary'' and are distinguished by the supplementary alignment
  flag.  All the SAM records in a chimeric alignment have the same {\sf QNAME}
  and the same values for 0x40 and 0x80 flags (see Section 1.4). The decision
  regarding which linear alignment is representative is arbitrary.

\item[Read alignment]
  A linear alignment or a chimeric alignment that is the complete
  representation of the alignment of the read.

\item[Multiple mapping]
  The correct placement of a read may be ambiguous, e.g. due to repeats.  In
  this case, there may be multiple read alignments for the same read.  One of
  these alignments is considered primary.  All the other alignments have the
  secondary alignment flag set in the SAM records that represent them.  All the
  SAM records have the same {\sf QNAME} and the same values for 0x40 and 0x80
  flags.  Typically the alignment designated primary is the best alignment, but
  the decision may be arbitrary.\footnotemark[1]

\item[1-based coordinate system] A coordinate system where the first
  base of a sequence is one. In this coordinate system, a region is
  specified by a closed interval. For example, the region between the
  3rd and the 7th bases inclusive is $[3,7]$. The SAM, VCF, GFF and Wiggle
  formats are using the 1-based coordinate system.

\item[0-based coordinate system] A coordinate system where the first
  base of a sequence is zero. In this coordinate system, a region is
  specified by a half-closed-half-open interval. For example, the region
  between the 3rd and the 7th bases inclusive is $[2,7)$. The BAM, BCFv2, BED,
  and PSL formats are using the 0-based coordinate system.

\item[Phred scale] Given a probability $0<p\le 1$, the phred scale of $p$
  equals $-10\log_{10}p$, rounded to the closest integer.

\end{description}

\footnotetext[1]{A chimeric alignment is primarily caused by structural
variations, gene fusions, misassemblies, RNA-seq or experimental protocols. It is more frequent given longer
reads. For a chimeric alignment, the linear alignments constituting the alignment are largely
non-overlapping; each linear alignment may have high mapping quality and is
informative in SNP/INDEL calling. In contrast, multiple mappings are caused primarily by repeats.  They are less frequent
given longer reads. If a read has multiple mappings, all these mappings are
almost entirely overlapping with each other; except the single-best optimal
mapping, all the other mappings get mapping quality $<$Q3
and are ignored by most SNP/INDEL callers.}

\subsection{The header section}
Each header line begins with character `{\tt @}' followed by a
two-letter record type code. In the header, each line is TAB-delimited
and except the {\tt @CO} lines, each data field follows a format `{\tt TAG:VALUE}' where {\tt TAG}
is a two-letter string that defines the content and the format of {\tt
  VALUE}. Each header line should match: {\tt
  /\char94@[A-Za-z][A-Za-z](\char92t[A-Za-z][A-Za-z0-9]:[
  -\char126]+)+\$/} or {\tt /\char94@CO\char92t.*/}. Tags containing lowercase letters are reserved for
end users.

The following table gives the defined record types and tags. Tags with
`*' are required when the record type is present.

\begin{center}
\small
\begin{longtable}{|l|l|p{13.5cm}|}
  \cline{1-3}
  \multicolumn{2}{|l|}{\bf Tag} & {\bf Description} \\
  \cline{1-3}
  \multicolumn{2}{|l}{\tt @HD} & The header line. The first line if present. \\\cline{2-3}
  & {\tt VN}* & Format version. \emph{Accepted format}: {\tt /\char94[0-9]+\char92.[0-9]+\$/}.\\\cline{2-3}
  & {\tt SO} & Sorting order of alignments. \emph{Valid values}: {\tt unknown} (default), {\tt
    unsorted}, {\tt queryname} and {\tt coordinate}. For coordinate sort, the major sort
  key is the {\sf RNAME} field, with order defined by the order of {\tt @SQ} lines in the header.  The
  minor sort key is the {\sf POS} field.  For alignments with equal {\sf RNAME} and {\sf POS}, order is
  arbitrary.  All alignments with `{\tt *}' in {\sf RNAME} field follow alignments with some other
  value but otherwise are in arbitrary order.\\\cline{1-3}
  \multicolumn{2}{|l}{\tt @SQ} & Reference sequence dictionary. The order of {\tt @SQ} lines defines the alignment sorting order.\\\cline{2-3}
  & {\tt SN}* & Reference sequence name. Each {\tt @SQ} line must have a unique {\tt SN} tag. The value of this
  field is used in the
  alignment records in RNAME and RNEXT fields. Regular expression: {\tt [!-)+-\char60\char62-\char126][!-\char126]*}\\\cline{2-3}
  & {\tt LN}* & Reference sequence length. \emph{Range}: {\tt [1,2$^{31}$-1]}\\\cline{2-3}
  & {\tt AS} & Genome assembly identifier. \\\cline{2-3}
  & {\tt M5} & MD5 checksum of the sequence in the uppercase, excluding spaces but including pads (as `*'s).\\\cline{2-3}
  & {\tt SP} & Species.\\\cline{2-3}
  & {\tt UR} & URI of the sequence.  This value may start with one of the standard
  protocols, e.g.\ http: or ftp:.  If it does not start with one of these protocols, it is assumed to be a file-system path.\\\cline{1-3}
  \multicolumn{2}{|l}{\tt @RG} & Read group. Unordered multiple {\tt @RG} lines are allowed.\\\cline{2-3}
  & {\tt ID}* & Read group identifier. Each {\tt @RG} line must have a unique {\tt ID}. The value of {\tt ID}
  is used in the RG tags of alignment records. Must be unique among all read groups in header section.  Read group IDs may be modified when merging SAM files in order to handle collisions.\\\cline{2-3}
  & {\tt CN} & Name of sequencing center producing the read.\\\cline{2-3}
  & {\tt DS} & Description.\\\cline{2-3}
  & {\tt DT} & Date the run was produced (ISO8601 date or date/time).\\\cline{2-3}
  & {\tt FO} & Flow order. The array of nucleotide bases that correspond to the nucleotides used for each flow of each read.
  	Multi-base flows are encoded in IUPAC format, and non-nucleotide flows by various other characters. \emph{Format}: {\tt /\char92*|[ACMGRSVTWYHKDBN]+/}\\\cline{2-3}
  & {\tt KS} & The array of nucleotide bases that correspond to the key sequence of each read.\\\cline{2-3}
  & {\tt LB} & Library.\\\cline{2-3}
  & {\tt PG} & Programs used for processing the read group.\\\cline{2-3}
  & {\tt PI} & Predicted median insert size.\\\cline{2-3}
  & {\tt PL} & Platform/technology used to produce the reads. \emph{Valid values}:
  {\tt CAPILLARY}, {\tt LS454}, {\tt ILLUMINA}, {\tt SOLID}, {\tt HELICOS}, {\tt IONTORRENT} and {\tt PACBIO}.\\\cline{2-3}
  & {\tt PU} & Platform unit (e.g. flowcell-barcode.lane for Illumina or slide for SOLiD). Unique identifier.\\\cline{2-3}
  & {\tt SM} & Sample. Use pool name where a pool is being sequenced.\\\cline{1-3}
  \multicolumn{2}{|l}{\tt @PG} & Program. \\\cline{2-3}
  & {\tt ID}* & Program record identifier. Each {\tt @PG} line must have a unique {\tt ID}.
  	The value of {\tt ID} is used in the alignment {\tt PG} tag and {\tt PP} tags of other {\tt @PG} lines.
	{\tt PG} IDs may be modified when merging SAM files in order to handle collisions.\\\cline{2-3}
  & {\tt PN} & Program name \\\cline{2-3}
  & {\tt CL} & Command line \\\cline{2-3}
  & {\tt PP} & Previous {\tt @PG-ID}. Must match another {\tt @PG} header's {\tt ID} tag.
  	{\tt @PG} records may be chained using {\tt PP} tag, with the last record in the chain
	having no {\tt PP} tag. This chain defines the order of programs that have been applied to the alignment.
	{\tt PP} values may be modified when merging SAM files in order to handle collisions of {\tt PG} {\tt ID}s.
	The first {\tt PG} record in a chain (i.e. the one referred to by the {\tt PG} tag in a SAM record)
	describes the most recent program that operated on the SAM record.
	The next {\tt PG} record in the chain describes the next most recent program that
        operated on the SAM record. The {\tt PG} {\tt ID} on a SAM record is not required
        to refer to the newest {\tt PG} record in a chain.  It may refer to any {\tt PG}
        record in a chain, implying that the SAM record has been operated on by the
        program in that {\tt PG} record, and the program(s) referred to via the {\tt PP} tag. \\\cline{2-3}
  & {\tt VN} & Program version \\\cline{1-3}
  \multicolumn{2}{|l}{\tt @CO} & One-line text comment. Unordered multiple {\tt @CO} lines are allowed.\\
  \cline{1-3}
\end{longtable}
\end{center}

\subsection{The alignment section: mandatory fields}
In the SAM format, each alignment line typically represents the linear
alignment of a segment. Each line has 11
mandatory fields. These fields always appear in the same order and must be
present, but their values can be `0' or `*' (depending on the field) if the
corresponding information is unavailable. The following table gives an overview
of the mandatory fields in the SAM format:
\begin{center}
\small
\begin{tabular}{rllll}
  \hline
  {\bf Col} & {\bf Field} & {\bf Type} & {\bf Regexp/Range} & {\bf Brief description} \\
  \hline
  1 & {\sf QNAME} & String & {\tt [!-?A-\char126]\{1,255\}} & Query template NAME\\
  2 & {\sf FLAG} & Int & {\tt [0,2$^{16}$-1]} & bitwise FLAG \\
  3 & {\sf RNAME} & String & {\tt \char92*|[!-()+-\char60\char62-\char126][!-\char126]*} & Reference sequence NAME\\
  4 & {\sf POS} & Int & {\tt [0,2$^{31}$-1]} & 1-based leftmost mapping POSition \\
  5 & {\sf MAPQ} & Int & {\tt [0,2$^8$-1]} & MAPping Quality \\
  6 & {\sf CIGAR} & String & {\tt \char92*|([0-9]+[MIDNSHPX=])+} & CIGAR string \\
  7 & {\sf RNEXT} & String & {\tt \char92*|=|[!-()+-\char60\char62-\char126][!-\char126]*} & Ref. name of the mate/next read\\
  8 & {\sf PNEXT} & Int & {\tt [0,2$^{31}$-1]} & Position of the mate/next read \\
  9 & {\sf TLEN} & Int & {\tt [-2$^{31}$+1,2$^{31}$-1]} & observed Template LENgth \\
  10 & {\sf SEQ} & String & {\tt \char92*|[A-Za-z=.]+} & segment SEQuence\\
  11 & {\sf QUAL} & String & {\tt [!-\char126]+} & ASCII of Phred-scaled base QUALity+33 \\
  \hline
\end{tabular}
\end{center}

\begin{enumerate}
\item {\sf QNAME}: Query template NAME. Reads/segments having identical {\sf QNAME}
	are regarded to come from the same template. A {\sf QNAME} `{\tt *}'
	indicates the information is unavailable.  In a SAM file, a read may occupy
	multiple alignment lines, when its alignment is chimeric or when multiple
	mappings are given.
\item {\sf FLAG}: bitwise FLAG. Each bit is explained in the following
  table:
  \begin{center}\small
  \begin{tabular}{rl}
  \hline
  Bit & Description\\
  \hline
  0x1 &  template having multiple segments in sequencing \\
  0x2 &  each segment properly aligned according to the aligner \\
  0x4 &  segment unmapped \\
  0x8 &  next segment in the template unmapped \\
  0x10 &  {\sf SEQ} being reverse complemented \\
  0x20 &  {\sf SEQ} of the next segment in the template being reverse complemented \\
  0x40 &  the first segment in the template \\
  0x80 &  the last segment in the template \\
  0x100 &  secondary alignment \\
  0x200 &  not passing quality controls \\
  0x400 &  PCR or optical duplicate \\
  0x800 &  supplementary alignment \\
  \hline
  \end{tabular}
  \end{center}
  \begin{itemize}
  \item For each read/contig in a SAM file, it is required that one and only
	one line associated with the read satisfies \mbox{`{\sf FLAG} {\tt \& 0x900
	== 0}'}. This line is called the \emph{primary line} of the read.
  \item Bit 0x100 marks the alignment not to be used in certain analyses
    when the tools in use are aware of this bit. It is typically used to
	flag alternative mappings when multiple mappings are presented in a SAM.
  \item Bit 0x800 indicates that the corresponding alignment line is part of
    a chimeric alignment. A line flagged with 0x800 is called a \emph{supplementary line}.
  \item Bit 0x4 is the only reliable place to tell whether the read
    is unmapped. If 0x4 is set, no assumptions can be made about {\sf
      RNAME}, {\sf POS}, {\sf CIGAR}, {\sf MAPQ}, bits 0x2, 0x10, 0x100
    and 0x800, and the bit 0x20 of the previous read in the template.
  \item If 0x40 and 0x80 are both set, the read is part of a linear
    template, but it is neither the first nor the last read. If both
    0x40 and 0x80 are unset, the index of the read in the template
    is unknown. This may happen for a non-linear template or when the index
    is lost in data processing.
  \item If 0x1 is unset, no assumptions can be made about 0x2, 0x8,
    0x20, 0x40 and 0x80.
  \end{itemize}
\item {\sf RNAME}: Reference sequence NAME of the alignment. If {\tt
    @SQ} header lines are present, {\sf RNAME} (if not `*') must be
  present in one of the {\tt SQ-SN} tags. An unmapped segment without
  coordinate has a `*' at this field. However, an unmapped segment may
  also have an ordinary coordinate such that it can be placed at a
  desired position after sorting. If {\sf RNAME} is `*', no assumptions
  can be made about {\sf POS} and {\sf CIGAR}.
\item {\sf POS}: 1-based leftmost mapping POSition of the first matching
  base. The first base in a reference sequence has coordinate 1. {\sf
    POS} is set as 0 for an unmapped read without coordinate. If {\sf
    POS} is 0, no assumptions can be made about {\sf RNAME} and {\sf
    CIGAR}.
\item {\sf MAPQ}: MAPping Quality. It equals
  $-10\log_{10}\Pr\{\mbox{mapping position is wrong}\}$, rounded to the
  nearest integer. A value 255 indicates that the mapping quality is not
  available.
\item {\sf CIGAR}: CIGAR string. The CIGAR operations are given in the
  following table (set `*' if unavailable):
  \begin{center}\small
  \begin{tabular}{ccl}
  \hline
  Op & BAM & Description\\
  \hline
  {\tt M} & 0 & alignment match (can be a sequence match or mismatch)\\
  {\tt I} & 1 & insertion to the reference \\
  {\tt D} & 2 & deletion from the reference \\
  {\tt N} & 3 & skipped region from the reference \\
  {\tt S} & 4 & soft clipping (clipped sequences present in {\sf SEQ})\\
  {\tt H} & 5 & hard clipping (clipped sequences NOT present in {\sf SEQ})\\
  {\tt P} & 6 & padding (silent deletion from padded reference)\\
  {\tt =} & 7 & sequence match \\
  {\tt X} & 8 & sequence mismatch \\
  \hline
  \end{tabular}
  \end{center}
  \begin{itemize}
  \item {\tt H} can only be present as the first and/or last operation.
  \item {\tt S} may only have {\tt H} operations between them and the
    ends of the {\sf CIGAR} string.
  \item For mRNA-to-genome alignment, an {\tt N} operation represents an
    intron. For other types of alignments, the interpretation of {\tt N}
    is not defined.
  \item Sum of lengths of the {\tt M/I/S/=/X} operations shall equal
    the length of {\sf SEQ}.
  \end{itemize}
\item {\sf RNEXT}: Reference sequence name of the primary alignment of the NEXT read in the
  template. For the last read, the next read is the first
  read in the template. If {\tt @SQ} header lines are present, {\sf
    RNEXT} (if not `*' or `=') must be present in one of the {\tt SQ-SN}
  tags. This field is set as `*' when the information is unavailable, and
  set as `=' if {\sf RNEXT} is identical to {\sf RNAME}. If not `=' and the
  next read in the template has one primary mapping (see also bit
  0x100 in {\sf FLAG}), this field is identical to {\sf RNAME} at the primary line of the
  next read.  If {\sf
    RNEXT} is `*', no assumptions can be made on {\sf PNEXT} and bit
  0x20.
\item {\sf PNEXT}: Position of the primary alignment of the NEXT read in the template. Set as
  0 when the information is unavailable. This field equals {\sf POS} at the primary line of
  the next read. If {\sf PNEXT} is 0, no assumptions can be made on
  {\sf RNEXT} and bit 0x20.
\item {\sf TLEN}: signed observed Template LENgth. If all segments are
  mapped to the same reference, the unsigned observed template length
  equals the number of bases from the leftmost mapped base to the
  rightmost mapped base. The leftmost segment has a plus sign and the
  rightmost has a minus sign. The sign of segments in the middle is
  undefined. It is set as 0 for single-segment template or when the
  information is unavailable.
\item {\sf SEQ}: segment SEQuence. This field can be a `*' when the
  sequence is not stored. If not a `*', the length of the sequence must
  equal the sum of lengths of {\tt M/I/S/=/X} operations in {\sf CIGAR}.
  An `=' denotes the base is identical to the reference base. No
  assumptions can be made on the letter cases.
\item {\sf QUAL}: ASCII of base QUALity plus 33 (same as the quality
  string in the Sanger FASTQ format). A base quality is the phred-scaled
  base error probability which equals $-10\log_{10}\Pr\{\mbox{base is
    wrong}\}$. This field can be a `*' when quality is not stored. If
  not a `*', {\sf SEQ} must not be a `*' and the length of the quality string
  ought to equal the length of {\sf SEQ}.
\end{enumerate}

\subsection{The alignment section: optional fields}
All optional fields follow the {\tt TAG:TYPE:VALUE} format
where {\tt TAG} is a two-character string that matches {\tt /[A-Za-z][A-Za-z0-9]/}.
Each {\tt TAG} can only appear once in one alignment line. A {\tt TAG}
containing lowercase letters is reserved for end users.
In an optional field, {\tt TYPE} is a single case-sensitive letter which
defines the format of {\tt VALUE}:
\begin{center}\small
\begin{tabular}{cll}
\hline
{\bf Type} & {\bf Regexp matching {\tt VALUE}} & {\bf Description} \\
\hline
A & {\tt [!-\char126]} & Printable character \\
i & {\tt [-+]?[0-9]+} & Signed 32-bit integer \\
f & {\tt [-+]?[0-9]*\char92.?[0-9]+([eE][-+]?[0-9]+)?} & Single-precision floating number \\
Z & {\tt [\,\,\,!-\char126]+} & Printable string, including space\\
H & {\tt [0-9A-F]+} & Byte array in the Hex format\footnotemark[2]\\
B & {\tt [cCsSiIf](,[-+]?[0-9]*\char92.?[0-9]+([eE][-+]?[0-9]+)?)+} & Integer or numeric array\\
\hline
\end{tabular}
\footnotetext[2]{For example, a byte array {\tt \{0x1a,0xe3,0x1\}} corresponds to a Hex string `{\tt 1AE301}'.}
\end{center}
For an integer or numeric array (type `{\tt B}'), the first letter indicates the type of numbers
in the following comma separated array. The letter can be one of `{\tt cCsSiIf}', corresponding to
{\tt int8\_t} (signed 8-bit integer), {\tt uint8\_t} (unsigned 8-bit integer), {\tt int16\_t}, {\tt uint16\_t}, {\tt int32\_t}, {\tt uint32\_t}
and {\tt float}, respectively\footnotemark[3]. During import/export, the element type
may be changed if the new type is also compatible with the array.
\footnotetext[3]{Explicit typing eases format parsing and helps to reduce the file size when SAM is converted to BAM.}

{Predefined tags are shown in the following table. You can
  freely add new tags, and if a new tag may be of general interest, you
  can email {\tt samtools-devel@lists.sourceforge.net} to add the new tag
  to the specification. Note that tags starting with `{\tt X}', `{\tt Y}'
  and `{\tt Z}' or tags containing lowercase letters in either position
  are reserved for local use and will not be formally
  defined in any future version of this specification.}
\begin{center}\small
\begin{longtable}{ccp{12.5cm}}
  \hline
  {\bf Tag\footnotemark[4]} & {\bf Type} & {\bf Description} \\
  \hline
  {\tt X?} & ? & Reserved fields for end users (together with {\tt Y?} and {\tt Z?}) \\
  {\tt AM} & i & The smallest template-independent mapping quality of segments in the rest \\
  {\tt AS} & i & Alignment score generated by aligner \\
  {\tt BC} & Z & Barcode sequence, with any quality scores stored in the {\tt QT} tag. \\
  {\tt BQ} & Z & Offset to base alignment quality (BAQ), of the same length as the read sequence.
  At the $i$-th read base, ${\rm BAQ}_i=Q_i-({\rm BQ}_i-64)$ where $Q_i$ is the $i$-th base quality. \\
  {\tt CC} & Z & Reference name of the next hit; `{\tt =}' for the same chromosome \\
  {\tt CM} & i & Edit distance between the color sequence and the color reference (see also {\tt NM})\\
  {\tt CO} & Z & Free-text comments \\
  {\tt CP} & i & Leftmost coordinate of the next hit \\ 
  {\tt CQ} & Z & Color read quality on the original strand of the read. Same encoding as {\sf QUAL}; same length as {\tt CS}.\\
  {\tt CS} & Z & Color read sequence on the original strand of the read. The primer base must be included.\\
  {\tt CT} & Z & Complete read annotation tag, used for consensus annotation dummy features\footnotemark[6].\\
  {\tt E2} & Z & The 2nd most likely base calls. Same encoding and same length as {\sf QUAL}.\\
  {\tt FI} & i & The index of segment in the template.\\
  {\tt FS} & Z & Segment suffix.\\
  {\tt FZ} & B,S & Flow signal intensities on the original strand of the read, stored as {\tt (uint16\_t) round(value * 100.0)}. \\
  {\tt LB} & Z & Library. Value to be consistent with the header {\tt RG-LB} tag if {\tt @RG} is present.\\
  {\tt H0} & i & Number of perfect hits\\
  {\tt H1} & i & Number of 1-difference hits (see also {\tt NM})\\
  {\tt H2} & i & Number of 2-difference hits \\
  {\tt HI} & i & Query hit index, indicating the alignment record is the i-th one stored in SAM\\
  {\tt IH} & i & Number of stored alignments in SAM that contains the query in the current record\\
  {\tt MD} & Z & String for mismatching positions. \emph{Regex}: {\tt [0-9]+(([A-Z]|\char92\char94[A-Z]+)[0-9]+)*}\footnotemark[5]\\
  {\tt MQ} & i & Mapping quality of the mate/next segment \\
  {\tt NH} & i & Number of reported alignments that contains the query in the current record\\
  {\tt NM} & i & Edit distance to the reference, including ambiguous bases but excluding clipping\\
  {\tt OQ} & Z & Original base quality (usually before recalibration). Same encoding as {\sf QUAL}.\\
  {\tt OP} & i & Original mapping position (usually before realignment) \\
  {\tt OC} & Z & Original CIGAR (usually before realignment) \\
  {\tt PG} & Z & Program. Value matches the header {\tt PG-ID} tag if {\tt @PG} is present. \\
  {\tt PQ} & i & Phred likelihood of the template, conditional on both the mapping being correct \\
  {\tt PT} & Z & Read annotations for parts of the padded read sequence\footnotemark[7]\\
  {\tt PU} & Z & Platform unit. Value to be consistent with the header {\tt RG-PU} tag if {\tt @RG} is present.\\
  {\tt QT} & Z & Phred quality of the barcode sequence in the {\tt BC} (or {\tt RT}) tag. Same encoding as {\sf QUAL}. \\
  {\tt Q2} & Z & Phred quality of the mate/next segment sequence in the {\tt R2} tag. Same encoding as {\sf QUAL}.\\
  {\tt R2} & Z & Sequence of the mate/next segment in the template. \\
  {\tt RG} & Z & Read group. Value matches the header {\tt RG-ID} tag if {\tt @RG} is present in the header. \\
  {\tt RT} & Z & Deprecated alternative to {\tt BC} tag originally used at Sanger. \\
  {\tt SA} & Z & Other canonical alignments in a chimeric alignment, in the format of: (\emph{rname},\emph{pos},\emph{strand},\emph{CIGAR},\emph{mapQ},\emph{NM};)+.
    Each element in the semi-colon delimited list represents a part of the chimeric alignment. Conventionally, at a supplementary line,
	the first element points to the primary line.\\
  {\tt SM} & i & Template-independent mapping quality \\
  {\tt TC} & i & The number of segments in the template.\\
  {\tt U2} & Z & Phred probability of the 2nd call being wrong conditional on the best being wrong. The same encoding as {\sf QUAL}. \\
  {\tt UQ} & i & Phred likelihood of the segment, conditional on the mapping being correct \\
  \hline
\end{longtable}
\end{center}
\footnotetext[4]{The {\tt GS}, {\tt GC}, {\tt GQ}, {\tt MF}, {\tt S2}
  and {\tt SQ} are reserved for backward compatibility.}
\footnotetext[5]{The MD field aims to achieve SNP/indel calling without looking at
  the reference. For example, a string `{\tt 10A5\char94AC6}' means from
  the leftmost reference base in the alignment, there are 10 matches
  followed by an A on the reference which is different from the aligned
  read base; the next 5 reference bases are matches followed by a 2bp
  deletion from the reference; the deleted sequence is AC; the last 6
  bases are matches. The {\tt MD} field ought to match the {\sf CIGAR}
  string.}
\footnotetext[6]{The {\tt CT} tag is intended primarily for annotation
dummy reads, and consists of a \emph{strand}, \emph{type} and zero or
more \emph{key}=\emph{value} pairs, each separated with semicolons.
The \emph{strand} field has four values as in GFF3, and supplements FLAG
bit 0x10 to allow unstranded (`{\tt .}'), and stranded but unknown strand
(`{\tt ?}') annotation. For these and annotation on the forward strand
(\emph{strand} set to `{\tt +}'), do not set FLAG bit 0x10. For
annotation on the reverse strand, set the \emph{strand} to `{\tt -}'
and set FLAG bit 0x10. The \emph{type} and any \emph{keys} and their
optional \emph{values} are all percent encoded according to
RFC3986 to escape meta-characters `{\tt =}', `{\tt \%}', `{\tt ;}',
`{\tt |}' or non-printable characters not matched by the isprint()
macro (with the C locale). For example a percent sign becomes
`{\tt \%25}'. The CT record matches:
``{\tt \emph{strand};\emph{type}(;\emph{key}(=\emph{value}))*}''.
%NOTE - This leaves open the possibility of allowing multiple such
%entries for a single CT tag to be combined with | as in the PT tag.
}%End of CT tag footnote
\footnotetext[7]{The {\tt PT} tag value has the format of a series of
tags separated by {\tt |}, each annotating a sub-region of the read.
Each tag consists of \emph{start}, \emph{end}, \emph{strand},
\emph{type} and zero or more \emph{key}=\emph{value} pairs, each
separated with semicolons. \emph{Start} and \emph{end} are 1-based
positions between one and the sum of the {\tt M/I/D/P/S/=/X}
{\sf CIGAR} operators, i.e. {\sf SEQ} length plus any pads.  Note
any editing of the CIGAR string may require updating the `{\tt PT}'
tag coordinates, or even invalidate them.
As in GFF3, \emph{strand} is one of `{\tt +}' for forward strand tags,
`{\tt -}' for reverse strand, `{\tt .}' for unstranded or `{\tt ?}'
for stranded but unknown strand.
The \emph{type} and any \emph{keys} and their optional \emph{values}
are all percent encoded as in the {\tt CT} tag.
Formally the entire PT record matches:
 ``{\tt \emph{start};\emph{end};\emph{strand};\emph{type}(;\emph{key}(=\emph{value}))*(\char92|\emph{start};\emph{end};\emph{strand};\emph{type}(;\emph{key}(=\emph{value}))*)*}''.
 }%End of PT tag footnote


\pagebreak

\section{Recommended Practice for the SAM Format}
\label{sec-recommended-practice}
This section describes the best practice for representing data in the
SAM format. They are not required in general, but may be required by a
specific software package for it to function properly.

\begin{enumerate}
\item The header section
  \begin{enumerate}[label=\arabic*]
  \item The {\tt @HD} line should be present with the {\tt SO} tag specified.
  \item The {\tt @SQ} lines should be present if reads have been mapped.
  \item When a {\tt RG} tag appears anywhere in the alignment section,
    there should be a single corresponding {\tt @RG} line with matching
    {\tt ID} tag in the header.
  \item When a {\tt PG} tag appears anywhere in the alignment section,
    there should be a single corresponding {\tt @PG} line with matching
    {\tt ID} tag in the header.
  \end{enumerate}
\item Adjacent CIGAR operations should be different.
\item No alignments should be assigned mapping quality 255.
\item Unmapped reads
  \begin{enumerate}[label=\arabic*]
  \item For an unmapped paired-end or mate-pair read whose mate is
    mapped, the unmapped read should have {\sf RNAME} and {\sf POS}
    identical to its mate.
  \item If all segments in a template are unmapped, their {\sf RNAME}
    should be set as `*' and {\sf POS} as 0.
  \item If {\sf POS} plus the sum of lengths of {\tt M/=/X/D/N}
    operations in {\sf CIGAR} exceeds the length specified in the {\tt
      LN} field of the {\tt @SQ} header line (if it exists) with an SN
    equal to {\sf RNAME}, the alignment should be unmapped.
  \end{enumerate}
\item Multiple mapping
  \begin{enumerate}[label=\arabic*]
  \item When one segment is present in multiple lines to represent a multiple
	mapping of the segment, only one of these records should have the secondary
	alignment flag bit (0x100) unset. {\sf RNEXT} and {\sf PNEXT} point to the
	primary line of the next read in the template.
  \item {\sf SEQ} and {\sf QUAL} of secondary alignments should be set
    to `*' to reduce the file size.
  \end{enumerate}
%\item There should be no overlap between segments of a read\footnote{Few/no
%  existing aligners follow this practice.}.
\item Optional tags:
  \begin{enumerate}[label=\arabic*]
  \item If the template has more than 2 segments, the {\tt TC} tag
    should be present.
  \item The {\tt NM} tag should be present.
  \end{enumerate}
\item Annotation dummy reads:
  These have {\sf SEQ} set to {\tt *}, {\sf FLAG} bits 0x100 and 0x200
  set (secondary and filtered), and a {\tt CT} tag.
  \begin{enumerate}[label=\arabic*]
%Repeating what is in the tag's footnote:
%  \item If the {\tt CT} tag's \emph{strand} is {\tt -}, FLAG bit 0x10
%  (reverse complemented) should be set, and otherwise not set.
  \item If you wish to store free text in a {\tt CT} tag, use the
  \emph{key} value {\tt Note} (uppercase N) to match GFF3.
  \item Multi-segment annotation (e.g. a gene with introns) should be
  described with multiple lines in SAM (like a multi-segment read).
  Where there is a clear biological direction (e.g. a gene), the first
  segment ({\sf FLAG} bit 0x40) is used for the first section (e.g. the
  $5'$ end of the gene). Thus a GenBank entry location like
  {\tt complement(join(85052..85354,} {\tt 85441..85621,} {\tt 86097..86284))}
  would have three lines in SAM with a common {\sf QNAME}: %And three lines in GFF3 too.
    \begin{enumerate}
    \item The $5'$ fragment {\sf FLAG}  883, {\sf POS} 86097, {\sf CIGAR} {\tt 188M}, and tags {\tt FI:i:1} and {\tt TC:i:3} %FLAG = 0x1 + 0x2 + 0x10 + 0x20 + 0x40 + 0x100 + 0x200
    \item Middle fragment {\sf FLAG} 819, {\sf POS} 85441, {\sf CIGAR} {\tt 181M}, and tags {\tt FI:i:2} and {\tt TC:i:3} %FLAG = 0x1 + 0x2 + 0x10 + 0x20 + 0x100 + 0x200
    \item The $3'$ fragment {\sf FLAG} 947, {\sf POS} 85052, {\sf CIGAR} {\tt 303M}, and tags {\tt FI:i:3} and {\tt TC:i:3} %FLAG = 0x1 + 0x2 + 0x10 + 0x20 + 0x80 + 0x100 + 0x200
    \end{enumerate}
  \item If converting GFF3 to SAM, store any \emph{key}, \emph{values}
  from column 9 in the {\tt CT} tag, except for the unique ID which
  is used for the QNAME. GFF3 columns 1 (seqid), 4 (start) and 5 (end)
  are encoded using SAM columns RNAME, POS and CIGAR to hold the length.
  GFF3 columns 3 (type) and 7 (strand) are stored explicitly in the
  {\tt CT} tag. Remaining GFF3 columns 2 (source), 6 (score), and
  8 (phase) are stored in the {\tt CT} tag using \emph{key} values
  {\tt FSource}, {\tt FScore} and {\tt FPhase} (uppercase keys are
  restricted in GFF3, so these names avoid clashes). Split location
  features are described with multiple lines in GFF3, and similarly
  become multi-segment dummy reads in SAM, with the {\sf RNEXT} and
  {\sf PNEXT} columns filled in appropriately. In the absence of a
  convention in SAM/BAM for reads wrapping the origin of a circular
  genome, any GFF3 feature line wrapping the origin must be split into
  two segments in SAM.
  \end{enumerate}
\end{enumerate}

\pagebreak

\section{Guide for Describing Assembly Sequences in SAM}

\subsection{Unpadded versus padded representation}

To describe alignments, we can regard the reference sequence with no respect to
other alignments against it.  Such a reference sequence is called an
\emph{unpadded reference}. A position on an unpadded reference, referred to
as an \emph{unpadded position}, is not affected by any alignments. When we use
unpadded references and positions to describe alignments, we say we are using
the \emph{unpadded representation}.

Alternatively, to describe the same alignments, we can modify the reference
sequence to contain pads that make room for sequences inserted relative to the
reference. A pad is effectively a gap and conventionally represented by an
asterisk `*'.  A reference sequence containing pads is called a \emph{padded
reference}. A position which counts the *'s is referred to as a \emph{padded
position}. A padded reference sequence may be affected by the query alignments
and because of gap insertions is typically longer than the unpadded reference.
The padded position of one query alignment may be affected by other query
alignments.

Unpadded and padded are different representations of the same alignments. They
are convertible to each other with no loss of any information. The unpadded
representation is more common due to the convenience of a fixed coordinate system,
while the padded representation has the advantage that alignments can be simply
described by the start and end coordinates without using complex CIGAR strings.
SAM traditionally uses the padded representation for {\it de novo} assembly.
The ACE assembly format uses the padded representation exclusively.

\subsection{Padded SAM}

The SAM format is typically used to describe alignments against an unpadded
reference sequence, but it is also able to describe alignments against a padded
reference. In the latter case, we say we are using a \emph{padded SAM}. A padded
SAM is a valid SAM, but with the difference that the reference and positions in
use are padded. There may be more than one way to describe the padded
representation. We recommend the following.

In a padded SAM, alignments and coordinates are described with respect to the
padded reference sequence. Unlike traditional padded representations like
the ACE file format where pads/gaps are recorded in reads using *'s, we do
not write *'s in the {\sf SEQ} field of the SAM format\footnote{Writing
pads/gaps as *'s in the {\sf SEQ} field might have been more convenient, but
this caused concerns for backward compatibility.}. Instead, we describe pads
in the query sequences as deletions from the padded reference using the
{\sf CIGAR} `{\tt D}' operation. In a padded SAM, the insertion and padding
CIGAR operations (`{\tt I}' and `{\tt P}') are not used because the padded
reference already considers all the insertions.

The following shows the padded SAM for the example alignment in Section~\ref{sec:example}.
Notably, the length of {\tt ref} is 47 instead of 45. {\sf POS} of the last
three alignments are all shifted by 2. {\sf CIGAR} of alignments bridging the
2bp insertion are also changed.

\begin{framed}\small
\begin{verbatim}
@HD VN:1.3 SO:coordinate
@SQ SN:ref LN:47
ref  516 ref  1  0 14M2D31M   *  0   0 AGCATGTTAGATAAGATAGCTGTGCTAGTAGGCAGTCAGCGCCAT *
r001 163 ref  7 30 14M1D3M    = 39  41 TTAGATAAAGGATACTG *
*    768 ref  8 30 1M         *  0   0 *                 *  CT:Z:.;Warning;Note=Ref wrong?
r002   0 ref  9 30 3S6M1D5M   *  0   0 AAAAGATAAGGATA    *  PT:Z:1;4;+;homopolymer
r003   0 ref  9 30 5H6M       *  0   0 AGCTAA            *  NM:i:1
r004   0 ref 18 30 6M14N5M    *  0   0 ATAGCTTCAGC       *
r003  16 ref 31 30 6H5M       *  0   0 TAGGC             *  NM:i:0
r001  83 ref 39 30 9M         =  7 -41 CAGCGCCAT         *
\end{verbatim}
\end{framed}

Here we also exemplify the recommended practice for storing the reference
sequence and the reference annotations in SAM when necessary. For a reference
sequence in SAM, {\sf QNAME} should be identical to {\sf RNAME}, {\sf POS} set
to 1 and {\sf FLAG} to 516 (filtered and unmapped); for an annotation, {\sf
FLAG} should be set to 768 (filtered and secondary) with no restriction to {\sf
QNAME}. Dummy reads for annotation would typically have a `{\tt CT}' tag to
hold the annotation information, see Section~\ref{sec-recommended-practice}.

\pagebreak

\section{The BAM Format Specification}

\subsection{The BGZF compression format}

BGZF is block compression implemented on top of the standard gzip file
format. The goal of BGZF is to provide good compression while allowing
efficient random access to the BAM file for indexed queries. The BGZF
format is `gunzip compatible', in the sense that a compliant gunzip
utility can decompress a BGZF compressed file\footnote{It is worth noting that there is a known bug in the Java {\sf
  GZIPInputStream} class that concatenated gzip archives cannot be
successfully decompressed by this class. BGZF files can be created and
manipulated using the built-in Java {\sf util.zip} package, but naive
use of {\sf GZIPInputStream} on a BGZF file will not work due to this
bug.}.

A BGZF archive is a series of concatenated BGZF blocks. Each BGZF block
is itself a spec-compliant gzip archive which contains an ``extra field''
in the format described in RFC1952. The gzip file format allows the
inclusion of application-specific extra fields and these are ignored by
compliant decompression implementations. The gzip specification also
allows gzip files to be concatenated. The result of decompressing
concatenated gzip files is the concatenation of the uncompressed data.

Each BGZF block contains a standard gzip file header with the following
standard-compliant extensions:

\begin{enumerate}
\item The {\sf F.EXTRA} bit in the header is set to indicate that extra
  fields are present.
\item The extra field used by BGZF uses the two subfield ID values 66 and 67 (ascii `BC').
\item The length of the BGZF extra field payload (field {\sf LEN} in the
  gzip specification) is 2 (two bytes of payload).
\item The payload of the BGZF extra field is a 16-bit unsigned integer
  in little endian format. This integer gives the size of the containing
  BGZF block minus one.
\end{enumerate}

On disk, a full BGZF file is (all integers are little endian as is
required by RFC1952):
\begin{table}[ht]
\centering
{\small
\begin{tabular}{|l|l|l|l|l|r|}
  \cline{1-6}
  \multicolumn{3}{|c|}{\bf Field} & \multicolumn{1}{c|}{\bf Description} & \multicolumn{1}{c|}{\bf Type} & \multicolumn{1}{c|}{\bf Value} \\\cline{1-6}
  \multicolumn{6}{|c|}{\textcolor{gray}{\it List of compression blocks (until the end of the file)}} \\\cline{2-6}
  & \multicolumn{2}{l|}{\sf ID1} & gzip IDentifier1 & {\tt uint8\_t} & 31 \\\cline{2-6}
  & \multicolumn{2}{l|}{\sf ID2} & gzip IDentifier2 & {\tt uint8\_t} & 139 \\\cline{2-6}
  & \multicolumn{2}{l|}{\sf CM} & gzip Compression Method & {\tt uint8\_t} & 8 \\\cline{2-6}
  & \multicolumn{2}{l|}{\sf FLG} & gzip FLaGs & {\tt uint8\_t} & 4 \\\cline{2-6}
  & \multicolumn{2}{l|}{\sf MTIME} & gzip Modification TIME & {\tt uint32\_t} & \\\cline{2-6}
  & \multicolumn{2}{l|}{\sf XFL} & gzip eXtra FLags & {\tt uint8\_t} & \\\cline{2-6}
  & \multicolumn{2}{l|}{\sf OS} & gzip Operating System & {\tt uint8\_t} & \\\cline{2-6}
  & \multicolumn{2}{l|}{\sf XLEN} & gzip eXtra LENgth & {\tt uint16\_t} & \\\cline{2-6}
  & \multicolumn{5}{c|}{\textcolor{gray}{\it Extra subfield(s) (total size=XLEN)}} \\\cline{3-6}
  & & \multicolumn{4}{c|}{\textcolor{gray}{\it Additional RFC1952 extra subfields if present}} \\\cline{3-6}
  & & {\sf SI1} & Subfield Identifier1 & {\tt uint8\_t} & 66 \\\cline{3-6}
  & & {\sf SI2} & Subfield Identifier2 & {\tt uint8\_t} & 67 \\\cline{3-6}
  & & {\sf SLEN} & Subfield LENgth & {\tt uint16\_t} & 2 \\\cline{3-6}
  & & {\sf BSIZE} & total Block SIZE minus 1 & {\tt uint16\_t} & \\\cline{3-6}
  & & \multicolumn{4}{c|}{\textcolor{gray}{\it Additional RFC1952 extra subfields if present}} \\\cline{2-6}
  & \multicolumn{2}{l|}{\sf CDATA} & Compressed DATA by {\sf zlib::deflate()} & {\tt uint8\_t[{\sf BSIZE-XLEN-19}]} & \\\cline{2-6}
  & \multicolumn{2}{l|}{\sf CRC32} & CRC-32 & {\tt uint32\_t} & \\\cline{2-6}
  & \multicolumn{2}{l|}{\sf ISIZE} & Input SIZE (length of uncompressed data) & {\tt uint32\_t} & \\
  \cline{1-6}
\end{tabular}}
\end{table}

BGZF files support random access through the BAM file index. To achieve
this, the BAM file index uses \emph{virtual file offsets} into the BGZF
file. Each virtual file offset is an unsigned 64-bit integer, defined as: {\tt
  coffset\char60\char60 16\char124uoffset}, where {\tt coffset} is an
unsigned byte offset into the BGZF file to the beginning of a BGZF
block, and {\tt uoffset} is an unsigned byte offset into the
uncompressed data stream represented by that BGZF block. Virtual file
offsets can be compared, but subtraction between virtual file offsets
and addition between a virtual offset and an integer are both
disallowed.

\subsection{The BAM format}
BAM is compressed in the BGZF format. All multi-byte numbers in BAM are
little-endian, regardless of the machine endianness. The format is
formally described in the following table where values in brackets are
the default when the corresponding information is not available; an
underlined word in uppercase denotes a field in the SAM format.

\begin{table}[ht]
\centering
{\small
\begin{tabular}{|l|l|l|p{8.15cm}|l|r|}
  \cline{1-6}
  \multicolumn{3}{|c|}{\bf Field} & \multicolumn{1}{c|}{\bf Description} & \multicolumn{1}{c|}{\bf Type} & \multicolumn{1}{c|}{\bf Value} \\\cline{1-6}
  \multicolumn{3}{|l|}{\sf magic} & BAM magic string & {\tt char[4]} & {\tt BAM\char92 1}\\\cline{1-6}
  \multicolumn{3}{|l|}{\sf l\_text} & Length of the header text, including any {\tt NULL} padding & {\tt int32\_t} & \\\cline{1-6}
  \multicolumn{3}{|l|}{\sf text} & Plain header text in SAM; not necessarily {\tt NULL} terminated & {\tt char[{\sf l\_text}]} & \\\cline{1-6}
  \multicolumn{3}{|l|}{\sf n\_ref} & \# reference sequences & {\tt int32\_t} & \\\cline{1-6}
  \multicolumn{6}{|c|}{\textcolor{gray}{\it List of reference information (n=n\_ref)}} \\\cline{2-6}
  & \multicolumn{2}{l|}{\sf l\_name} & Length of the reference name plus 1 (including {\tt NULL}) & {\tt int32\_t} & \\\cline{2-6}
  & \multicolumn{2}{l|}{\sf name} & Reference sequence name; {\tt NULL} terminated & {\tt char[{\sf l\_name}]} & \\\cline{2-6}
  & \multicolumn{2}{l|}{\sf l\_ref} & Length of the reference sequence & {\tt int32\_t} & \\\cline{1-6}
  \multicolumn{6}{|c|}{\textcolor{gray}{\it List of alignments (until the end of the file)}} \\\cline{2-6}
  & \multicolumn{2}{l|}{\sf block\_size} & Length of the remainder of the alignment record & {\tt int32\_t} & \\\cline{2-6}
  & \multicolumn{2}{l|}{\sf refID} & Reference sequence ID, $-1\leq{\sf refID}<{\sf n\_ref}$; -1 for a read without a mapping position. & {\tt int32\_t} & [-1] \\\cline{2-6}
  & \multicolumn{2}{l|}{\sf pos} & 0-based leftmost coordinate ($=\underline{\sf POS}-1$)& {\tt int32\_t} & [-1]\\\cline{2-6}
  & \multicolumn{2}{l|}{\sf bin\_mq\_nl} & {\tt{\sf bin}\char60\char60 16\char124\underline{\sf MAPQ}\char60\char60 8\char124{\sf l\_read\_name}}; {\sf bin} is computed by the {\sf reg2bin()} function in Section~\ref{sec:code}; {\sf l\_read\_name} is the length of {\sf read\_name} below ($={\sf length}(\underline{\sf QNAME})+1$). & {\tt uint32\_t} & \\\cline{2-6}
  & \multicolumn{2}{l|}{\sf flag\_nc} & {\tt \underline{\sf FLAG}\char60\char60 16\char124{\sf n\_cigar\_op}}; {\sf n\_cigar\_op} is the number of operations in \underline{\sf CIGAR}. & {\tt uint32\_t} & \\\cline{2-6}
  & \multicolumn{2}{l|}{\sf l\_seq} & Length of \underline{\sf SEQ} & {\tt int32\_t} & \\\cline{2-6}
  & \multicolumn{2}{l|}{\sf next\_refID} & Ref-ID of the next segment ($-1\le{\sf next\_refID}<{\sf n\_ref}$) & {\tt int32\_t} & [-1] \\\cline{2-6}
  & \multicolumn{2}{l|}{\sf next\_pos} & 0-based leftmost pos of the next segment ($=\underline{\sf PNEXT}-1$) & {\tt int32\_t} & [-1] \\\cline{2-6}
  & \multicolumn{2}{l|}{\sf tlen} & Template length ($=\underline{\sf TLEN}$) & {\tt int32\_t} & [0] \\\cline{2-6}
  & \multicolumn{2}{l|}{\sf read\_name} & Read name\footnotemark[1], {\tt NULL} terminated (\underline{\sf QNAME} plus a trailing `{\tt \char92 0}') & {\tt char[{\sf l\_read\_name}]} & \\\cline{2-6}
  & \multicolumn{2}{l|}{\sf cigar} & CIGAR: {\tt {\sf op\_len}\char60\char60 4\char124{\sf op}}. `{\tt MIDNSHP\char61X}'$\to$`012345678' & {\tt uint32\_t[{\sf n\_cigar\_op}]} & \\\cline{2-6}
  & \multicolumn{2}{l|}{\sf seq} & 4-bit encoded read: `{\tt =ACMGRSVTWYHKDBN}'$\to[0,15]$; other characters mapped to `{\tt N}'; high nybble first (1st base in the highest 4-bit of the 1st byte) & {\tt uint8\_t[({\sf l\_seq}+1)/2]} & \\\cline{2-6}
  & \multicolumn{2}{l|}{\sf qual} & Phred base quality (a sequence of {\tt 0xFF} if absent) & {\tt char[{\sf l\_seq}]} & \\\cline{2-6}
  & \multicolumn{5}{c|}{\textcolor{gray}{\it List of auxiliary data (until the end of the alignment block)}} \\\cline{3-6}
  & & {\sf tag} & Two-character tag & {\tt char[2]} & \\\cline{3-6}
  & & {\sf val\_type} & Value type: {\tt AcCsSiIfZHB}\footnotemark[2]$^,$\footnotemark[3] & {\tt char} & \\\cline{3-6}
  & & {\sf value} & Tag value & (by {\sf val\_type}) &\\
  \cline{1-6}
\end{tabular}}
\end{table}
\footnotetext[1]{For backward compatibility, a {\sf QNAME} `{\tt *}' is stored as a C string {\tt "*\char92 0"}.}
\footnotetext[2]{An integer may be stored as one of `{\tt cCsSiI}' in BAM, representing {\tt int8\_t}, {\tt uint8\_t},
	{\tt int16\_t}, {\tt uint16\_t}, {\tt int32\_t} and {\tt uint32\_t}, respectively. In SAM, all single integer types are mapped to {\tt int32\_t}.}
\footnotetext[3]{A `{\tt B}'-typed (array) tag--value pair is stored as follows. The first two bytes keep the two-character tag. The 3rd byte is always `{\tt B}'.
	The 4th byte, matching {\tt /\char94[cCsSiIf]\$/}, indicates the type of an element in the array.
	Bytes from 5 to 8 encode a little-endian 32-bit integer which gives the number of elements in the array.
	Bytes starting from the 9th store the array in the little-endian byte order; the number of these
	bytes is determined by the type and the length of the array.}

\pagebreak

\section{Indexing BAM}
Indexing aims to achieve fast retrieval of alignments overlapping a
specified region without going through the whole alignments. BAM must be
sorted by the reference ID and then the leftmost coordinate before
indexing.

\subsection{Algorithm}
\subsubsection{Basic binning index}
The UCSC binning scheme was suggested by Richard Durbin and Lincoln
Stein and is explained by Kent et al. (2002). In this scheme, each bin
represents a contiguous genomic region which is either fully contained
in or non-overlapping with another bin; each alignment is associated
with a bin which represents the smallest region containing the entire
alignment. The binning scheme is essentially a representation of an
R-tree. A distinct bin uniquely corresponds to a distinct internal node
in an R-tree. Bin A is a child of Bin B if the region represented by A is
contained in B.

To find the alignments that overlap a specified region, we need to get
the bins that overlap the region, and then test each alignment in the
bins to check overlap. To quickly find alignments associated with a
specified bin, we can keep in the index the start file offsets of chunks
of alignments which all have the bin. As alignments are sorted by the
leftmost coordinates, alignments having the same bin tend to be
clustered together on the disk and therefore usually a bin is only
associated with a few chunks. Traversing all the alignments having the
same bin usually needs a few seek calls. Given the set of bins that
overlap the specified region, we can visit alignments in the order of
their leftmost coordinates and stop seeking the rest when an alignment
falls outside the required region. This strategy saves half of the seek
calls on average.

In BAM, each bin may span $2^{29}$, $2^{26}$, $2^{23}$, $2^{20}$,
$2^{17}$ or $2^{14}$ bp\footnote{Due to a limitation in the current indexing
scheme, a chromosome sequence longer than $2^{29}-1$ is not supported during
indexing.}. Bin 0 spans a 512Mbp region, bins 1--8 span 64Mbp, 9--72 8Mbp,
73--584 1Mbp, 585--4680 128Kbp and bins 4681--37448
span 16Kbp regions.

\subsubsection{Reducing small chunks}
Around the boundary of two adjacent bins, we may see many small chunks
with some having a shorter bin while the rest having a larger bin. To
reduce the number of seek calls, we may join two chunks having the same
bin if they are close to each other. After this process, a joined chunk
will contain alignments with different bins. We need to keep in the
index the file offset of the end of each chunk to identify its
boundaries.

\subsubsection{Combining with linear index}
For an alignment starting beyond 64Mbp, we always need to seek to some
chunks in bin 0, which can be avoided by using a linear index. In the
linear index, for each tiling 16384bp window on the reference, we record
the smallest file offset of the alignments that start in the
window. Given a region [rbeg,rend), we only need to visit a chunk whose
end file offset is larger than the file offset of the 16kbp window
containing rbeg.

With both binning and linear indices, we can retrieve alignments in most
of regions with just one seek call.

\subsubsection{A conceptual example}
Suppose we have a genome shorter than 144kbp. We can design a binning
scheme which consists of three types of bins: bin 0 spans 0--144kbp, bins
1, 2 and 3 span 48kbp each and bins from 4 to 12 span 16kbp each:

\begin{table}[ht]
  \centering
  {\small\begin{tabular}{|c|c|c|c|c|c|c|c|c|}
    \cline{1-9}
    \multicolumn{9}{|c|}{0 (0--144kbp)}\\\cline{1-9}
    \multicolumn{3}{|c|}{1 (0--48kbp)} & \multicolumn{3}{c|}{2 (48--96kbp)} & \multicolumn{3}{c|}{3 (96--144kbp)} \\\cline{1-9}
    4 (0--16k) & 5 (16--32k) & 6 (32--48k) & 7 (48--64k) & 8 (64--80k) & 9 (80--96k) & 10 & 11 & 12 \\
    \cline{1-9}
  \end{tabular}}
\end{table}

An alignment starting at 65kbp and ending at 67kbp would have a bin
number 8, which is the smallest bin containing the alignment. Similarly,
an alignment starting at 51kbp and ending at 70kbp would go to bin 2,
while an alignment between [40k,49k] to bin 0. Suppose we want to find
all the alignments overlapping region [65k,71k). We first calculate that
bin 0, 2 and 8 overlap with this region and then traverse the alignments
in these bins to find the required alignments. With a binning index
alone, we need to visit the alignment at [40k,49k] as it belongs to bin
0. But with a linear index, we know that such an alignment stops before
64kbp and cannot overlap the specified region. A seek call can thus be
saved.

\subsection{The BAM indexing format}
\begin{table}[ht]
{\small
\begin{tabular}{|l|l|l|l|l|l|r|}
  \cline{1-7}
  \multicolumn{4}{|c|}{\bf Field} & \multicolumn{1}{c|}{\bf Description} & \multicolumn{1}{c|}{\bf Type} & \multicolumn{1}{c|}{\bf Value} \\\cline{1-7}
  \multicolumn{4}{|l|}{\sf magic} & Magic string & {\tt char[4]} & {\tt BAI\char92 1}\\\cline{1-7}
  \multicolumn{4}{|l|}{\sf n\_ref} & \# reference sequences & {\tt int32\_t} & \\\cline{1-7}
  \multicolumn{7}{|c|}{\textcolor{gray}{\it List of indices (n=n\_ref)}} \\\cline{2-7}
  & \multicolumn{3}{l|}{\sf n\_bin} & \# distinct bins (for the binning index) & {\tt int32\_t} & \\\cline{2-7}
  & \multicolumn{6}{c|}{\textcolor{gray}{\it List of distinct bins (n=n\_bin)}} \\\cline{3-7}
  & & \multicolumn{2}{l|}{\sf bin} & Distinct bin & {\tt uint32\_t} & \\\cline{3-7}
  & & \multicolumn{2}{l|}{\sf n\_chunk} & \# chunks & {\tt int32\_t} & \\\cline{3-7}
  & & \multicolumn{5}{c|}{\textcolor{gray}{\it List of chunks (n=n\_chunk)}} \\\cline{4-7}
  & & & {\sf chunk\_beg} & (Virtual) file offset of the start of the chunk & {\tt uint64\_t} & \\\cline{4-7}
  & & & {\sf chunk\_end} & (Virtual) file offset of the end of the chunk & {\tt uint64\_t} & \\\cline{2-7}
  & \multicolumn{3}{l|}{\sf n\_intv} & \# 16kbp intervals (for the linear index) & {\tt int32\_t} & \\\cline{2-7}
  & \multicolumn{6}{c|}{\textcolor{gray}{\it List of intervals (n=n\_intv)}} \\\cline{3-7}
  & & \multicolumn{2}{l|}{\sf ioffset} & (Virtual) file offset of the first alignment in the interval & {\tt uint64\_t} & \\
  \cline{1-7}
\end{tabular}}
\end{table}

\subsection{C source code for computing bin number and overlapping bins}\label{sec:code}

{\small
\begin{verbatim}
/* Compute the bin for an alignment covering [beg,end) (zero-based, half-closed-half-open):
 * the smallest bin that contains the whole interval. */
int reg2bin(int beg, int end)
{
    int shift;
    --end;    /* work with the last covered position */
    /* try the levels from the narrowest (16kbp, shift 14) up to 64Mbp (shift 26) */
    for (shift = 14; shift <= 26; shift += 3)
        if (beg>>shift == end>>shift)
            return ((1<<(29-shift))-1)/7 + (beg>>shift);
    return 0;    /* no narrower bin holds the interval, so it goes to bin 0 */
}
/* List the bins that may overlap with region [beg,end) (zero-based); the bin numbers
 * are written to list[] and the number of bins written is returned. */
#define MAX_BIN (((1<<18)-1)/7)
int reg2bins(int beg, int end, uint16_t list[MAX_BIN])
{
    int n = 0, shift, bin, first, last;
    --end;
    list[n++] = 0;    /* bin 0 is always included */
    /* one pass per level below bin 0, widest (shift 26) first */
    for (shift = 26; shift >= 14; shift -= 3) {
        first = ((1<<(29-shift))-1)/7;    /* number of the first bin at this level */
        last = first + (end>>shift);
        for (bin = first + (beg>>shift); bin <= last; ++bin) list[n++] = bin;
    }
    return n;
}
\end{verbatim}
}

\end{document}