Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Iteration over SAM header lines and header expression filtering #1351

Draft
wants to merge 2 commits into
base: develop
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -459,7 +459,7 @@ hts-object-files: $(LIBHTS_OBJS)
bgzf.o bgzf.pico: bgzf.c config.h $(htslib_hts_h) $(htslib_bgzf_h) $(htslib_hfile_h) $(htslib_thread_pool_h) $(htslib_hts_endian_h) cram/pooled_alloc.h $(hts_internal_h) $(htslib_khash_h)
errmod.o errmod.pico: errmod.c config.h $(htslib_hts_h) $(htslib_ksort_h) $(htslib_hts_os_h)
kstring.o kstring.pico: kstring.c config.h $(htslib_kstring_h)
header.o header.pico: header.c config.h $(textutils_internal_h) $(header_h)
header.o header.pico: header.c config.h $(htslib_hts_expr_h) $(textutils_internal_h) $(header_h)
hfile.o hfile.pico: hfile.c config.h $(htslib_hfile_h) $(hfile_internal_h) $(htslib_kstring_h) $(hts_internal_h) $(htslib_khash_h)
hfile_gcs.o hfile_gcs.pico: hfile_gcs.c config.h $(htslib_hts_h) $(htslib_kstring_h) $(hfile_internal_h)
hfile_libcurl.o hfile_libcurl.pico: hfile_libcurl.c config.h $(hfile_internal_h) $(htslib_hts_h) $(htslib_kstring_h) $(htslib_khash_h)
Expand Down
159 changes: 158 additions & 1 deletion header.c
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#include <string.h>
#include <assert.h>
#include <errno.h>
#include "htslib/hts_expr.h"
#include "textutils_internal.h"
#include "header.h"

Expand Down Expand Up @@ -898,6 +899,84 @@ static int sam_hrecs_parse_lines(sam_hrecs_t *hrecs, const char *hdr, size_t len
return 0;
}

static int sam_hrecs_tag_lookup(sam_hrec_type_t *h_type, const char *tagname,
char type, hts_expr_val_t *result) {
sam_hrec_tag_t *tag = sam_hrecs_find_key(h_type, tagname, NULL);
if (tag == NULL || tag->len < 3) {
result->is_str = (type == 'Z');
result->s.l = 0;
result->d = 0.0;
result->is_true = 0;
return 0;
}

result->is_true = 1;
switch (type) {
case 'f':
result->is_str = 0;
result->d = strtod(&tag->str[3], NULL);
return 0;

case 'i':
result->is_str = 0;
result->d = strtoll(&tag->str[3], NULL, 0);
return 0;

case 'Z':
result->is_str = 1;
ks_clear(&result->s);
return (kputsn(&tag->str[3], tag->len-3, &result->s) >= 0)? 0 : -1;
}

return -1;
}

static int sam_hrecs_sym_lookup(void *data, char *str, char **end,
hts_expr_val_t *result) {
sam_hrec_type_t *h_type = (sam_hrec_type_t *) data;

switch (*str) {
case 't':
if (memcmp(str, "type", 4) == 0) {
*end = &str[4];
result->is_str = 1;
ks_clear(&result->s);
kputc_(h_type->type >> 8, &result->s);
kputc(h_type->type & 0xff, &result->s);
return (ks_len(&result->s) == 2)? 0 : -1;
}
break;

case '@':
if (isalpha_c(str[1]) && isalpha_c(str[2])) {
*end = &str[3];
result->is_str = 0;
result->is_true = result->d = (h_type->type == TYPEKEY(&str[1]));
return 0;
}
break;

case '[':
if (str[1] & str[2]) {
if (str[3] == ']') {
*end = &str[4];
return sam_hrecs_tag_lookup(h_type, &str[1], 'Z', result);
}
else if (str[3] == ':' && str[4] && str[5] == ']') {
if (!strchr("fiZ", str[4])) {
hts_log_error("Unrecognised %.6s type code", str);
return -1;
}
*end = &str[6];
return sam_hrecs_tag_lookup(h_type, &str[1], str[4], result);
}
}
break;
}

return -1;
}

/*! Update sam_hdr_t target_name and target_len arrays
*
* @return 0 on success; -1 on failure
Expand Down Expand Up @@ -1271,6 +1350,18 @@ int sam_hdr_rebuild(sam_hdr_t *bh) {
return 0;
}

sam_hdr_line_t *sam_hdr_first_line(sam_hdr_t *bh) {
if (!bh->hrecs) {
if (sam_hdr_fill_hrecs(bh) != 0) return NULL;
}
return bh->hrecs->first_line;
}

sam_hdr_line_t *sam_hdr_next_line(sam_hdr_t *bh, sam_hdr_line_t *line) {
line = line->global_next;
return (line != bh->hrecs->first_line)? line : NULL;
}

/*
* Appends a formatted line to an existing SAM header.
* Line is a full SAM header record, eg "@SQ\tSN:foo\tLN:100", with
Expand Down Expand Up @@ -1346,6 +1437,13 @@ int sam_hdr_add_line(sam_hdr_t *bh, const char *type, ...) {
return ret;
}

int sam_hdr_format_line_append(const sam_hdr_line_t *line, kstring_t *ks) {
if (!line) return -1;

if (build_header_line(line, ks) < 0) return -2;
return 0;
}

/*
* Returns a complete line of formatted text for a specific head type/ID
* combination. If ID_key is NULL then it returns the first line of the specified
Expand Down Expand Up @@ -1399,6 +1497,27 @@ int sam_hdr_find_line_pos(sam_hdr_t *bh, const char *type,
return 0;
}

/*
* Remove a line from the header via an iterator.
*/

sam_hdr_line_t *sam_hdr_remove_line(sam_hdr_t *bh, sam_hdr_line_t *line) {
if (!bh || !line) return NULL;

if (line->type == TYPEKEY("PG")) {
hts_log_warning("Removing PG lines is not supported!");
return NULL;
}

sam_hdr_line_t *next = sam_hdr_next_line(bh, line);
char type[2] = { line->type >> 8, line->type & 0xff };
if (sam_hrecs_remove_line(bh->hrecs, type, line, 1) < 0) return NULL;

if (bh->hrecs->refs_changed >= 0 && rebuild_target_arrays(bh) != 0) return NULL;
if (bh->hrecs->dirty) redact_header_text(bh);
return next;
}

/*
* Remove a line from the header by specifying a tag:value that uniquely
* identifies a line, i.e. the @SQ line containing "SN:ref1".
Expand Down Expand Up @@ -1768,6 +1887,21 @@ int sam_hdr_remove_lines(sam_hdr_t *bh, const char *type, const char *id, void *
return ret;
}

int sam_hdr_passes_filter(sam_hdr_t *bh, sam_hdr_line_t *line, hts_filter_t *filt) {
hts_expr_val_t result = HTS_EXPR_VAL_INIT;
int ret;
if (hts_filter_eval2(filt, line, sam_hrecs_sym_lookup, &result) >= 0) {
ret = result.is_true;
}
else {
hts_log_error("Couldn't process sam_hdr filter expression");
ret = -1;
}

hts_expr_val_free(&result);
return ret;
}

int sam_hdr_count_lines(sam_hdr_t *bh, const char *type) {
int count;
sam_hrec_type_t *first_ty, *itr_ty;
Expand Down Expand Up @@ -1908,6 +2042,19 @@ const char *sam_hdr_line_name(sam_hdr_t *bh,

/* ==== Key:val level methods ==== */

int sam_hdr_find_tag(const sam_hdr_line_t *line,
const char *key,
kstring_t *ks) {
if (!line || !key) return -2;

sam_hrec_tag_t *tag = sam_hrecs_find_key(line, key, NULL);
if (!tag || tag->len < 3) return -1;

ks_clear(ks);
if (kputsn(&tag->str[3], tag->len-3, ks) < 0) return -2;
return 0;
}

int sam_hdr_find_tag_id(sam_hdr_t *bh,
const char *type,
const char *ID_key,
Expand Down Expand Up @@ -1940,6 +2087,16 @@ int sam_hdr_find_tag_id(sam_hdr_t *bh,
return 0;
}

int sam_hdr_remove_tag(sam_hdr_t *bh,
sam_hdr_line_t *line,
const char *key) {
if (!bh || !line || !key) return -1;

int ret = sam_hrecs_remove_key(bh->hrecs, line, key);
if (ret == 0 && bh->hrecs->dirty) redact_header_text(bh);
return ret;
}

int sam_hdr_find_tag_pos(sam_hdr_t *bh,
const char *type,
int pos,
Expand Down Expand Up @@ -2635,7 +2792,7 @@ static int sam_hrecs_update(sam_hrecs_t *hrecs, sam_hrec_type_t *type, ...) {
* Returns the tag pointer on success
* NULL on failure
*/
sam_hrec_tag_t *sam_hrecs_find_key(sam_hrec_type_t *type,
sam_hrec_tag_t *sam_hrecs_find_key(const sam_hrec_type_t *type,
const char *key,
sam_hrec_tag_t **prev) {
sam_hrec_tag_t *tag, *p = NULL;
Expand Down
2 changes: 1 addition & 1 deletion header.h
Original file line number Diff line number Diff line change
Expand Up @@ -290,7 +290,7 @@ void sam_hrecs_free(sam_hrecs_t *hrecs);
sam_hrec_type_t *sam_hrecs_find_type_id(sam_hrecs_t *hrecs, const char *type,
const char *ID_key, const char *ID_value);

sam_hrec_tag_t *sam_hrecs_find_key(sam_hrec_type_t *type,
sam_hrec_tag_t *sam_hrecs_find_key(const sam_hrec_type_t *type,
const char *key,
sam_hrec_tag_t **prev);

Expand Down
93 changes: 90 additions & 3 deletions htslib/sam.h
Original file line number Diff line number Diff line change
Expand Up @@ -44,6 +44,9 @@ DEALINGS IN THE SOFTWARE. */
extern "C" {
#endif

struct hts_filter_t;
struct sam_hrec_type_s;

/// Highest SAM format version supported by this library
#define SAM_FORMAT_VERSION "1.6"

Expand Down Expand Up @@ -466,6 +469,22 @@ int sam_hdr_nref(const sam_hdr_t *h);

/* ==== Line level methods ==== */

/*! @typedef
* @abstract Opaque type used as an iterator over header lines.
*/
typedef struct sam_hrec_type_s sam_hdr_line_t;

/// Return an iterator pointing to the first header line
HTSLIB_EXPORT
sam_hdr_line_t *sam_hdr_first_line(sam_hdr_t *h);

/// Return an iterator pointing to the next header line
/*!
* @return An iterator pointing to the next line, or NULL if there is none.
*/
HTSLIB_EXPORT
sam_hdr_line_t *sam_hdr_next_line(sam_hdr_t *h, sam_hdr_line_t *line);

/// Add formatted lines to an existing header.
/*!
* @param lines Full SAM header record, eg "@SQ\tSN:foo\tLN:100", with
Expand Down Expand Up @@ -498,6 +517,36 @@ int sam_hdr_add_lines(sam_hdr_t *h, const char *lines, size_t len);
HTSLIB_EXPORT
int sam_hdr_add_line(sam_hdr_t *h, const char *type, ...);

/// Returns a complete line of formatted text for the line pointed to.
/*!
* @param line Iterator pointing to a header line
* @param ks kstring to which to append the result
* @return 0 on success;
* -1 if @p line does not point to a header line
* -2 on other failures
*
* Puts a complete line of formatted text for a specific line into @p ks.
* Appends the text to the existing content in @p ks, if any.
*/
HTSLIB_EXPORT
int sam_hdr_format_line_append(const sam_hdr_line_t *line, kstring_t *ks);

/// Returns a complete line of formatted text for the line pointed to.
/*!
* @param line Iterator pointing to a header line
* @param ks kstring to hold the result
* @return 0 on success;
* -1 if @p line does not point to a header line
* -2 on other failures
*
* Puts a complete line of formatted text for a specific line into @p ks.
* Any existing content in @p ks will be overwritten.
*/
static inline int sam_hdr_format_line(const sam_hdr_line_t *line, kstring_t *ks)
{
return sam_hdr_format_line_append(line, ks_clear(ks));
}

/// Returns a complete line of formatted text for a given type and ID.
/*!
* @param type Type of the searched line. Eg. "SQ"
Expand Down Expand Up @@ -536,6 +585,14 @@ HTSLIB_EXPORT
int sam_hdr_find_line_pos(sam_hdr_t *h, const char *type,
int pos, kstring_t *ks);

/// Remove line pointed to by iterator from a header
/*!
* @param line Iterator pointing to a header line
* @return An iterator pointing to the following line, or NULL on error FIXME or if it was the last line
*/
HTSLIB_EXPORT
sam_hdr_line_t *sam_hdr_remove_line(sam_hdr_t *h, sam_hdr_line_t *line);

/// Remove a line with given type / id from a header
/*!
* @param type Type of the searched line. Eg. "SQ"
Expand Down Expand Up @@ -687,6 +744,21 @@ const char *sam_hdr_line_name(sam_hdr_t *bh, const char *type, int pos);

/* ==== Key:val level methods ==== */

/// Return the value associated with a key for a header line identified by iterator
/*!
* @param line Iterator pointing to a header line
* @param key Key of the searched tag. Eg. "LN"
* @param ks kstring where the value will be written
* @return 0 on success
* -1 if the requested tag does not exist
* -2 on other errors
*
* Looks for a specific key in the SAM header line pointed to by @p line and writes the
* associated value into @p ks. Any pre-existing content in @p ks will be overwritten.
*/
HTSLIB_EXPORT
int sam_hdr_find_tag(const sam_hdr_line_t *line, const char *key, kstring_t *ks);

/// Return the value associated with a key for a header line identified by ID_key:ID_val
/*!
* @param type Type of the line to which the tag belongs. Eg. "SQ"
Expand Down Expand Up @@ -724,6 +796,15 @@ int sam_hdr_find_tag_id(sam_hdr_t *h, const char *type, const char *ID_key, cons
HTSLIB_EXPORT
int sam_hdr_find_tag_pos(sam_hdr_t *h, const char *type, int pos, const char *key, kstring_t *ks);

/// Remove the key from the line pointed to by the iterator.
/*!
* @param line Iterator pointing to a header line
* @param key Key of the targeted tag. Eg. "M5"
* @return 1 if the key was removed; 0 if it was not present; -1 on error
*/
HTSLIB_EXPORT
int sam_hdr_remove_tag(sam_hdr_t *h, sam_hdr_line_t *line, const char *key);

/// Remove the key from the line identified by type, ID_key and ID_value.
/*!
* @param type Type of the line to which the tag belongs. Eg. "SQ"
Expand Down Expand Up @@ -779,6 +860,15 @@ hts_pos_t sam_hdr_tid2len(const sam_hdr_t *h, int tid);
*/
static inline int bam_name2id(sam_hdr_t *h, const char *ref) { return sam_hdr_name2tid(h, ref); }

/// Check whether a header line passes an hts_filter
/** @param h Pointer to the header structure previously read
* @param line Iterator pointing to a header line
* @param filt Pointer to the filter, created from hts_filter_init
* @return 1 if it passes, 0 if not, and <0 on error
*/
HTSLIB_EXPORT
int sam_hdr_passes_filter(sam_hdr_t *h, sam_hdr_line_t *line, struct hts_filter_t *filt);

/// Generate a unique \@PG ID: value
/*!
* @param name Name of the program. Eg. samtools
Expand Down Expand Up @@ -1425,9 +1515,6 @@ const char *sam_parse_region(sam_hdr_t *h, const char *s, int *tid,
HTSLIB_EXPORT
int sam_write1(samFile *fp, const sam_hdr_t *h, const bam1_t *b) HTS_RESULT_USED;

// Forward declaration, see hts_expr.h for full.
struct hts_filter_t;

/// sam_passes_filter - Checks whether a record passes an hts_filter.
/** @param h Pointer to the header structure previously read
* @param b Pointer to the BAM record to be checked
Expand Down