diff --git a/Makefile b/Makefile index ef9b5a9a4..761ffb5e8 100644 --- a/Makefile +++ b/Makefile @@ -459,7 +459,7 @@ hts-object-files: $(LIBHTS_OBJS) bgzf.o bgzf.pico: bgzf.c config.h $(htslib_hts_h) $(htslib_bgzf_h) $(htslib_hfile_h) $(htslib_thread_pool_h) $(htslib_hts_endian_h) cram/pooled_alloc.h $(hts_internal_h) $(htslib_khash_h) errmod.o errmod.pico: errmod.c config.h $(htslib_hts_h) $(htslib_ksort_h) $(htslib_hts_os_h) kstring.o kstring.pico: kstring.c config.h $(htslib_kstring_h) -header.o header.pico: header.c config.h $(textutils_internal_h) $(header_h) +header.o header.pico: header.c config.h $(htslib_hts_expr_h) $(textutils_internal_h) $(header_h) hfile.o hfile.pico: hfile.c config.h $(htslib_hfile_h) $(hfile_internal_h) $(htslib_kstring_h) $(hts_internal_h) $(htslib_khash_h) hfile_gcs.o hfile_gcs.pico: hfile_gcs.c config.h $(htslib_hts_h) $(htslib_kstring_h) $(hfile_internal_h) hfile_libcurl.o hfile_libcurl.pico: hfile_libcurl.c config.h $(hfile_internal_h) $(htslib_hts_h) $(htslib_kstring_h) $(htslib_khash_h) diff --git a/header.c b/header.c index 7f62074f0..35eb3b8c8 100644 --- a/header.c +++ b/header.c @@ -34,6 +34,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include #include #include +#include "htslib/hts_expr.h" #include "textutils_internal.h" #include "header.h" @@ -898,6 +899,84 @@ static int sam_hrecs_parse_lines(sam_hrecs_t *hrecs, const char *hdr, size_t len return 0; } +static int sam_hrecs_tag_lookup(sam_hrec_type_t *h_type, const char *tagname, + char type, hts_expr_val_t *result) { + sam_hrec_tag_t *tag = sam_hrecs_find_key(h_type, tagname, NULL); + if (tag == NULL || tag->len < 3) { + result->is_str = (type == 'Z'); + result->s.l = 0; + result->d = 0.0; + result->is_true = 0; + return 0; + } + + result->is_true = 1; + switch (type) { + case 'f': + result->is_str = 0; + result->d = strtod(&tag->str[3], NULL); + return 0; + + case 'i': + result->is_str = 0; + result->d = strtoll(&tag->str[3], NULL, 0); + return 0; + + case 'Z': + result->is_str = 1; + ks_clear(&result->s); + return (kputsn(&tag->str[3], tag->len-3, &result->s) >= 0)? 0 : -1; + } + + return -1; +} + +static int sam_hrecs_sym_lookup(void *data, char *str, char **end, + hts_expr_val_t *result) { + sam_hrec_type_t *h_type = (sam_hrec_type_t *) data; + + switch (*str) { + case 't': + if (memcmp(str, "type", 4) == 0) { + *end = &str[4]; + result->is_str = 1; + ks_clear(&result->s); + kputc_(h_type->type >> 8, &result->s); + kputc(h_type->type & 0xff, &result->s); + return (ks_len(&result->s) == 2)? 0 : -1; + } + break; + + case '@': + if (isalpha_c(str[1]) && isalpha_c(str[2])) { + *end = &str[3]; + result->is_str = 0; + result->is_true = result->d = (h_type->type == TYPEKEY(&str[1])); + return 0; + } + break; + + case '[': + if (str[1] & str[2]) { + if (str[3] == ']') { + *end = &str[4]; + return sam_hrecs_tag_lookup(h_type, &str[1], 'Z', result); + } + else if (str[3] == ':' && str[4] && str[5] == ']') { + if (!strchr("fiZ", str[4])) { + hts_log_error("Unrecognised %.6s type code", str); + return -1; + } + *end = &str[6]; + return sam_hrecs_tag_lookup(h_type, &str[1], str[4], result); + } + } + break; + } + + return -1; +} + /*! Update sam_hdr_t target_name and target_len arrays * * @return 0 on success; -1 on failure @@ -1271,6 +1350,18 @@ int sam_hdr_rebuild(sam_hdr_t *bh) { return 0; } +sam_hdr_line_t *sam_hdr_first_line(sam_hdr_t *bh) { + if (!bh->hrecs) { + if (sam_hdr_fill_hrecs(bh) != 0) return NULL; + } + return bh->hrecs->first_line; +} + +sam_hdr_line_t *sam_hdr_next_line(sam_hdr_t *bh, sam_hdr_line_t *line) { + line = line->global_next; + return (line != bh->hrecs->first_line)? line : NULL; +} + /* * Appends a formatted line to an existing SAM header. * Line is a full SAM header record, eg "@SQ\tSN:foo\tLN:100", with @@ -1346,6 +1437,13 @@ int sam_hdr_add_line(sam_hdr_t *bh, const char *type, ...) { return ret; } +int sam_hdr_format_line_append(const sam_hdr_line_t *line, kstring_t *ks) { + if (!line) return -1; + + if (build_header_line(line, ks) < 0) return -2; + return 0; +} + /* * Returns a complete line of formatted text for a specific head type/ID * combination. If ID_key is NULL then it returns the first line of the specified @@ -1399,6 +1497,27 @@ int sam_hdr_find_line_pos(sam_hdr_t *bh, const char *type, return 0; } +/* + * Remove a line from the header via an iterator. + */ + +sam_hdr_line_t *sam_hdr_remove_line(sam_hdr_t *bh, sam_hdr_line_t *line) { + if (!bh || !line) return NULL; + + if (line->type == TYPEKEY("PG")) { + hts_log_warning("Removing PG lines is not supported!"); + return NULL; + } + + sam_hdr_line_t *next = sam_hdr_next_line(bh, line); + char type[2] = { line->type >> 8, line->type & 0xff }; + if (sam_hrecs_remove_line(bh->hrecs, type, line, 1) < 0) return NULL; + + if (bh->hrecs->refs_changed >= 0 && rebuild_target_arrays(bh) != 0) return NULL; + if (bh->hrecs->dirty) redact_header_text(bh); + return next; +} + /* * Remove a line from the header by specifying a tag:value that uniquely * identifies a line, i.e. the @SQ line containing "SN:ref1". @@ -1768,6 +1887,21 @@ int sam_hdr_remove_lines(sam_hdr_t *bh, const char *type, const char *id, void * return ret; } +int sam_hdr_passes_filter(sam_hdr_t *bh, sam_hdr_line_t *line, hts_filter_t *filt) { + hts_expr_val_t result = HTS_EXPR_VAL_INIT; + int ret; + if (hts_filter_eval2(filt, line, sam_hrecs_sym_lookup, &result) >= 0) { + ret = result.is_true; + } + else { + hts_log_error("Couldn't process sam_hdr filter expression"); + ret = -1; + } + + hts_expr_val_free(&result); + return ret; +} + int sam_hdr_count_lines(sam_hdr_t *bh, const char *type) { int count; sam_hrec_type_t *first_ty, *itr_ty; @@ -1908,6 +2042,19 @@ const char *sam_hdr_line_name(sam_hdr_t *bh, /* ==== Key:val level methods ==== */ +int sam_hdr_find_tag(const sam_hdr_line_t *line, + const char *key, + kstring_t *ks) { + if (!line || !key) return -2; + + sam_hrec_tag_t *tag = sam_hrecs_find_key(line, key, NULL); + if (!tag || tag->len < 3) return -1; + + ks_clear(ks); + if (kputsn(&tag->str[3], tag->len-3, ks) < 0) return -2; + return 0; +} + int sam_hdr_find_tag_id(sam_hdr_t *bh, const char *type, const char *ID_key, @@ -1940,6 +2087,16 @@ int sam_hdr_find_tag_id(sam_hdr_t *bh, return 0; } +int sam_hdr_remove_tag(sam_hdr_t *bh, + sam_hdr_line_t *line, + const char *key) { + if (!bh || !line || !key) return -1; + + int ret = sam_hrecs_remove_key(bh->hrecs, line, key); + if (ret == 0 && bh->hrecs->dirty) redact_header_text(bh); + return ret; +} + int sam_hdr_find_tag_pos(sam_hdr_t *bh, const char *type, int pos, @@ -2635,7 +2792,7 @@ static int sam_hrecs_update(sam_hrecs_t *hrecs, sam_hrec_type_t *type, ...) { * Returns the tag pointer on success * NULL on failure */ -sam_hrec_tag_t *sam_hrecs_find_key(sam_hrec_type_t *type, +sam_hrec_tag_t *sam_hrecs_find_key(const sam_hrec_type_t *type, const char *key, sam_hrec_tag_t **prev) { sam_hrec_tag_t *tag, *p = NULL; diff --git a/header.h b/header.h index a98d30663..4ba18a8dd 100644 --- a/header.h +++ b/header.h @@ -290,7 +290,7 @@ void sam_hrecs_free(sam_hrecs_t *hrecs); sam_hrec_type_t *sam_hrecs_find_type_id(sam_hrecs_t *hrecs, const char *type, const char *ID_key, const char *ID_value); -sam_hrec_tag_t *sam_hrecs_find_key(sam_hrec_type_t *type, +sam_hrec_tag_t *sam_hrecs_find_key(const sam_hrec_type_t *type, const char *key, sam_hrec_tag_t **prev); diff --git a/htslib/sam.h b/htslib/sam.h index 0da5f047f..1bb2a054f 100644 --- a/htslib/sam.h +++ b/htslib/sam.h @@ -44,6 +44,9 @@ DEALINGS IN THE SOFTWARE. */ extern "C" { #endif +struct hts_filter_t; +struct sam_hrec_type_s; + /// Highest SAM format version supported by this library #define SAM_FORMAT_VERSION "1.6" @@ -466,6 +469,22 @@ int sam_hdr_nref(const sam_hdr_t *h); /* ==== Line level methods ==== */ +/*! @typedef + * @abstract Opaque type used as an iterator over header lines. + */ +typedef struct sam_hrec_type_s sam_hdr_line_t; + +/// Return an iterator pointing to the first header line +HTSLIB_EXPORT +sam_hdr_line_t *sam_hdr_first_line(sam_hdr_t *h); + +/// Return an iterator pointing to the next header line +/*! + * @return An iterator pointing to the next line, or NULL if there is none. + */ +HTSLIB_EXPORT +sam_hdr_line_t *sam_hdr_next_line(sam_hdr_t *h, sam_hdr_line_t *line); + /// Add formatted lines to an existing header. /*! * @param lines Full SAM header record, eg "@SQ\tSN:foo\tLN:100", with @@ -498,6 +517,36 @@ int sam_hdr_add_lines(sam_hdr_t *h, const char *lines, size_t len); HTSLIB_EXPORT int sam_hdr_add_line(sam_hdr_t *h, const char *type, ...); +/// Returns a complete line of formatted text for the line pointed to. +/*! + * @param line Iterator pointing to a header line + * @param ks kstring to which to append the result + * @return 0 on success; + * -1 if @p line does not point to a header line + * -2 on other failures + * + * Puts a complete line of formatted text for a specific line into @p ks. + * Appends the text to the existing content in @p ks, if any. + */ +HTSLIB_EXPORT +int sam_hdr_format_line_append(const sam_hdr_line_t *line, kstring_t *ks); + +/// Returns a complete line of formatted text for the line pointed to. +/*! + * @param line Iterator pointing to a header line + * @param ks kstring to hold the result + * @return 0 on success; + * -1 if @p line does not point to a header line + * -2 on other failures + * + * Puts a complete line of formatted text for a specific line into @p ks. + * Any existing content in @p ks will be overwritten. + */ +static inline int sam_hdr_format_line(const sam_hdr_line_t *line, kstring_t *ks) +{ + return sam_hdr_format_line_append(line, ks_clear(ks)); +} + /// Returns a complete line of formatted text for a given type and ID. /*! * @param type Type of the searched line. Eg. "SQ" @@ -536,6 +585,14 @@ HTSLIB_EXPORT int sam_hdr_find_line_pos(sam_hdr_t *h, const char *type, int pos, kstring_t *ks); +/// Remove line pointed to by iterator from a header +/*! + * @param line Iterator pointing to a header line + * @return An iterator pointing to the following line, or NULL on error FIXME or if it was the last line + */ +HTSLIB_EXPORT +sam_hdr_line_t *sam_hdr_remove_line(sam_hdr_t *h, sam_hdr_line_t *line); + /// Remove a line with given type / id from a header /*! * @param type Type of the searched line. Eg. "SQ" @@ -687,6 +744,21 @@ const char *sam_hdr_line_name(sam_hdr_t *bh, const char *type, int pos); /* ==== Key:val level methods ==== */ +/// Return the value associated with a key for a header line identified by iterator +/*! + * @param line Iterator pointing to a header line + * @param key Key of the searched tag. Eg. "LN" + * @param ks kstring where the value will be written + * @return 0 on success + * -1 if the requested tag does not exist + * -2 on other errors + * + * Looks for a specific key in the SAM header line pointed to by @p line and writes the + * associated value into @p ks. Any pre-existing content in @p ks will be overwritten. + */ +HTSLIB_EXPORT +int sam_hdr_find_tag(const sam_hdr_line_t *line, const char *key, kstring_t *ks); + /// Return the value associated with a key for a header line identified by ID_key:ID_val /*! * @param type Type of the line to which the tag belongs. Eg. "SQ" @@ -724,6 +796,15 @@ int sam_hdr_find_tag_id(sam_hdr_t *h, const char *type, const char *ID_key, cons HTSLIB_EXPORT int sam_hdr_find_tag_pos(sam_hdr_t *h, const char *type, int pos, const char *key, kstring_t *ks); +/// Remove the key from the line pointed to by the iterator. +/*! + * @param line Iterator pointing to a header line + * @param key Key of the targeted tag. Eg. "M5" + * @return 1 if the key was removed; 0 if it was not present; -1 on error + */ +HTSLIB_EXPORT +int sam_hdr_remove_tag(sam_hdr_t *h, sam_hdr_line_t *line, const char *key); + /// Remove the key from the line identified by type, ID_key and ID_value. /*! * @param type Type of the line to which the tag belongs. Eg. "SQ" @@ -779,6 +860,15 @@ hts_pos_t sam_hdr_tid2len(const sam_hdr_t *h, int tid); */ static inline int bam_name2id(sam_hdr_t *h, const char *ref) { return sam_hdr_name2tid(h, ref); } +/// Check whether a header line passes an hts_filter +/** @param h Pointer to the header structure previously read + * @param line Iterator pointing to a header line + * @param filt Pointer to the filter, created from hts_filter_init + * @return 1 if it passes, 0 if not, and <0 on error + */ +HTSLIB_EXPORT +int sam_hdr_passes_filter(sam_hdr_t *h, sam_hdr_line_t *line, struct hts_filter_t *filt); + /// Generate a unique \@PG ID: value /*! * @param name Name of the program. Eg. samtools @@ -1425,9 +1515,6 @@ const char *sam_parse_region(sam_hdr_t *h, const char *s, int *tid, HTSLIB_EXPORT int sam_write1(samFile *fp, const sam_hdr_t *h, const bam1_t *b) HTS_RESULT_USED; -// Forward declaration, see hts_expr.h for full. -struct hts_filter_t; - /// sam_passes_filter - Checks whether a record passes an hts_filter. /** @param h Pointer to the header structure previously read * @param b Pointer to the BAM record to be checked