Module cstag.lengthen
Expand source code
from __future__ import annotations
import re
from cstag.utils.validator import validate_cs_tag, validate_short_format
def lengthen(cs_tag: str, cigar: str, seq: str, prefix: bool = False) -> str:
    """Convert short format of cs tag into long format

    Every match run ``:<n>`` of the short form is replaced by ``=`` followed
    by the ``n`` bases it covers in ``seq``. All other operations
    (``*``, ``+``, ``-``, ``~``) are copied through unchanged.

    Args:
        cs_tag (str): cs tag in **short** form
        cigar (str): CIGAR string (6th column in SAM file)
        seq (str): segment sequence (10th column in SAM file)
        prefix (bool, optional): Whether to add the prefix 'cs:Z:' to the cs tag. Defaults to False

    Return:
        str: cs tag in **long** form

    Raises:
        Whatever ``validate_cs_tag`` / ``validate_short_format`` raise for an
        input they reject (both are defined in ``cstag.utils.validator``).

    Example:
        >>> import cstag
        >>> cs = ":4*ag:3"
        >>> cigar = "8M"
        >>> seq = "ACGTACGT"
        >>> cstag.lengthen(cs, cigar, seq)
        '=ACGT*ag=CGT'
    """
    validate_cs_tag(cs_tag)
    validate_short_format(cs_tag)

    # Split on the operator characters while keeping them, then glue each
    # operator back onto its payload: ":4*ag:3" -> [":4", "*ag", ":3"].
    # The element before the first operator is dropped by the [1:] slice.
    pieces = re.split(r"([-+*~:])", cs_tag.replace("cs:Z:", ""))[1:]
    operations = [op + payload for op, payload in zip(pieces[0::2], pieces[1::2])]

    # Start reading seq after the leading soft clip. A hard clip may precede
    # the soft clip (e.g. "5H3S10M"); per the SAM specification hard-clipped
    # bases are not present in SEQ, so only the S length is an offset.
    leading_clip = re.match(r"(?:[0-9]+H)?([0-9]+)S", cigar)
    pos = int(leading_clip.group(1)) if leading_clip else 0

    parts = []
    for op in operations:
        kind = op[0]
        if kind == ":":
            # Match run: copy the covered bases out of the query sequence.
            end = pos + int(op[1:])
            parts.append("=" + seq[pos:end])
            pos = end
            continue
        parts.append(op)
        if kind == "*":
            # A substitution advances the position in seq by one base.
            pos += 1
        elif kind == "+":
            # An insertion advances by its payload length (minus the "+").
            pos += len(op) - 1
        # "-" and "~" leave the position in seq unchanged.

    cs_long = "".join(parts)
    return f"cs:Z:{cs_long}" if prefix else cs_long
Functions
def lengthen(cs_tag: str, cigar: str, seq: str, prefix: bool = False) ‑> str
-
Convert short format of cs tag into long format
Args
cs_tag
:str
- cs tag in short form
cigar
:str
- CIGAR string (6th column in SAM file)
seq
:str
- segment sequence (10th column in SAM file)
prefix
:bool, optional
- Whether to add the prefix 'cs:Z:' to the cs tag. Defaults to False
Return
str: cs tag in long form
Example
>>> import cstag
>>> cs = ":4*ag:3"
>>> cigar = "8M"
>>> seq = "ACGTACGT"
>>> cstag.lengthen(cs, cigar, seq)
'=ACGT*ag=CGT'
Expand source code
def lengthen(cs_tag: str, cigar: str, seq: str, prefix: bool = False) -> str:
    """Convert short format of cs tag into long format

    Every match run ``:<n>`` of the short form is replaced by ``=`` followed
    by the ``n`` bases it covers in ``seq``. All other operations
    (``*``, ``+``, ``-``, ``~``) are copied through unchanged.

    Args:
        cs_tag (str): cs tag in **short** form
        cigar (str): CIGAR string (6th column in SAM file)
        seq (str): segment sequence (10th column in SAM file)
        prefix (bool, optional): Whether to add the prefix 'cs:Z:' to the cs tag. Defaults to False

    Return:
        str: cs tag in **long** form

    Raises:
        Whatever ``validate_cs_tag`` / ``validate_short_format`` raise for an
        input they reject (both are defined in ``cstag.utils.validator``).

    Example:
        >>> import cstag
        >>> cs = ":4*ag:3"
        >>> cigar = "8M"
        >>> seq = "ACGTACGT"
        >>> cstag.lengthen(cs, cigar, seq)
        '=ACGT*ag=CGT'
    """
    validate_cs_tag(cs_tag)
    validate_short_format(cs_tag)

    # Split on the operator characters while keeping them, then glue each
    # operator back onto its payload: ":4*ag:3" -> [":4", "*ag", ":3"].
    # The element before the first operator is dropped by the [1:] slice.
    pieces = re.split(r"([-+*~:])", cs_tag.replace("cs:Z:", ""))[1:]
    operations = [op + payload for op, payload in zip(pieces[0::2], pieces[1::2])]

    # Start reading seq after the leading soft clip. A hard clip may precede
    # the soft clip (e.g. "5H3S10M"); per the SAM specification hard-clipped
    # bases are not present in SEQ, so only the S length is an offset.
    leading_clip = re.match(r"(?:[0-9]+H)?([0-9]+)S", cigar)
    pos = int(leading_clip.group(1)) if leading_clip else 0

    parts = []
    for op in operations:
        kind = op[0]
        if kind == ":":
            # Match run: copy the covered bases out of the query sequence.
            end = pos + int(op[1:])
            parts.append("=" + seq[pos:end])
            pos = end
            continue
        parts.append(op)
        if kind == "*":
            # A substitution advances the position in seq by one base.
            pos += 1
        elif kind == "+":
            # An insertion advances by its payload length (minus the "+").
            pos += len(op) - 1
        # "-" and "~" leave the position in seq unchanged.

    cs_long = "".join(parts)
    return f"cs:Z:{cs_long}" if prefix else cs_long