Module cstag.mask
Expand source code
from __future__ import annotations
import re
from cstag.utils.validator import validate_cs_tag, validate_long_format, validate_threshold
def mask(cs_tag: str, cigar: str, qual: str, threshold: int = 10, prefix: bool = False) -> str:
    """Mask low-quality bases to 'N'.

    Bases in match (``=``) operations are replaced by ``N``; inserted bases
    (``+``) and the query base of a substitution (``*``) are replaced by ``n``.
    Deletions (``-``) and splices (``~``) are copied through unchanged and do
    not consume any quality value.

    Args:
        cs_tag (str): cs tag in the **long** format
        cigar (str): cigar strings (6th column in SAM file)
        qual (str): ASCII of Phred-scaled base quality+33 (11th column in SAM file)
        threshold (int, optional): Phred Quality Score (default = 10). The low-quality
            bases are defined as 'less than or equal to the threshold'
        prefix (bool, optional): Whether to add the prefix 'cs:Z:' to the cs tag.
            Defaults to False

    Return:
        str: Masked cs tag

    Raises:
        IndexError: If ``qual`` (after removing leading soft-clipped bases) is
            shorter than the number of query bases described by ``cs_tag``.
        Any exception raised by ``validate_cs_tag``, ``validate_long_format`` or
        ``validate_threshold`` propagates unchanged.

    Example:
        >>> import cstag
        >>> cs_tag = "=ACGT*ac+gg-cc=T"
        >>> cigar = "5M2I2D1M"
        >>> qual = "AA!!!!AA"
        >>> cstag.mask(cs_tag, cigar, qual)
        '=ACNN*an+ng-cc=T'
    """
    validate_cs_tag(cs_tag)
    validate_long_format(cs_tag)
    validate_threshold(threshold)

    # Phred+33 characters whose score is less than or equal to the threshold.
    mask_symbols = {chr(score + 33) for score in range(threshold + 1)}

    body = cs_tag.replace("cs:Z:", "")
    # Splitting on a captured operator yields ["", op, payload, op, payload, ...];
    # drop the leading empty string and glue each operator back onto its payload.
    tokens = re.split(r"([-+*~=])", body)[1:]
    operations = [op + payload for op, payload in zip(tokens[0::2], tokens[1::2])]

    # The cs tag does not describe soft-clipped bases, so the qualities of a
    # leading soft clip must be dropped to keep `qual` aligned with the tag.
    # A hard clip may precede the soft clip (e.g. "2H3S5M"); hard-clipped bases
    # are not stored in the record, so only the soft-clip length is removed.
    leading_clip = re.match(r"(?:\d+H)?(\d+)S", cigar)
    if leading_clip:
        qual = qual[int(leading_clip.group(1)):]

    masked = []
    idx = 0  # position in `qual` of the next query base to inspect
    for operation in operations:
        op, payload = operation[0], operation[1:]
        if op == "*":
            # "*<ref><query>": a substitution consumes exactly one query base.
            if qual[idx] in mask_symbols:
                payload = payload[:-1] + "n"
            idx += 1
        elif op in ("=", "+"):
            replacement = "N" if op == "=" else "n"
            # Index explicitly (rather than zip) so a too-short `qual` raises
            # IndexError instead of silently truncating the cs tag.
            payload = "".join(
                replacement if qual[idx + offset] in mask_symbols else base
                for offset, base in enumerate(payload)
            )
            idx += len(payload)
        # "-" (deletion) and "~" (splice) carry no query bases: copy as-is.
        masked.append(op + payload)

    cs_masked = "".join(masked)
    return f"cs:Z:{cs_masked}" if prefix else cs_masked
Functions
def mask(cs_tag: str, cigar: str, qual: str, threshold: int = 10, prefix: bool = False) ‑> str
-
Mask low-quality bases to 'N'
Args
cs_tag
:str
- cs tag in the long format
cigar
:str
- cigar strings (6th column in SAM file)
qual
:str
- ASCII of Phred-scaled base quality+33 (11th column in SAM file)
threshold
:int
, optional- Phred Quality Score (default = 10). The low-quality bases are defined as 'less than or equal to the threshold'
prefix
:bool
, optional- Whether to add the prefix 'cs:Z:' to the cs tag. Defaults to False
Return
str: Masked cs tag
Example
>>> import cstag >>> cs_tag = "=ACGT*ac+gg-cc=T" >>> cigar = "5M2I2D1M" >>> qual = "AA!!!!AA" >>> cstag.mask(cs_tag, cigar, qual) '=ACNN*an+ng-cc=T'
Expand source code
def mask(cs_tag: str, cigar: str, qual: str, threshold: int = 10, prefix: bool = False) -> str:
    """Mask low-quality bases to 'N'.

    Bases in match (``=``) operations are replaced by ``N``; inserted bases
    (``+``) and the query base of a substitution (``*``) are replaced by ``n``.
    Deletions (``-``) and splices (``~``) are copied through unchanged and do
    not consume any quality value.

    Args:
        cs_tag (str): cs tag in the **long** format
        cigar (str): cigar strings (6th column in SAM file)
        qual (str): ASCII of Phred-scaled base quality+33 (11th column in SAM file)
        threshold (int, optional): Phred Quality Score (default = 10). The low-quality
            bases are defined as 'less than or equal to the threshold'
        prefix (bool, optional): Whether to add the prefix 'cs:Z:' to the cs tag.
            Defaults to False

    Return:
        str: Masked cs tag

    Raises:
        IndexError: If ``qual`` (after removing leading soft-clipped bases) is
            shorter than the number of query bases described by ``cs_tag``.
        Any exception raised by ``validate_cs_tag``, ``validate_long_format`` or
        ``validate_threshold`` propagates unchanged.

    Example:
        >>> import cstag
        >>> cs_tag = "=ACGT*ac+gg-cc=T"
        >>> cigar = "5M2I2D1M"
        >>> qual = "AA!!!!AA"
        >>> cstag.mask(cs_tag, cigar, qual)
        '=ACNN*an+ng-cc=T'
    """
    validate_cs_tag(cs_tag)
    validate_long_format(cs_tag)
    validate_threshold(threshold)

    # Phred+33 characters whose score is less than or equal to the threshold.
    mask_symbols = {chr(score + 33) for score in range(threshold + 1)}

    body = cs_tag.replace("cs:Z:", "")
    # Splitting on a captured operator yields ["", op, payload, op, payload, ...];
    # drop the leading empty string and glue each operator back onto its payload.
    tokens = re.split(r"([-+*~=])", body)[1:]
    operations = [op + payload for op, payload in zip(tokens[0::2], tokens[1::2])]

    # The cs tag does not describe soft-clipped bases, so the qualities of a
    # leading soft clip must be dropped to keep `qual` aligned with the tag.
    # A hard clip may precede the soft clip (e.g. "2H3S5M"); hard-clipped bases
    # are not stored in the record, so only the soft-clip length is removed.
    leading_clip = re.match(r"(?:\d+H)?(\d+)S", cigar)
    if leading_clip:
        qual = qual[int(leading_clip.group(1)):]

    masked = []
    idx = 0  # position in `qual` of the next query base to inspect
    for operation in operations:
        op, payload = operation[0], operation[1:]
        if op == "*":
            # "*<ref><query>": a substitution consumes exactly one query base.
            if qual[idx] in mask_symbols:
                payload = payload[:-1] + "n"
            idx += 1
        elif op in ("=", "+"):
            replacement = "N" if op == "=" else "n"
            # Index explicitly (rather than zip) so a too-short `qual` raises
            # IndexError instead of silently truncating the cs tag.
            payload = "".join(
                replacement if qual[idx + offset] in mask_symbols else base
                for offset, base in enumerate(payload)
            )
            idx += len(payload)
        # "-" (deletion) and "~" (splice) carry no query bases: copy as-is.
        masked.append(op + payload)

    cs_masked = "".join(masked)
    return f"cs:Z:{cs_masked}" if prefix else cs_masked