Source code for snplib.finalreport._finalreport

#!/usr/bin/env python
# coding: utf-8
__author__ = "Igor Loschinin (igor.loschinin@gmail.com)"
__all__ = ("FinalReport",)

import re
from functools import reduce
from pathlib import Path

import pandas as pd
from numpy import nan


[docs] class FinalReport(object): """ File that contains SNP information. File processing is triggered by the handle method. If values in 'SID' or 'UNIQ_KEY' were missing in the xlsx conversion file, the processed data will contain NAN values. :param allele: A variant form of a single nucleotide polymorphism (SNP), a specific polymorphic site or a whole gene detectable at a locus. Type: 'AB', 'Forward', 'Top', 'Plus', 'Design'. :param sep: Delimiter to use. Default value: "\\t". :param usecols: Selection of fields for reading. Accelerates processing and reduces memory. :param dtype: Data type(s) to apply to either the whole dataset or individual columns. E.g., {'a': np.float64, 'b': np.int32, 'c': 'Int64'}. Example: [Header] GSGT Version 2.0.4 Processing Date 10/14/2021 4:02 PM Content BovineSNP50_v3_A1.bpm Num SNPs 53218 Total SNPs 53218 Num Samples 3 Total Samples 3 [Data] SNP Name Sample ID Allele1 - AB Allele2 - AB GC Score GT Score ABCA12 1 A A 0.4048 0.8164 APAF1 1 B B 0.9067 0.9155 ... """ __PATTERN_HEADER = re.compile(r'(^\[Header])') __PATTERN_DATA = re.compile(r'(^\[Data])') __slots__ = ( "_delimiter", "__allele", "__usecols", "__dtype", "__snp_data", "__header", "_map_rn", ) def __init__( self, allele: str | list | None = None, usecols: list[str] | None = None, dtype: dict | None = None, sep: str = "\t" ) -> None: self._delimiter = sep self.__allele = allele self.__usecols = usecols self.__dtype = dtype # self._full_data = None self.__snp_data: pd.DataFrame | None = None self.__header = {} self._map_rn = None @property def header(self) -> dict: return self.__header @property def snp_data(self) -> pd.DataFrame | None: return self.__snp_data
[docs] def handle( self, file_rep: Path | str, conv_file: Path | str = None ) -> bool: """ Processes the FinalReport.txt file. Highlights meta information and data. :param file_rep: The file FinalReport.txt or another name. :param conv_file: The file that contains IDs of registration numbers of animals. :return: Returns true if file processing was successful, false if there were errors. """ try: if self.__allele is not None and self.__usecols is not None: raise Exception("Error. Usecols is used for allele is none.") if isinstance(file_rep, str): file_rep = Path(file_rep) if not file_rep.is_file() and not file_rep.exists(): return False # Processing conversion file if conv_file is not None: if isinstance(conv_file, str): conv_file = Path(conv_file) if not conv_file.is_file() and not conv_file.exists(): return False self.__convert_s_id(conv_file) # # Processing report file self.__handler_header(file_rep) self.__handler_data(file_rep) if not self.__snp_data.empty and self._map_rn is not None: self.__snp_data['Sample ID'] = \ self.__snp_data['Sample ID'].map( dict(zip(self._map_rn.SID, self._map_rn.UNIQ_KEY)) ) except Exception as e: raise e return True
def __handler_header(self, file_rep: Path) -> None: """ Processes data from a file, selects meta-information. :param file_rep: path, pointer to the file to be read. """ with open(file_rep, 'r') as file: for line in file: if self.__class__.__PATTERN_DATA.findall(line.strip()): return if self.__class__.__PATTERN_HEADER.findall(line.strip()) or\ len(line.strip()) == 0: continue key = line.strip().split("\t")[0] value = line.strip().split("\t")[1] self.__header[key] = value def __handler_data(self, file_rep: Path) -> None: """ Processes data and forms an array for further processing. :param file_rep: path, pointer to the file to be read. """ with open(file_rep, 'r') as file: # Search for the data start index and skip for line in file: if self.__class__.__PATTERN_DATA.findall(line.strip()): break # line column orig_name_col = file.readline().strip().split(self._delimiter) if self.__allele is None and self.__usecols is None: self.__snp_data = pd.read_csv( file, sep=self._delimiter, header=None, names=orig_name_col, dtype=self.__dtype, low_memory=True, na_filter=True ) return sub_n_col = self.__processing_columns(orig_name_col) self.__snp_data = pd.read_csv( file, sep=self._delimiter, header=None, names=orig_name_col, usecols=sub_n_col, dtype=self.__dtype, low_memory=True, na_filter=True ) return def __processing_columns(self, lst_col: list[str]) -> list[str] | None: """ Processing the line with all the names of the fields and the sample of them. :param lst_col: List of all fields. :return: Returns a tuple with a list of names of selected fields. """ if self.__usecols is not None: check_n_col = [ item for item in self.__usecols if item in lst_col ] # Check on empty list if check_n_col: return self.__usecols raise Exception( f"Error. The USECOLS list contains not true fields." ) # processing alleles sample_n_col = self.__sample_by_allele(lst_col) if sample_n_col is None: raise Exception( f"Error. Allele {self.__allele} not in data." ) return sample_n_col def __sample_by_allele(self, names: list[str]) -> list[str] | None: """ Method that generates a list of field names choosing which alleles to keep :param names: List of field names in the report file. :return: Returns a filtered list of fields by alleles. """ allele_templ = r'(^Allele\d\s[:-]\s{}\b)' match self.__allele: case None: return names case str(): allele_pattern = re.compile( allele_templ.format(self.__allele) ) case list() | tuple() | set(): allele_pattern = re.compile( allele_templ.format("|".join(self.__allele)) ) case _: return None lst_allele = reduce( lambda i, j: i + j, [allele_pattern.findall(item) for item in names] ) if len(lst_allele) == 0: return None exclude_alleles = [ item for item in names if item.startswith("Allele") and item not in lst_allele ] return list(filter( lambda x: True if x not in exclude_alleles else False, names )) def __convert_s_id(self, path_file: Path) -> None: """Converts sample id which is in FinalReport to animal registration number. :param path_file: xlsx file with animal numbers label """ self._map_rn = pd.read_excel( path_file, header=None, names=['SID', 'UNIQ_KEY', 'SEX'], index_col=False ) if self._map_rn.empty: self._map_rn = None return if self._map_rn.SID.dtypes == "O": self._map_rn.SID = self._map_rn.SID.str.strip() self._map_rn.UNIQ_KEY = self._map_rn.UNIQ_KEY.str.strip() if self._check_on_ru_symbols(self._map_rn.UNIQ_KEY): raise Exception("Error. Unique keys contain Cyrillic alphabet.")
[docs] @staticmethod def _check_on_ru_symbols(seq: pd.Series) -> bool | None: """ Checial verification of the Cyrillic :param seq: Squeezed for verification. :return: Truth if there are no symbols of Cyril and there is a lie if there is. """ return seq.apply( lambda x: bool(re.search('[а-яА-Я]', x)) if x is not nan else x ).any()