Source code for protmapper.api

import os
import csv
import pickle
import logging
from requests.exceptions import HTTPError
from protmapper.resources import resource_dir_path
from protmapper import phosphosite_client, uniprot_client


logger = logging.getLogger(__name__)


valid_aas = ('A', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'K', 'L',
             'M', 'N', 'P', 'Q', 'R', 'S', 'T', 'V', 'W', 'Y')


[docs]class InvalidSiteException(Exception): pass
[docs]class MappedSite(object): """Represent details of a site that was mapped. Attributes ---------- up_id : str The UniProt ID of the protein whose site was mapped. error_code : str or None One of several strings indicating an error in retrieving the protein sequence, or None if there was no error. Error codes include 'NO_UNIPROT_ID' if the given gene name could not be converted into a Uniprot ID; 'UNIPROT_HTTP_NOT_FOUND' if the given Uniprot ID resulted in a 404 Not Found error from the Uniprot web service; or 'UNIPROT_HTTP_OTHER' if it was any other type of Uniprot web service error. Any other unexpected errors in getting the sequence are assigned the 'UNIPROT_OTHER' code. If the error code is not None, the `orig_res` and `orig_pos` fields will be set (based on the query arguments) but all other fields will be None. valid : bool True if the original site was valid with respect to the given protein, False otherwise. Further, in case of an error (if error_code is not None), it is set to None. orig_res : str The original amino acid residue that was mapped. orig_pos : str The original amino acid position that was mapped. mapped_id : str The Uniprot ID for the protein containing the mapped site. If `up_id` is the Uniprot ID for the human reference sequence, in most cases this will match; however, exceptions will occur if the site position refers to a site that is unique to a particular isoform. mapped_res : str The mapped amino acid residue. mapped_pos : str The mapped amino acid position. description : str A description of the mapping that was done, comes from a fixed set of codes of types of mapping that were performed. gene_name : str The standard (HGNC) gene name of the protein that was mapped. """ attrs = ('up_id', 'error_code', 'valid', 'orig_res', 'orig_pos', 'mapped_id', 'mapped_res', 'mapped_pos', 'description', 'gene_name') def __init__(self, up_id, valid, orig_res, orig_pos, error_code=None, mapped_id=None, mapped_res=None, mapped_pos=None, description=None, gene_name=None): self.up_id = up_id self.error_code = error_code self.valid = valid self.orig_res = orig_res self.orig_pos = orig_pos self.mapped_id = mapped_id self.mapped_res = mapped_res self.mapped_pos = mapped_pos self.description = description self.gene_name = gene_name def __repr__(self): quote_args = lambda args: tuple([a if a in (None, True, False) else ("'%s'" % a) for a in args]) return ("MappedSite(up_id=%s, error_code=%s, valid=%s, " "orig_res=%s, orig_pos=%s, mapped_id=%s, mapped_res=%s, " "mapped_pos=%s, description=%s, gene_name=%s)" % quote_args([self.up_id, self.error_code, self.valid, self.orig_res, self.orig_pos, self.mapped_id, self.mapped_res, self.mapped_pos, self.description, self.gene_name])) def __eq__(self, other): return hash(self) == hash(other) def __ne__(self, other): return not(self == other) def __hash__(self): return hash(tuple(self.to_list())) def to_json(self): jd = {a: self.__dict__.get(a) for a in self.attrs} return jd def to_list(self): return [self.__getattribute__(a) for a in self.attrs]
[docs] def not_invalid(self): """Return True if the original site is not known to be invalid. Returns ------- bool True if the original site is valid or if there is an error code, which implicitly means that the validity of the original site could not be established. False otherwise. """ return self.valid or self.error_code is not None
[docs] def has_mapping(self): """Return True if the original site was mapped successfully. Returns ------- bool True if a mapping was successfully obtained for the site, False otherwise. """ return (not self.not_invalid()) and \ (self.mapped_pos is not None and self.mapped_res is not None)
[docs]class ProtMapper(object): """ Use curated site information to standardize modification sites in stmts. Parameters ---------- site_map : dict (as returned by :py:func:`load_site_map`) A dict mapping tuples of the form `(gene, orig_res, orig_pos)` to a tuple of the form `(correct_res, correct_pos, comment)`, where `gene` is the string name of the gene (canonicalized to HGNC); `orig_res` and `orig_pos` are the residue and position to be mapped; `correct_res` and `correct_pos` are the corrected residue and position, and `comment` is a string describing the reason for the mapping (species error, isoform error, wrong residue name, etc.). use_cache : Optional[bool] If True, the SITEMAPPER_CACHE_PATH from the config (or environment) is loaded and cached mappings are read and written to the given path. Otherwise, no cache is used. Default: False Examples -------- Fixing site errors on both the modification state of an agent (MAP2K1) and the target of a Phosphorylation statement (MAPK1): >>> map2k1_phos = Agent('MAP2K1', db_refs={'UP':'Q02750'}, mods=[ ... ModCondition('phosphorylation', 'S', '217'), ... ModCondition('phosphorylation', 'S', '221')]) >>> mapk1 = Agent('MAPK1', db_refs={'UP':'P28482'}) >>> stmt = Phosphorylation(map2k1_phos, mapk1, 'T','183') >>> (valid, mapped) = default_mapper.map_sites([stmt]) >>> valid [] >>> mapped # doctest:+IGNORE_UNICODE [ MappedStatement: original_stmt: Phosphorylation(MAP2K1(mods: (phosphorylation, S, 217), (phosphorylation, S, 221)), MAPK1(), T, 183) mapped_mods: (('MAP2K1', 'S', '217'), ('S', '218', 'off by one')) (('MAP2K1', 'S', '221'), ('S', '222', 'off by one')) (('MAPK1', 'T', '183'), ('T', '185', 'off by two; mouse sequence')) mapped_stmt: Phosphorylation(MAP2K1(mods: (phosphorylation, S, 218), (phosphorylation, S, 222)), MAPK1(), T, 185) ] >>> ms = mapped[0] >>> ms.original_stmt Phosphorylation(MAP2K1(mods: (phosphorylation, S, 217), (phosphorylation, S, 221)), MAPK1(), T, 183) >>> ms.mapped_mods # doctest:+IGNORE_UNICODE [(('MAP2K1', 'S', '217'), ('S', '218', 'off by one')), (('MAP2K1', 'S', '221'), ('S', '222', 'off by one')), (('MAPK1', 'T', '183'), ('T', '185', 'off by two; mouse sequence'))] >>> ms.mapped_stmt Phosphorylation(MAP2K1(mods: (phosphorylation, S, 218), (phosphorylation, S, 222)), MAPK1(), T, 185) """ def __init__(self, site_map=None, use_cache=False, cache_path=None): if site_map is None: site_map = load_site_map(default_site_map_path) self.site_map = site_map # Set default cache path if cache_path is None: cache_path = os.path.join(resource_dir_path, 'sm_cache.pkl') self._cache_path = cache_path self.use_cache = use_cache self._cache = {} if self.use_cache: if os.path.exists(self._cache_path): with open(self._cache_path, 'rb') as f: self._cache = pickle.load(f) print("Loaded cache of length %d from %s" % (len(self._cache), self._cache_path)) else: print("No cache found at %s, one will be created." % self._cache_path) def save_cache(self): with open(self._cache_path, 'wb') as f: pickle.dump(self._cache, f) def __del__(self): try: if self.use_cache: self.save_cache() except: pass
[docs] def map_sitelist_to_human_ref(self, site_list, **kwargs): """Return a list of mapped sites for a list of input sites. Parameters ---------- site_list : list of tuple Each tuple in the list consists of the following entries: (prot_id, prot_ns, residue, position). Returns ------- list of :py:class:`protmapper.api.MappedSite` A list of MappedSite objects, one corresponding to each site in the input list. """ mapped_sites = [] for ix, (prot_id, prot_ns, residue, position) in enumerate(site_list): logger.info("Mapping site %d of %d, cache size %d" % (ix + 1, len(site_list), len(self._cache))) try: ms = self.map_to_human_ref(prot_id, prot_ns, residue, position, **kwargs) mapped_sites.append(ms) except Exception as e: logger.error("Error occurred mapping site " "(%s, %s, %s, %s): %s" % (prot_id, prot_ns, residue, position, str(e))) return mapped_sites
[docs] def map_to_human_ref(self, prot_id, prot_ns, residue, position, do_methionine_offset=True, do_orthology_mapping=True, do_isoform_mapping=True): """Check an agent for invalid sites and look for mappings. Look up each modification site on the agent in Uniprot and then the site map. Parameters ---------- prot_id : str A Uniprot ID or HGNC gene symbol for the protein. prot_ns : str One of 'uniprot' or 'hgnc' indicating the type of ID given. residue : str Residue to map on the protein to check for validity and map. position : str Position of the residue to check for validity and map. do_methionine_offset : boolean Whether to check for off-by-one errors in site position (possibly) attributable to site numbering from mature proteins after cleavage of the initial methionine. If True, checks the reference sequence for a known modification at 1 site position greater than the given one; if there exists such a site, creates the mapping. Default is True. do_orthology_mapping : boolean Whether to check sequence positions for known modification sites in mouse or rat sequences (based on PhosphoSitePlus data). If a mouse/rat site is found that is linked to a site in the human reference sequence, a mapping is created. Default is True. do_isoform_mapping : boolean Whether to check sequence positions for known modifications in other human isoforms of the protein (based on PhosphoSitePlus data). If a site is found that is linked to a site in the human reference sequence, a mapping is created. Default is True. Returns ------- MappedSite The MappedSite object gives information on results of mapping the site. See :py:class:`protmapper.api.MappedSite` documentation for details. """ # Check the protein ID and namespace if prot_id is None: raise ValueError("prot_id must not be None.") if prot_ns not in ('uniprot', 'hgnc'): raise ValueError("prot_ns must be either 'uniprot' or 'hgnc' (for " "HGNC symbols)") # Get Uniprot ID and gene name up_id = _get_uniprot_id(prot_id, prot_ns) # If an HGNC ID was given and the uniprot entry is not found, flag # as error if up_id is None: assert prot_ns == 'hgnc' and prot_id is not None return MappedSite(None, None, residue, position, gene_name=prot_id, error_code='NO_UNIPROT_ID') # Make sure the sites are proper amino acids/positions try: valid_res, valid_pos = _validate_site(residue, position) except InvalidSiteException as ex: return MappedSite(up_id, None, residue, position, error_code='INVALID_SITE', description=str(ex)) # Get the gene name from Uniprot gene_name = uniprot_client.get_gene_name(up_id, web_fallback=False) site_key = (up_id, residue, position) # First, check the cache to potentially avoid a costly sequence # lookup cached_site = self._cache.get(site_key) if cached_site is not None: return cached_site # If not cached, continue # Look up the residue/position in uniprot try: site_valid = uniprot_client.verify_location(up_id, residue, position) error_code = None except HTTPError as ex: if ex.response.status_code in {400, 404}: error_code = 'UNIPROT_HTTP_NOT_FOUND' else: error_code = 'UNIPROT_HTTP_OTHER' except Exception as ex: error_code = 'UNIPROT_OTHER' logger.error(ex) if error_code: # Set error_code; valid will set to None, not True/False mapped_site = MappedSite(up_id, None, residue, position, error_code=error_code) return mapped_site # It's a valid site if site_valid: mapped_site = MappedSite(up_id, True, residue, position, description='VALID', gene_name=gene_name) self._cache[site_key] = mapped_site return mapped_site # If it's not a valid site, check the site map first curated_site = self.site_map.get(site_key, None) # Manually mapped in the site map if curated_site is not None: mapped_res, mapped_pos, description = curated_site mapped_site = MappedSite(up_id, False, residue, position, mapped_id=up_id, mapped_res=mapped_res, mapped_pos=mapped_pos, description=description, gene_name=gene_name) self._cache[site_key] = mapped_site return mapped_site # There is no manual mapping, next we try to see if UniProt # reports a signal peptide that could be responsible for the position # being shifted signal_peptide = uniprot_client.get_signal_peptide(up_id, False) # If there is valid signal peptide information from UniProt if signal_peptide and signal_peptide.begin == 1 and \ signal_peptide.end is not None: offset_pos = str(int(position) + signal_peptide.end) # Check to see if the offset position is known to be phosphorylated mapped_site = self.get_psp_mapping( up_id, up_id, gene_name, residue, position, offset_pos, 'SIGNAL_PEPTIDE_REMOVED') if mapped_site: return mapped_site # ...there's no manually curated site or signal peptide, so do mapping # via PhosphoSite if the data is available: human_prot = uniprot_client.is_human(up_id) if phosphosite_client.has_data(): # First, look for other entries in phosphosite for this protein # where this sequence position is legit (i.e., other isoforms) if do_isoform_mapping and up_id and human_prot: mapped_site = self.get_psp_mapping( up_id, up_id, gene_name, residue, position, position, 'INFERRED_ALTERNATIVE_ISOFORM') if mapped_site: return mapped_site # Try looking for rat or mouse sites if do_orthology_mapping and up_id and human_prot: # Get the mouse ID for this protein up_mouse = uniprot_client.get_mouse_id(up_id) # Get mouse sequence mapped_site = self.get_psp_mapping( up_id, up_mouse, gene_name, residue, position, position, 'INFERRED_MOUSE_SITE') if mapped_site: return mapped_site # Try the rat sequence up_rat = uniprot_client.get_rat_id(up_id) mapped_site = self.get_psp_mapping( up_id, up_rat, gene_name, residue, position, position, 'INFERRED_RAT_SITE') if mapped_site: return mapped_site # Check for methionine offset (off by one) if do_methionine_offset and up_id and human_prot: offset_pos = str(int(position) + 1) mapped_site = self.get_psp_mapping( up_id, up_id, gene_name, residue, position, offset_pos, 'INFERRED_METHIONINE_CLEAVAGE') if mapped_site: return mapped_site # If we've gotten here, the entry is 1) not in the site map, and # 2) we either don't have PSP data or no mapping was found using PSP mapped_site = MappedSite(up_id, False, residue, position, description='NO_MAPPING_FOUND', gene_name=gene_name) self._cache[site_key] = mapped_site return mapped_site
[docs] def get_psp_mapping(self, orig_id, query_id, gene_name, res, pos, query_pos, mapping_code): """ Wrapper around Phosphosite queries that performs peptide remapping. The function is called with a uniprot ID, residue, and position combination that is used to query the phosphosite_client for a valid corresponding site on the human reference protein. The `mapping_code` is provided by the caller to indicate the type of mapping being attempted (e.g., human isoform, mouse, rat, methionine). If a valid mapping is obtained, this is the error code that is applied. If a valid mapping is obtained but it is for a human isoform, this indicates that the queried site exists only on a human isoform and not on the human reference protein, and the code `ISOFORM_SPECIFIC_SITE` is used. If the site returned by the phosphosite_client is at a position that does not match the Uniprot reference sequence (which can happen when the queried site and the PhosphositePlus protein sequences both exclude the initial methionine), the site is remapped to the Uniprot reference sequence using the peptide information for the site in PhosphositePlus. In these cases, the mapping code `REMAPPED_FROM_PSP_SEQUENCE` is used. Parameters ---------- orig_id : str Original Uniprot ID of the protein to be mapped. query_id : str Uniprot ID of the protein being queried for sites. This may differ from `orig_id` if the orthologous mouse or rat protein is being checked for sites. gene_name : str Gene name of the protein. res : str Residue of the site to be mapped. pos : str Position of the site to be mapped. query_pos : str Position being queried for a mapping. This differs from `pos` when off-by-one (methionine) errors are being checked. mapping_code : str Mapping code to apply in case of a successful mapping, e.g. `INFERRED_ALTERNATIVE_ISOFORM`, `INFERRED_MOUSE_SITE`, etc. Returns ------- MappedSite or None MappedSite object containing the mapping, or None indicating that no mapping was found. """ pspmapping = phosphosite_client.map_to_human_site(query_id, res, query_pos) # If no mapping, return None if pspmapping is None: return None # If there is a mapping, check to make sure that it is valid wrt to the # reference sequence human_pos = pspmapping.mapped_pos # Check if the site mapped from PSP is valid in the Uniprot sequence # for the ID that we're interested in # PSP sometimes returns a non-UP ID like NP_001184222 which we want # to control for here, we do that by looking up the mnemonic if not uniprot_client.get_mnemonic(pspmapping.mapped_id, web_fallback=False): return MappedSite(orig_id, None, res, pos, error_code='PSP_MAPPED_ID_NOT_UP') # At this point the ID is supposed to be valid UP try: site_valid = uniprot_client.verify_location(pspmapping.mapped_id, pspmapping.mapped_res, pspmapping.mapped_pos) error_code = None except HTTPError as ex: if ex.response.status_code == 404: error_code = 'UNIPROT_HTTP_NOT_FOUND' else: error_code = 'UNIPROT_HTTP_OTHER' except Exception as ex: error_code = 'UNIPROT_OTHER' logger.error(ex) if error_code: # Set error_code; valid will set to None, not True/False mapped_site = MappedSite(orig_id, None, res, pos, error_code=error_code) return mapped_site # If the mapped site is valid, we're done! if site_valid: # If the residue is different, change the code accordingly mapped_site = MappedSite(orig_id, False, res, pos, mapped_id=pspmapping.mapped_id, mapped_res=pspmapping.mapped_res, mapped_pos=human_pos, description=mapping_code, gene_name=gene_name) else: # If mapped site is invalid, attempt to re-map based on the seq updated_pos = ProtMapper.map_peptide(orig_id, pspmapping.motif, pspmapping.respos) # If the re-mapping fails, we give up if updated_pos is None: return None # Otherwise, we update to the mapped position updated_pos_1x = str(updated_pos + 1) mapped_site = MappedSite(orig_id, False, res, pos, mapped_id=pspmapping.mapped_id, mapped_res=pspmapping.mapped_res, mapped_pos=updated_pos_1x, # Switch to 1-indexed description='REMAPPED_FROM_PSP_SEQUENCE', gene_name=gene_name) site_key = (orig_id, res, pos) self._cache[site_key] = mapped_site return mapped_site
@staticmethod def motif_from_position(up_id, pos, window=7): seq = uniprot_client.get_sequence(up_id) return ProtMapper.motif_from_position_seq(seq, pos, window) @staticmethod def motif_from_position_seq(seq, pos, window=7): # Validate that the position is an integer pos_str = str(pos) pos_int = int(pos) end_ix = pos_int + window if pos_int + window < len(seq) else len(seq) start_ix = pos_int - window - 1 if pos_int - window > 0 else 0 motif = seq[start_ix:end_ix] # Get the residue position, which will be the same as the window # size unless the start_ix is something other than the pos - window respos = pos_int - start_ix return (motif, respos) @staticmethod def map_peptide(target_up_id, peptide, pos): seq = uniprot_client.get_sequence(target_up_id) peptide_start = seq.find(peptide) # Peptide not found in sequence, return None if peptide_start == -1: return None target_pos = peptide_start + pos return target_pos
[docs] @staticmethod def map_peptide_to_human_ref(prot_id, prot_ns, peptide, site_pos): """Return a mapped site for a given peptide. Parameters ---------- prot_id : str A Uniprot ID or HGNC gene symbol for the protein. prot_ns : str One of 'uniprot' or 'hgnc' indicating the type of ID given. peptide : str A string of amino acid symbols representing a peptide. site_pos : int A site position within the peptide. Note: site_pos is 1-indexed. Returns ------- MappedSite The MappedSite object gives information on results of mapping the site. See :py:class:`protmapper.api.MappedSite` documentation for details. """ # Get the uniprot ID for the gene # Check the protein ID and namespace if prot_id is None: raise ValueError("prot_id must not be None.") if prot_ns not in ('uniprot', 'hgnc'): raise ValueError("prot_ns must be either 'uniprot' or 'hgnc' (for " "HGNC symbols)") if prot_ns == 'uniprot' and len(prot_id.split('-')) != 1 and \ prot_id.split('-')[1] != '1': raise ValueError("Protein ID passed in appears to be a " "non-reference isoform: %s" % prot_id) # Get Uniprot ID and gene name up_id = _get_uniprot_id(prot_id, prot_ns) # If an HGNC ID was given and the uniprot entry is not found, flag # as error if up_id is None: assert prot_ns == 'hgnc' and prot_id is not None return MappedSite(None, None, None, None, gene_name=prot_id, error_code='NO_UNIPROT_ID') # Get the gene name from Uniprot gene_name = uniprot_client.get_gene_name(up_id) mapped_pos = ProtMapper.map_peptide(up_id, peptide, site_pos) ms = MappedSite(up_id=up_id, valid=None, orig_res=None, orig_pos=None, error_code=None, description=None, gene_name=gene_name) if mapped_pos is None: ms.valid = False else: ms.valid = True ms.mapped_id = up_id ms.mapped_res = peptide[site_pos - 1] ms.mapped_pos = str(mapped_pos) return ms
[docs]def load_site_map(path): """Load the modification site map from a file. The site map file should be a comma-separated file with six columns:: UniprotId: Uniprot ID of protein Gene: Gene name OrigRes: Original (incorrect) residue OrigPos: Original (incorrect) residue position CorrectRes: The correct residue for the modification CorrectPos: The correct residue position Comment: Description of the reason for the error. Parameters ---------- path : string Path to the tab-separated site map file. Returns ------- dict A dict mapping tuples of the form `(uniprot_id, orig_res, orig_pos)` to a tuple of the form `(correct_res, correct_pos, comment)`, where `uniprot_id` is the Uniprot ID of the protein; `orig_res` and `orig_pos` are the residue and position to be mapped; `correct_res` and `correct_pos` are the corrected residue and position, and `comment` is a string describing the reason for the mapping (species error, isoform error, wrong residue name, etc.). """ site_map = {} with open(path, 'r') as fh: maprows = csv.reader(fh) # Skip the header line next(maprows) for row in maprows: # Don't allow empty entries in the key section if not (row[0] and row[2] and row[3]): raise Exception("Entries in the key (gene, residue, position) " "may not be empty.") correct_res = row[4].strip() if row[4] else None correct_pos = row[5].strip() if row[5] else None comment = row[6].strip() if row[6] else None site_map[(row[0].strip(), row[2].strip(), row[3].strip())] = \ (correct_res, correct_pos, comment) return site_map
def _validate_site(residue, position): if residue is None: raise InvalidSiteException('residue cannot be None') if position is None: raise InvalidSiteException('position cannot be None') # Check that the residue is a valid amino acid if residue not in valid_aas: raise InvalidSiteException('Residue %s not a valid amino acid' % residue) # Next make sure that the position is a valid position try: pos_int = int(position) pos_str = str(pos_int) # Catch the ValueError and re-raise as an InvalidSiteException except ValueError: raise InvalidSiteException('Position %s not a valid sequence position.' % position) # Site appears valid, make a Site object return (residue, pos_str) def _get_uniprot_id(prot_id, prot_ns): """Get the Uniprot ID for an agent, looking up in HGNC if necessary. If the Uniprot ID is a list then return the first ID by default. """ # If the ID is a Uniprot ID, then we're done if prot_ns == 'uniprot': up_id = prot_id # Otherwise, we get the Uniprot ID from the HGNC name elif prot_ns == 'hgnc': # Get the HGNC ID hgnc_id = hgnc_name_to_id.get(prot_id) if not hgnc_id: return None # Try to get UniProt ID from HGNC up_id = hgnc_id_to_up.get(hgnc_id) # If the UniProt ID is a list then choose the first one. parts = up_id.split(', ') if len(parts) > 1: up_id = parts[0] return up_id default_site_map_path = os.path.join(os.path.dirname(__file__), 'curated_site_map.csv') default_site_map = load_site_map(default_site_map_path) default_mapper = ProtMapper(default_site_map) """A default instance of :py:class:`ProtMapper` that contains the site information found in resources/curated_site_map.csv'.""" hgnc_name_to_id, hgnc_id_to_up, up_to_hgnc_id, _, _ = \ uniprot_client._build_hgnc_mappings()