Source code for protmapper.phosphosite_client

import csv
import gzip
import logging
from os.path import dirname, abspath, join
from collections import namedtuple, defaultdict
from protmapper.resources import resource_manager


logger = logging.getLogger(__name__)


PhosphoSite = namedtuple('PhosphoSite',
                         ['GENE', 'PROTEIN', 'ACC_ID', 'HU_CHR_LOC', 'MOD_RSD',
                          'SITE_GRP_ID', 'ORGANISM', 'MW_kD', 'DOMAIN',
                          'SITE_7_AA', 'LT_LIT', 'MS_LIT', 'MS_CST', 'CST_CAT'])

PspMapping = namedtuple('PspMapping',
                        ['mapped_id', 'mapped_res', 'mapped_pos', 'motif',
                         'respos'])

_data_by_up = None
_data_by_site_grp = None
_has_data = None



# An explicit mapping of Uniprot IDs representing reference isoforms of their
# respective proteins in cases where there is ambiguity (i.e., other isoforms
# have IDs without a hyphen).
_iso_to_ref_map = {
    'J3KPC8': 'Q9Y2K2', # SIK3/QSK
    'Q5W9H1': 'Q14155', # ARHGEF7
    'A4QN18': 'O15027', # SEC16A
    'J3KNL6': 'O15027', # SEC16A
    'NP_001184222': 'Q16555', # DPYSL2/CRMP-2
}


# An easily indexed list of the reference isoforms mapped to, above
_ref_isoforms = set(_iso_to_ref_map.values())



[docs]def has_data(): """Check if the PhosphoSite data is available and can be loaded. Returns ------- bool True if the data can be loaded, False otherwise. """ global _has_data if _has_data is None: try: _get_phospho_site_dataset() # If we succeeded without exception, then we set _has_data to True _has_data = True except Exception as e: logger.info("Could not load PhosphoSite data from file %s" % resource_manager.get_resource_file('psp')) logger.info("Source Exception: %s" % e) _has_data = False return _has_data
def _get_phospho_site_dataset(): """Read phosphosite data into dicts keyed by Uniprot ID and by site group. Returns ------- tuple The first element of the tuple contains the PhosphoSite data keyed by Uniprot ID, the second element contains data keyed by site group. Both dicts have instances of the PhosphoSite namedtuple as values. If the PhosphoSite data file cannot be loaded, returns (None, None). """ global _data_by_up global _data_by_site_grp phosphosite_data_file = resource_manager.get_create_resource_file('psp') if _data_by_up is None or _data_by_site_grp is None: with gzip.open(phosphosite_data_file, 'rt', encoding='utf-8') as fh: # Get the csv reader generator reader = csv.reader(fh, delimiter='\t') # Skip 4 rows for _ in range(4): next(reader) # Build up a dict by protein data_by_up = defaultdict(lambda: defaultdict(list)) data_by_site_grp = defaultdict(list) for row in reader: site = PhosphoSite(*row) res_pos = site.MOD_RSD.split('-')[0] #res_pos = res_pos[1:] # DANGEROUS: lookup based on pos alone base_acc_id = site.ACC_ID.split('-')[0] data_by_up[site.ACC_ID][res_pos].append(site) # If the ID was isoform specific, add to the dict for the whole # protein if base_acc_id != site.ACC_ID: data_by_up[base_acc_id][res_pos].append(site) # Catch the handful of isoforms that have a Uniprot ID without # the hyphen elif site.ACC_ID in _iso_to_ref_map: ref_id = _iso_to_ref_map[site.ACC_ID] data_by_up[ref_id][res_pos].append(site) # To catch additional cases, include an entry for the -1 base ID else: data_by_up['%s-1' % base_acc_id] = data_by_up[base_acc_id] data_by_site_grp[site.SITE_GRP_ID].append(site) _data_by_up = data_by_up _data_by_site_grp = data_by_site_grp return (_data_by_up, _data_by_site_grp)
[docs]def map_to_human_site(up_id, mod_res, mod_pos): """Find site on human ref seq corresponding to (possibly non-human) site. Parameters ---------- up_id : str Uniprot ID of the modified protein (generally human, rat, or mouse). mod_res : str Modified amino acid residue. mod_pos : str Amino acid sequence position. Returns ------- str Returns amino acid position on the human reference sequence corresponding to the site on the given protein. """ (data_by_up, data_by_site_grp) = _get_phospho_site_dataset() sites_for_up = data_by_up.get(up_id) # No info in Phosphosite for this Uniprot ID if not sites_for_up: return None site_info_list = sites_for_up.get('%s%s' % (mod_res, mod_pos)) # DANGER: lookup based on pos alone # site_info_list = sites_for_up.get('%s' % mod_pos) # If this site doesn't exist for this protein, will return an empty list if not site_info_list: return None # At this point site_info_list contains a list of PhosphoSite objects # for the given protein with an entry for the given site, however, they # may be different isoforms; they may also not contain the reference # isoform; or they may only contain a single isoform. # If it's a single isoform, take it! if len(site_info_list) == 1: site_grp = site_info_list[0].SITE_GRP_ID # If there is more than one entry for this site, take the one from the # reference sequence, if it's in there (for example, a site that is present # in a mouse isoform as well as the mouse reference sequence) elif len(site_info_list) > 0: logger.debug('More than one entry in PhosphoSite for %s, site %s%s' % (up_id, mod_res, mod_pos)) ref_site_info = None si_by_grp = defaultdict(list) for si in site_info_list: logger.debug('\tSite info: acc_id %s, site_grp_id %s' % (si.ACC_ID, si.SITE_GRP_ID)) si_by_grp[si.SITE_GRP_ID].append(si) if si.ACC_ID == up_id: ref_site_info = si # If the reference sequence is not there but we have more than one # site_info in the list, this means there is more than one isoform # that has a phosphorylated site at the given position. The main thing # is whether there is more than one site group. if ref_site_info is None: # If there's only one unique site group, take it if len(si_by_grp) == 1: site_grp = site_info_list[0].SITE_GRP_ID # Otherwise, take the first else: logger.warning('More than one non-reference site group found ' 'for (%s, %s, %s), not mapping' % (up_id, mod_res, mod_pos)) return None else: site_grp = ref_site_info.SITE_GRP_ID # Look up site group site_grp_list = data_by_site_grp[site_grp] # If an empty list, then return None (is unlikely to happen) if not site_grp_list: return None # Check for a human protein in the list human_sites = [s for s in site_grp_list if s.ORGANISM == 'human'] if not human_sites: logger.debug("No corresponding human site for %s, choosing first: %s" % (site_grp, site_grp_list[0].ORGANISM)) #ref_site = site_grp_list[0] return None # If there are multiple isoforms, choose the base one # (no hyphen in the accession ID) elif len(human_sites) > 1: # In general we assume that multiple human sites only arise from # multiple isoforms of the same protein, which will share an accession # ID, and that only one of these will be the reference sequence (with # no hyphen). However, in a small number of cases the isoform-specific # identifier will have a Uniprot ID without the hyphen, e.g. J3KPC8 # for QSK iso5 (ref protein with ID Q9Y2K2). base_id_sites = [site for site in human_sites if site.ACC_ID.find('-') == -1 and site.ACC_ID not in _iso_to_ref_map] if base_id_sites: if len(base_id_sites) != 1: logger.warning("There is more than one apparent ref seq, " "choosing first: %s" % base_id_sites) ref_site = base_id_sites[0] # There is no base ID site, i.e., all mapped sites are for specific # isoforms only, so skip it else: logger.info('Human isoform matches, but no human ref seq match ' 'for %s %s %s; choosing first' % (up_id, mod_res, mod_pos)) ref_site = human_sites[0] # If there is only one human site, take it else: ref_site = human_sites[0] mapped_id = ref_site.ACC_ID human_site_str = ref_site.MOD_RSD.split('-')[0] human_res = human_site_str[0] human_pos = human_site_str[1:] motif, respos = _normalize_site_motif(ref_site.SITE_7_AA) pspmapping = PspMapping(mapped_id=mapped_id, mapped_res=human_res, mapped_pos=human_pos, motif=motif, respos=respos) return pspmapping
[docs]def sites_only(exclude_isoforms=False): """Return PhosphositePlus data as a flat list of proteins and sites. Parameters ---------- exclude_isoforms : bool Whether to exclude sites for protein isoforms. Default is False (includes isoforms). Returns ------- list of tuples Each tuple consists of (uniprot_id, residue, position). """ sites = [] (data_by_up, data_by_site_grp) = _get_phospho_site_dataset() for up_id, respos_dict in data_by_up.items(): if exclude_isoforms and '-' in up_id: continue for respos in respos_dict.keys(): res = respos[0] pos = respos[1:] sites.append((up_id, res, pos)) return sites
def _normalize_site_motif(motif): """Normalize the PSP site motif to all caps with no underscores and return the preprocessed motif sequence and the position of the target residue in the motif (zero-indexed).""" no_underscores = motif.replace('_', '') offset = motif.find(no_underscores) respos = 7 - offset return (no_underscores.upper(), respos)