Source code for protmapper.phosphosite_client

import csv
import gzip
import logging
from os.path import dirname, abspath, join
from collections import namedtuple, defaultdict
from protmapper.resources import resource_manager


logger = logging.getLogger(__name__)


PhosphoSite = namedtuple('PhosphoSite',
                         ['GENE', 'PROTEIN', 'ACC_ID', 'HU_CHR_LOC', 'MOD_RSD',
                          'SITE_GRP_ID', 'ORGANISM', 'MW_kD', 'DOMAIN',
                          'SITE_7_AA', 'LT_LIT', 'MS_LIT', 'MS_CST', 'CST_CAT'])

PspMapping = namedtuple('PspMapping',
                        ['mapped_id', 'mapped_res', 'mapped_pos', 'motif',
                         'respos'])

_data_by_up = None
_data_by_site_grp = None
_has_data = None



# An explicit mapping of Uniprot IDs representing reference isoforms of their
# respective proteins in cases where there is ambiguity (i.e., other isoforms
# have IDs without a hyphen).
_iso_to_ref_map = {
    'J3KPC8': 'Q9Y2K2', # SIK3/QSK
    'Q5W9H1': 'Q14155', # ARHGEF7
    'A4QN18': 'O15027', # SEC16A
    'J3KNL6': 'O15027', # SEC16A
    'NP_001184222': 'Q16555', # DPYSL2/CRMP-2
}


# An easily indexed list of the reference isoforms mapped to, above
_ref_isoforms = set(_iso_to_ref_map.values())



[docs]def has_data():
    """Check if the PhosphoSite data is available and can be loaded.

    Returns
    -------
    bool
        True if the data can be loaded, False otherwise.
    """
    global _has_data
    if _has_data is None:
        try:
            _get_phospho_site_dataset()
            # If we succeeded without exception, then we set _has_data to True
            _has_data = True
        except Exception as e:
            logger.info("Could not load PhosphoSite data from file %s" %
                         resource_manager.get_resource_file('psp'))
            logger.info("Source Exception: %s" % e)
            _has_data = False
    return _has_data


def _get_phospho_site_dataset():
    """Read phosphosite data into dicts keyed by Uniprot ID and by site group.

    Returns
    -------
    tuple
        The first element of the tuple contains the PhosphoSite data keyed
        by Uniprot ID, the second element contains data keyed by site group.
        Both dicts have instances of the PhosphoSite namedtuple as values.
        If the PhosphoSite data file cannot be loaded, returns (None, None).
    """
    global _data_by_up
    global _data_by_site_grp
    phosphosite_data_file = resource_manager.get_create_resource_file('psp')
    if _data_by_up is None or _data_by_site_grp is None:
        with gzip.open(phosphosite_data_file, 'rt', encoding='utf-8') as fh:
            # Get the csv reader generator
            reader = csv.reader(fh, delimiter='\t')
            # Skip 4 rows
            for _ in range(4):
                next(reader)
            # Build up a dict by protein
            data_by_up = defaultdict(lambda: defaultdict(list))
            data_by_site_grp = defaultdict(list)
            for row in reader:
                site = PhosphoSite(*row)
                res_pos = site.MOD_RSD.split('-')[0]
                #res_pos = res_pos[1:] # DANGEROUS: lookup based on pos alone
                base_acc_id = site.ACC_ID.split('-')[0]
                data_by_up[site.ACC_ID][res_pos].append(site)
                # If the ID was isoform specific, add to the dict for the whole
                # protein
                if base_acc_id != site.ACC_ID:
                    data_by_up[base_acc_id][res_pos].append(site)
                # Catch the handful of isoforms that have a Uniprot ID without
                # the hyphen
                elif site.ACC_ID in _iso_to_ref_map:
                    ref_id = _iso_to_ref_map[site.ACC_ID]
                    data_by_up[ref_id][res_pos].append(site)
                # To catch additional cases, include an entry for the -1 base ID
                else:
                    data_by_up['%s-1' % base_acc_id] = data_by_up[base_acc_id]
                data_by_site_grp[site.SITE_GRP_ID].append(site)
            _data_by_up = data_by_up
            _data_by_site_grp = data_by_site_grp
    return (_data_by_up, _data_by_site_grp)


[docs]def map_to_human_site(up_id, mod_res, mod_pos):
    """Find site on human ref seq corresponding to (possibly non-human) site.

    Parameters
    ----------
    up_id : str
        Uniprot ID of the modified protein (generally human, rat, or mouse).
    mod_res : str
        Modified amino acid residue.
    mod_pos : str
        Amino acid sequence position.

    Returns
    -------
    str
        Returns amino acid position on the human reference sequence
        corresponding to the site on the given protein.
    """
    (data_by_up, data_by_site_grp) = _get_phospho_site_dataset()
    sites_for_up = data_by_up.get(up_id)
    # No info in Phosphosite for this Uniprot ID
    if not sites_for_up:
        return None
    site_info_list = sites_for_up.get('%s%s' % (mod_res, mod_pos))
    # DANGER: lookup based on pos alone
    # site_info_list = sites_for_up.get('%s' % mod_pos)
    # If this site doesn't exist for this protein, will return an empty list
    if not site_info_list:
        return None
    # At this point site_info_list contains a list of PhosphoSite objects
    # for the given protein with an entry for the given site, however, they
    # may be different isoforms; they may also not contain the reference
    # isoform; or they may only contain a single isoform.
    # If it's a single isoform, take it!
    if len(site_info_list) == 1:
        site_grp = site_info_list[0].SITE_GRP_ID
    # If there is more than one entry for this site, take the one from the
    # reference sequence, if it's in there (for example, a site that is present
    # in a mouse isoform as well as the mouse reference sequence)
    elif len(site_info_list) > 0:
        logger.debug('More than one entry in PhosphoSite for %s, site %s%s' %
                    (up_id, mod_res, mod_pos))
        ref_site_info = None
        si_by_grp = defaultdict(list)
        for si in site_info_list:
            logger.debug('\tSite info: acc_id %s, site_grp_id %s' %
                        (si.ACC_ID, si.SITE_GRP_ID))
            si_by_grp[si.SITE_GRP_ID].append(si)
            if si.ACC_ID == up_id:
                ref_site_info = si
        # If the reference sequence is not there but we have more than one
        # site_info in the list, this means there is more than one isoform
        # that has a phosphorylated site at the given position. The main thing
        # is whether there is more than one site group.
        if ref_site_info is None:
            # If there's only one unique site group, take it
            if len(si_by_grp) == 1:
                site_grp = site_info_list[0].SITE_GRP_ID
            # Otherwise, take the first
            else:
                logger.warning('More than one non-reference site group found '
                               'for (%s, %s, %s), not mapping' %
                               (up_id, mod_res, mod_pos))
                return None
        else:
            site_grp = ref_site_info.SITE_GRP_ID
    # Look up site group
    site_grp_list = data_by_site_grp[site_grp]
    # If an empty list, then return None (is unlikely to happen)
    if not site_grp_list:
        return None
    # Check for a human protein in the list
    human_sites = [s for s in site_grp_list if s.ORGANISM == 'human']
    if not human_sites:
        logger.debug("No corresponding human site for %s, choosing first: %s" %
                     (site_grp, site_grp_list[0].ORGANISM))
        #ref_site = site_grp_list[0]
        return None
    # If there are multiple isoforms, choose the base one
    # (no hyphen in the accession ID)
    elif len(human_sites) > 1:
        # In general we assume that multiple human sites only arise from
        # multiple isoforms of the same protein, which will share an accession
        # ID, and that only one of these will be the reference sequence (with
        # no hyphen).  However, in a small number of cases the isoform-specific
        # identifier will have a Uniprot ID without the hyphen, e.g. J3KPC8
        # for QSK iso5 (ref protein with ID Q9Y2K2).
        base_id_sites = [site for site in human_sites
                         if site.ACC_ID.find('-') == -1 and
                            site.ACC_ID not in _iso_to_ref_map]
        if base_id_sites:
            if len(base_id_sites) != 1:
                logger.warning("There is more than one apparent ref seq, "
                               "choosing first: %s" % base_id_sites)
            ref_site = base_id_sites[0]
        # There is no base ID site, i.e., all mapped sites are for specific
        # isoforms only, so skip it
        else:
            logger.info('Human isoform matches, but no human ref seq match '
                        'for %s %s %s; choosing first' %
                        (up_id, mod_res, mod_pos))
            ref_site = human_sites[0]
    # If there is only one human site, take it
    else:
        ref_site = human_sites[0]
    mapped_id = ref_site.ACC_ID
    human_site_str = ref_site.MOD_RSD.split('-')[0]
    human_res = human_site_str[0]
    human_pos = human_site_str[1:]
    motif, respos = _normalize_site_motif(ref_site.SITE_7_AA)
    pspmapping = PspMapping(mapped_id=mapped_id, mapped_res=human_res,
                            mapped_pos=human_pos,
                            motif=motif, respos=respos)
    return pspmapping


[docs]def sites_only(exclude_isoforms=False):
    """Return PhosphositePlus data as a flat list of proteins and sites.

    Parameters
    ----------
    exclude_isoforms : bool
        Whether to exclude sites for protein isoforms. Default is False
        (includes isoforms).

    Returns
    -------
    list of tuples
        Each tuple consists of (uniprot_id, residue, position).
    """
    sites = []
    (data_by_up, data_by_site_grp) = _get_phospho_site_dataset()
    for up_id, respos_dict in data_by_up.items():
        if exclude_isoforms and '-' in up_id:
            continue
        for respos in respos_dict.keys():
            res = respos[0]
            pos = respos[1:]
            sites.append((up_id, res, pos))
    return sites


def _normalize_site_motif(motif):
    """Normalize the PSP site motif to all caps with no underscores and return
    the preprocessed motif sequence and the position of the target residue
    in the motif (zero-indexed)."""
    no_underscores = motif.replace('_', '')
    offset = motif.find(no_underscores)
    respos = 7 - offset
    return (no_underscores.upper(), respos)