import gzip
import os
import re
import csv
import zlib
import json
import boto3
import pystow
import logging
import argparse
import requests
import botocore
from ftplib import FTP
from io import BytesIO, StringIO
from collections import namedtuple
from urllib.request import urlretrieve
from xml.etree import ElementTree as ET
from . import __version__
logger = logging.getLogger('protmapper.resources')
logger.setLevel(logging.INFO)
# If the protmapper resource directory does not exist, try to create it using PyStow
# Can be specified with PROTMAPPER_HOME environment variable, otherwise defaults
# to $HOME/.data/protmapper/<__version__>. The location of $HOME can be overridden with
# the PYSTOW_HOME environment variable
resource_dir_path = pystow.join('protmapper', __version__)
resource_dir = resource_dir_path.as_posix()
def _download_from_s3(key, out_file):
s3 = boto3.client('s3',
config=botocore.client.Config(
signature_version=botocore.UNSIGNED))
tc = boto3.s3.transfer.TransferConfig(use_threads=False)
# Path to the versioned resource file
full_key = 'protmapper/%s/%s' % (__version__, key)
s3.download_file('bigmech', full_key, out_file, Config=tc)
def _download_ftp(ftp_host, ftp_path, out_file=None, ftp_blocksize=33554432,
decompress=True):
ftp = FTP(ftp_host)
ftp.login()
fbytes = BytesIO()
ftp.retrbinary('RETR %s' % ftp_path,
callback=lambda s: fbytes.write(s),
blocksize=ftp_blocksize)
ret = fbytes.getvalue()
if decompress:
ret = zlib.decompress(ret, 16+zlib.MAX_WBITS)
if out_file is not None:
with open(out_file, 'wb') as f:
f.write(ret)
return ret
def download_phosphositeplus(out_file, cached=True):
if not cached:
logger.info('Cannot download PSP data without using the cache.')
return
logger.info("Note that PhosphoSitePlus data is not available for "
"commercial use; please see full terms and conditions at: "
"https://www.psp.org/staticDownloads")
_download_from_s3('Phosphorylation_site_dataset.tsv.gz', out_file)
def download_uniprot_entries(out_file, cached=True):
if cached:
_download_from_s3('uniprot_entries.tsv.gz', out_file)
return
base_columns = [
'accession', # 'id',
'gene_primary', # 'genes(PREFERRED)',
'id', # 'entry%20name',
'xref_rgd', # 'database(RGD)',
'xref_mgi', # 'database(MGI)',
'length', # 'length',
'reviewed', # 'reviewed',
'organism_id', # 'organism-id',
'xref_geneid', # 'database(GeneID)'
]
processed_columns = [
'gene_names', # 'genes',
'protein_name', # 'protein%20names'
]
feature_types = {
'ft_signal': 'SIGNAL', # 'SIGNAL',
'ft_chain': 'CHAIN', # 'CHAIN',
'ft_propep': 'PROPEPTIDE', # 'PROPEPTIDE',
'ft_peptide': 'PEPTIDE', # 'PEPTIDE',
'ft_transit': 'TRANSIT', # 'TRANSIT',
}
feature_columns = list(feature_types)
columns = base_columns + processed_columns + feature_columns
columns_str = ','.join(columns)
logger.info('Downloading UniProt entries')
url = 'https://rest.uniprot.org/uniprotkb/stream?' \
'format=tsv&' \
'query=reviewed:true&' \
'compressed=true&' \
'sort=accession asc&' \
'fields=' + columns_str
#url = 'http://www.uniprot.org/uniprot/?' + \
# 'sort=id&desc=no&compress=no&query=reviewed:yes&' + \
# 'format=tab&columns=' + columns_str
logger.info('Downloading %s' % url)
res = requests.get(url)
if res.status_code != 200:
logger.info('Failed to download "%s"' % url)
reviewed_entries = gzip.decompress(res.content).decode('utf-8')
url = 'https://rest.uniprot.org/uniprotkb/stream?' \
'format=tsv&' \
'query=organism_id:9606 AND (reviewed:false)&' \
'compressed=true&' \
'sort=accession asc&' \
'fields=' + columns_str
#url = 'http://www.uniprot.org/uniprot/?' + \
# 'sort=id&desc=no&compress=no&query=reviewed:no&fil=organism:' + \
# '%22Homo%20sapiens%20(Human)%20[9606]%22&' + \
# 'format=tab&columns=' + columns_str
logger.info('Downloading %s' % url)
res = requests.get(url)
if res.status_code != 200:
logger.info('Failed to download "%s"' % url)
unreviewed_human_entries = gzip.decompress(res.content).decode('utf-8')
if not((reviewed_entries is not None) and
(unreviewed_human_entries is not None)):
return
lines = reviewed_entries.strip('\n').split('\n')
lines += unreviewed_human_entries.strip('\n').split('\n')[1:]
logger.info('Processing UniProt entries list.')
new_lines = ['\t'.join(base_columns + processed_columns + ['features'])]
for line_idx, line in enumerate(lines):
if line_idx == 0:
continue
new_line = process_uniprot_line(line, base_columns, processed_columns,
list(feature_types.values()))
new_lines.append(new_line)
# Join all lines into a single string
full_table = '\n'.join(new_lines)
logger.info('Saving into %s.' % out_file)
with gzip.open(out_file, 'wt', encoding='utf-8') as fh:
fh.write(full_table)
def process_uniprot_line(line, base_columns, processed_columns,
feature_types):
terms = line.split('\t')
terms[4] = terms[4].replace('MGI:', '')
# At this point, we need to clean up the gene names.
# If there are multiple gene names, take the first one
gene_names_preferred = terms[1].split(';')
gene_name = gene_names_preferred[0]
if not gene_name:
gene_name = terms[len(base_columns)].split(' ')[0]
protein_names = parse_uniprot_synonyms(terms[len(base_columns)+1])
protein_name = protein_names[0] if protein_names else None
if gene_name:
terms[1] = gene_name
elif protein_name:
terms[1] = protein_name
else:
terms[1] = None
# We only add Entrez IDs for reviewed entries to avoid the problem
# caused by one-to-many mappings with lots of unreviewed proteins
terms[8] = '' if terms[6] != 'reviewed' else terms[8]
# Next we process the various features into a form that can be
# loaded easily in the client
features = []
for idx, feature_type in enumerate(feature_types):
col_idx = len(base_columns) + len(processed_columns) + idx
features += _process_feature(feature_type, terms[col_idx],
protein_name)
features_json = [feature_to_json(feature) for feature in features]
features_json_str = json.dumps(features_json)
new_line = terms[:len(base_columns)] + [features_json_str]
return '\t'.join(new_line)
def parse_uniprot_synonyms(synonyms_str):
synonyms_str = re.sub(r'\[Includes: ([^]])+\]',
'', synonyms_str).strip()
synonyms_str = re.sub(r'\[Cleaved into: ([^]])+\]( \(Fragments\))?',
'', synonyms_str).strip()
def find_block_from_right(s):
parentheses_depth = 0
assert s.endswith(')')
s = s[:-1]
block = ''
for c in s[::-1]:
if c == ')':
parentheses_depth += 1
elif c == '(':
if parentheses_depth > 0:
parentheses_depth -= 1
else:
return block
block = c + block
return block
syns = []
while True:
if not synonyms_str:
return syns
if not synonyms_str.endswith(')'):
return [synonyms_str] + syns
syn = find_block_from_right(synonyms_str)
synonyms_str = synonyms_str[:-len(syn)-3]
# EC codes are not valid synonyms
if not re.match(r'EC [\d\.-]+', syn):
syns = [syn] + syns
Feature = namedtuple('Feature', ['type', 'begin', 'end', 'name', 'id',
'is_main'])
def feature_to_json(feature):
jj = {
'type': feature.type,
'begin': feature.begin,
'end': feature.end,
'name': feature.name,
'id': feature.id
}
if feature.is_main:
jj['is_main'] = True
return jj
def feature_from_json(feature_json):
if 'is_main' not in feature_json:
feature_json['is_main'] = False
return Feature(**feature_json)
def _process_feature(feature_type, feature_str, protein_name):
"""Process a feature string from the UniProt TSV.
Documentation at: https://www.uniprot.org/help/sequence_annotation
"""
# This function merges parts that were split inadvertently on semicolons
def _fix_parts(parts):
for idx, part in enumerate(parts):
if part.startswith('/') and not part.endswith('"'):
parts[idx] += parts[idx+1]
parts = [p for idx, p in enumerate(parts) if idx != idx+1]
return parts
# Split parts and strip off extra spaces
parts = [p.strip(' ;') for p in feature_str.split('; ')]
parts = _fix_parts(parts)
# Find each starting part e.g., CHAIN
chunk_ids = [idx for idx, part in enumerate(parts)
if part.startswith(feature_type)] + [len(parts)]
# Group parts into chunks, one for each overall entry
chunks = []
for idx, chunk_id in enumerate(chunk_ids[:-1]):
chunks.append(parts[chunk_ids[idx]:chunk_ids[idx+1]])
feats = []
# For each distinct entry, we collect all the relevant parts and parse
# out information
for chunk in chunks:
begin = end = name = pid = None
for part in chunk:
# If this is the starting piece, we have to parse out the begin
# and end coordinates. Caveats include: sometimes only one
# number is given; sometimes a ? is there instead of a number;
# sometimes a question mark precedes a number; sometimes
# there is a < before the beginning number; sometimes there
# is a > before the end number. Sometimes there is an isoform
# before the beginning number.
if part.startswith(feature_type):
match = re.match(r'%s ' # type marker
r'(?:[^:]+:)?(?:\?|<?)(\d+|\?)' # beginning
r'..' # connector
r'(?:\?|>?)(\d+|\?)' % # end
feature_type, part)
if match:
beg, end = match.groups()
else:
# This is the standard begin marker
match = re.match(r'%s (\d+)' % feature_type, part)
beg = match.groups()[0]
end = beg
begin = int(beg) if beg != '?' else None
end = int(end) if end != '?' else None
elif part.startswith('/note'):
match = re.match(r'/note="(.+)"', part)
name = match.groups()[0]
elif part.startswith('/id'):
match = re.match(r'/id="(.+)"', part)
pid = match.groups()[0]
is_main = (name == protein_name)
feature = Feature(feature_type, begin, end, name, pid, is_main)
feats.append(feature)
return feats
def download_uniprot_sec_ac(out_file, cached=True):
if cached:
_download_from_s3('uniprot_sec_ac.txt.gz', out_file)
return
logger.info('Downloading UniProt secondary accession mappings')
url = 'ftp://ftp.uniprot.org/pub/databases/uniprot/knowledgebase/' + \
'complete/docs/sec_ac.txt'
content = _download_ftp('ftp.uniprot.org',
'pub/databases/uniprot/knowledgebase/complete/'
'docs/sec_ac.txt',
decompress=False, out_file=None)
with gzip.open(out_file, 'wb') as fh:
fh.write(content)
def download_hgnc_entries(out_file, cached=True):
if cached:
_download_from_s3('hgnc_entries.tsv.gz', out_file)
return
logger.info('Downloading HGNC entries')
url = 'http://tinyurl.com/y83dx5s6'
res = requests.get(url)
if res.status_code != 200:
logger.error('Failed to download "%s"' % url)
return
logger.info('Saving into %s' % out_file)
with gzip.open(out_file, 'wt', encoding='utf-8') as fh:
fh.write(res.text)
def download_swissprot(out_file, cached=True):
if cached:
_download_from_s3('uniprot_sprot.fasta.gz', out_file)
return
logger.info('Downloading reviewed protein sequences from SwissProt')
ftp_path = ('/pub/databases/uniprot/current_release/knowledgebase/'
'complete/uniprot_sprot.fasta.gz')
_download_ftp('ftp.uniprot.org', ftp_path, out_file, decompress=False)
def download_isoforms(out_file, cached=True):
if cached:
_download_from_s3('uniprot_sprot_varsplic.fasta.gz', out_file)
return
logger.info('Downloading isoform sequences from Uniprot')
ftp_path = ('/pub/databases/uniprot/current_release/knowledgebase/'
'complete/uniprot_sprot_varsplic.fasta.gz')
_download_ftp('ftp.uniprot.org', ftp_path, out_file, decompress=False)
def download_refseq_seq(out_file, cached=True):
if cached:
_download_from_s3('refseq_sequence.fasta.gz', out_file)
return
ftp_path = ('/refseq/H_sapiens/annotation/GRCh38_latest/'
'refseq_identifiers/GRCh38_latest_protein.faa.gz')
_download_ftp('ftp.ncbi.nlm.nih.gov', ftp_path, out_file,
decompress=False)
def download_refseq_uniprot(out_file, cached=True):
if cached:
_download_from_s3('refseq_uniprot.csv.gz', out_file)
return
logger.info('Downloading RefSeq->Uniprot mappings from Uniprot')
ftp_path = ('/pub/databases/uniprot/current_release/knowledgebase/'
'idmapping/by_organism/HUMAN_9606_idmapping.dat.gz')
mappings_bytes = _download_ftp('ftp.uniprot.org', ftp_path,
out_file=None, decompress=True)
logger.info('Processing RefSeq->Uniprot mappings file')
mappings_io = StringIO(mappings_bytes.decode('utf8'))
csvreader = csv.reader(mappings_io, delimiter='\t')
filt_rows = []
for up_id, other_type, other_id in csvreader:
if other_type == 'RefSeq':
filt_rows.append([other_id, up_id])
# Write the file with just the RefSeq->UP mappings
with gzip.open(out_file, 'wt', encoding='utf-8') as fh:
writer = csv.writer(fh)
writer.writerows(filt_rows)
RESOURCE_MAP = {
'hgnc': ('hgnc_entries.tsv.gz', download_hgnc_entries),
'upsec': ('uniprot_sec_ac.txt.gz', download_uniprot_sec_ac),
'up': ('uniprot_entries.tsv.gz', download_uniprot_entries),
'psp': ('Phosphorylation_site_dataset.tsv.gz', download_phosphositeplus),
'swissprot': ('uniprot_sprot.fasta.gz', download_swissprot),
'isoforms': ('uniprot_sprot_varsplic.fasta.gz', download_isoforms),
'refseq_uniprot': ('refseq_uniprot.csv.gz', download_refseq_uniprot),
'refseq_seq': ('refseq_sequence.fasta.gz', download_refseq_seq),
}
[docs]class ResourceManager(object):
"""Class to manage a set of resource files.
Parameters
----------
resource_map : dict
A dict that maps resource file IDs to a tuple of resource file names
and download functions.
"""
def __init__(self, resource_map):
self.resource_map = resource_map
[docs] def get_resource_file(self, resource_id):
"""Return the path to the resource file with the given ID.
Parameters
----------
resource_id : str
The ID of the resource.
Returns
-------
str
The path to the resource file.
"""
return os.path.join(resource_dir_path, self.resource_map[resource_id][0])
[docs] def get_download_fun(self, resource_id):
"""Return the download function for the given resource.
Parameters
----------
resource_id : str
The ID of the resource.
Returns
-------
function
The download function for the given resource.
"""
return self.resource_map[resource_id][1]
[docs] def has_resource_file(self, resource_id):
"""Return True if the resource file exists for the given ID.
Parameters
----------
resource_id : str
The ID of the resource.
Returns
-------
bool
True if the resource file exists, false otherwise.
"""
fname = self.get_resource_file(resource_id)
return os.path.exists(fname)
[docs] def download_resource_file(self, resource_id, cached=True):
"""Download the resource file corresponding to the given ID.
Parameters
----------
resource_id : str
The ID of the resource.
cached : Optional[bool]
If True, the download is a pre-processed file from S3, otherwise
the download is obtained and processed from the primary source.
Default: True
"""
download_fun = self.get_download_fun(resource_id)
fname = self.get_resource_file(resource_id)
logger.info('Downloading \'%s\' resource file into %s%s.' %
(resource_id, fname, ' from cache' if cached else ''))
download_fun(fname, cached=cached)
[docs] def get_create_resource_file(self, resource_id, cached=True):
"""Return the path to the resource file, download if it doesn't exist.
Parameters
----------
resource_id : str
The ID of the resource.
cached : Optional[bool]
If True, the download is a pre-processed file from S3, otherwise
the download is obtained and processed from the primary source.
Default: True
Returns
-------
str
The path to the resource file.
"""
if not self.has_resource_file(resource_id):
logger.info(('Could not access \'%s\' resource'
' file, will download.') % resource_id)
self.download_resource_file(resource_id, cached)
return self.get_resource_file(resource_id)
[docs] def get_resource_ids(self):
"""Return a list of all the resource IDs managed by this manager."""
return list(self.resource_map.keys())
resource_manager = ResourceManager(RESOURCE_MAP)
if __name__ == '__main__':
parser = argparse.ArgumentParser()
# By default we use the cache
parser.add_argument('--uncached', action='store_true')
# By default we use get_create which doesn't do anything if the resource
# already exists. With the download flag, we force re-download.
parser.add_argument('--download', action='store_true')
args = parser.parse_args()
logger.info(args)
resource_ids = resource_manager.get_resource_ids()
for resource_id in resource_ids:
if not args.download:
resource_manager.get_create_resource_file(resource_id,
cached=(not
args.uncached))
else:
resource_manager.download_resource_file(resource_id,
cached=(not args.uncached))