Skip to content
2 changes: 1 addition & 1 deletion app/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,7 @@ class Config(object):
KNOWLEDGEBASE_KEY = os.environ.get("KNOWLEDGEBASE_KEY", "secret-key")
DEPLOY_ENV = os.environ.get("DEPLOY_ENV", "development")
SPARC_APP_HOST = os.environ.get("SPARC_APP_HOST", "https://sparc-app.herokuapp.com")
SCI_CRUNCH_HOST = os.environ.get("SCICRUNCH_HOST", "https://scicrunch.org/api/1/elastic/SPARC_Datasets_pr")
SCI_CRUNCH_HOST = os.environ.get("SCICRUNCH_HOST", "https://scicrunch.org/api/1/elastic/SPARC_PortalDatasets_dev")
MAPSTATE_TABLENAME = os.environ.get("MAPSTATE_TABLENAME", "mapstates")
SCAFFOLDSTATE_TABLENAME = os.environ.get("SCAFFOLDSTATE_TABLENAME", "scaffoldstates")
WRIKE_TOKEN = os.environ.get("WRIKE_TOKEN")
Expand Down
57 changes: 57 additions & 0 deletions app/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -210,6 +210,63 @@ def direct_download_url(path):
resource = response["Body"].read()
return resource

# /scicrunch-dataset/: Returns the raw SciCrunch result for a single dataset DOI.
@app.route("/scicrunch-dataset/<doi1>/<doi2>")
def sci_doi(doi1, doi2):
    """Look up one dataset in SciCrunch by DOI.

    The DOI arrives as two path segments (prefix/suffix) and is rejoined here.
    Returns the raw SciCrunch Elasticsearch response as JSON, or a JSON error
    object on HTTP failure.
    """
    doi = doi1 + '/' + doi2
    print(doi)
    data = create_doi_request(doi)
    try:
        response = requests.post(
            f'{Config.SCI_CRUNCH_HOST}/_search?api_key={Config.KNOWLEDGEBASE_KEY}',
            json=data)
        # raise_for_status() is required for the HTTPError handler below to
        # ever fire; requests does not raise on 4xx/5xx by itself.
        response.raise_for_status()
        return response.json()
    except requests.exceptions.HTTPError as err:
        logging.error(err)
        # str(err): exception objects are not JSON-serializable.
        return json.dumps({'error': str(err)})

# /scicrunch-dataset-processed/: Returns processed scicrunch results for a single dataset DOI.
@app.route("/scicrunch-dataset-processed/<doi1>/<doi2>")
def sci_doi_processed(doi1, doi2):
    """Look up one dataset in SciCrunch by DOI and post-process the hits.

    Same request as sci_doi, but the response is passed through
    process_kb_results() to extract the attributes the portal needs.
    """
    doi = doi1 + '/' + doi2
    print(doi)
    data = create_doi_request(doi)
    try:
        response = requests.post(
            f'{Config.SCI_CRUNCH_HOST}/_search?api_key={Config.KNOWLEDGEBASE_KEY}',
            json=data)
        # raise_for_status() is required for the HTTPError handler below to
        # ever fire; requests does not raise on 4xx/5xx by itself.
        response.raise_for_status()
        return process_kb_results(response.json())
    except requests.exceptions.HTTPError as err:
        logging.error(err)
        # str(err): exception objects are not JSON-serializable.
        return json.dumps({'error': str(err)})

# /scicrunch-query-string/: Returns results for given organ curie. These can be processed by the sidebar
@app.route("/scicrunch-query-string/")
def sci_organ():
    """Run a SciCrunch query_string search over the given fields/curie.

    Query parameters:
        field: repeatable; Elasticsearch field names (e.g. "*organ.curie").
        curie: the curie to match.  NOTE(review): missing 'curie' yields None
               and an invalid query — confirm callers always supply it.
    """
    fields = request.args.getlist('field')
    curie = request.args.get('curie')
    # field example: "*organ.curie"
    data = {
        "size": 20,
        "from": 0,
        "query": {
            "query_string": {
                "fields": fields,
                "query": curie
            }
        }
    }

    try:
        response = requests.post(
            f'{Config.SCI_CRUNCH_HOST}/_search?api_key={Config.KNOWLEDGEBASE_KEY}',
            json=data)
        # raise_for_status() is required for the HTTPError handler below to
        # ever fire; requests does not raise on 4xx/5xx by itself.
        response.raise_for_status()
        return process_kb_results(response.json())
    except requests.exceptions.HTTPError as err:
        logging.error(err)
        # str(err): exception objects are not JSON-serializable.
        return json.dumps({'error': str(err)})



# /search/: Returns scicrunch results for a given <search> query
@app.route("/search/", defaults={'query': ''})
Expand Down
61 changes: 54 additions & 7 deletions app/process_kb_results.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,23 @@
'csvFiles': ['objects']
}

def create_doi_request(doi):
    """Build an Elasticsearch request body that selects one dataset by DOI.

    The document id in the SPARC index is the DOI curie ('DOI:<doi>'), so the
    query is a match_all constrained by a term filter on '_id'.
    """
    doi_filter = {"term": {"_id": f'DOI:{doi}'}}
    return {
        "query": {
            "bool": {
                "must": [{"match_all": {}}],
                "should": [],
                "filter": doi_filter
            }
        }
    }

# create_facet_query(type): Generates facet search request data for scicrunch given a 'type'; where
# 'type' is either 'species', 'gender', or 'genotype' at this stage.
Expand All @@ -25,7 +42,8 @@ def create_facet_query(type):
type_map = {
'species': ['organisms.primary.species.name.aggregate', 'organisms.sample.species.name.aggregate'],
'gender': ['attributes.subject.sex.value'],
'genotype': ['anatomy.organ.name.aggregate']
'genotype': ['anatomy.organ.name.aggregate'],
'organ': ['anatomy.organ.name.aggregate']
}

data = {
Expand Down Expand Up @@ -69,7 +87,8 @@ def create_filter_request(query, terms, facets, size, start):
type_map = {
'species': ['organisms.primary.species.name.aggregate', 'organisms.sample.species.name'],
'gender': ['attributes.subject.sex.value', 'attributes.sample.sex.value'],
'genotype': ['anatomy.organ.name.aggregate']
'genotype': ['anatomy.organ.name.aggregate'],
'organ': ['anatomy.organ.name.aggregate']
}

# Data structure of a scicrunch search
Expand Down Expand Up @@ -136,7 +155,9 @@ def process_kb_results(results):
for i, hit in enumerate(hits):
attr = get_attributes(attributes, hit)
attr['doi'] = convert_doi_to_url(attr['doi'])
attr['csvFiles'] = find_csv_files(attr['csvFiles'])
objects = attr['csvFiles'] # Have to do this as not all datsets return objects
attr['csvFiles'] = find_csv_files(objects)
attr['scaffolds'] = find_scaffold_json_files(objects)
output.append(attr)
return json.dumps({'numberOfHits': results['hits']['total'], 'results': output})

Expand All @@ -146,11 +167,36 @@ def convert_doi_to_url(doi):
return doi
return doi.replace('DOI:', 'https://doi.org/')

def convert_url_to_doi(doi):
    """Turn a doi.org URL back into a 'DOI:' curie (inverse of convert_doi_to_url).

    Falsy input (None, empty string) is returned unchanged.
    """
    if doi:
        return doi.replace('https://doi.org/', 'DOI:')
    return doi


def find_csv_files(obj_list):
    """Return the objects whose mimetype name is 'text/csv'.

    Falsy input (None, empty list) is returned unchanged.
    """
    if not obj_list:
        return obj_list
    csv_objects = []
    for item in obj_list:
        mime_name = item.get('mimetype', {}).get('name', 'none')
        if mime_name == 'text/csv':
            csv_objects.append(item)
    return csv_objects


def find_scaffold_json_files(obj_list):
    """Return the objects whose additional_mimetype marks an ABI scaffold file.

    Falsy input (None, empty list) is returned unchanged.
    """
    if not obj_list:
        return obj_list

    def is_scaffold(item):
        # Scaffolds are tagged via 'additional_mimetype', not 'mimetype'.
        return item.get('additional_mimetype', {}).get('name', 'none') == 'inode/vnd.abi.scaffold+file'

    return [item for item in obj_list if is_scaffold(item)]


# Map of output attribute name -> key path walked through a SciCrunch hit's
# '_source' dict by get_attributes().
# NOTE(review): this appears to redefine an 'attributes' mapping declared
# earlier in this module — confirm the earlier copy is obsolete and removed.
attributes = {
    'scaffolds': ['scaffolds'],
    'samples': ['attributes','sample','subject'],
    'name': ['item','name'],
    'identifier': ['item', 'identifier'],
    'uri': ['distributions', 'current', 'uri'],
    'updated': ['dates', 'updated'],
    'organs': ['anatomy', 'organ'],
    'contributors': ['contributors'],
    'doi': ['item', 'curie'],
    'csvFiles': ['objects']
}


# get_attributes: Use 'attributes' (defined at top of this document) to step through the large scicrunch result dict
Expand All @@ -160,11 +206,12 @@ def get_attributes(attributes, dataset):
for k, attr in attributes.items():
subset = dataset['_source'] # set our subest to the full dataset result
key_attr = False
for key in attr:
for n, key in enumerate(attr): # step through attributes
if isinstance(subset, dict):
if key in subset.keys():
if key in subset.keys(): # continue if keys are found
subset = subset[key]
key_attr = subset
if n+1 is len(attr): # if we made it to the end, save this subset
key_attr = subset
found_attr[k] = key_attr
return found_attr

5 changes: 5 additions & 0 deletions tests/test_scicrunch.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,11 @@ def test_scicrunch_keys(client):
assert r.status_code == 200
assert 'numberOfHits' in json.loads(r.data).keys()

def test_scicrunch_dataset_doi(client):
    # Fetch a known dataset through /scicrunch-dataset/<doi1>/<doi2> and check
    # the returned Elasticsearch hit id.
    # NOTE(review): the URL embeds an encoded 'DOI:' prefix, but the endpoint's
    # create_doi_request() already prepends 'DOI:' — confirm this does not send
    # a doubled 'DOI:DOI:' id in the term filter.
    r = client.get('/scicrunch-dataset/DOI%3A10.26275%2Fpzek-91wx')
    assert json.loads(r.data)['hits']['hits'][0]['_id'] == "DOI:10.26275/pzek-91wx"


def test_scicrunch_search(client):
r = client.get('/search/heart')
assert r.status_code == 200
Expand Down