diff --git a/pandasaurus_cxg/graph_generator/graph_generator.py b/pandasaurus_cxg/graph_generator/graph_generator.py index 818af1a..7d77469 100644 --- a/pandasaurus_cxg/graph_generator/graph_generator.py +++ b/pandasaurus_cxg/graph_generator/graph_generator.py @@ -1,3 +1,4 @@ +import json import textwrap import uuid from enum import Enum @@ -120,9 +121,7 @@ def generate_rdf_graph(self, merge: bool = False): ) ) - self.graph.add( - (dataset_class, URIRef(self.ns[ncname_safe(key)]), Literal(value)) - ) + self.graph.add((dataset_class, URIRef(self.ns[ncname_safe(key)]), Literal(value))) has_source = URIRef(HAS_SOURCE["iri"]) self.graph.add((has_source, RDFS.label, Literal(HAS_SOURCE["label"]))) @@ -171,10 +170,25 @@ def generate_rdf_graph(self, merge: bool = False): resource = self.ns[_uuid] self.graph.add((resource, RDF.type, cell_set_class)) self.graph.add((resource, has_source, dataset_class)) + # Collect author_cell_type keys here + payload = {} for k, v in inner_dict.items(): if k in {"subcluster_of", "cluster_matches"}: continue - self.graph.add((resource, self.ns[ncname_safe(k)], Literal(v))) + elif k in {"cell_count", "cell_type"}: + self.graph.add((resource, self.ns[k], Literal(v))) + else: + # Author annotations go into the JSON payload + payload[k] = v + # Add one JSON literal for all author annotations + if payload: + self.graph.add( + ( + resource, + self.ns.author_cell_type_json, + Literal(json.dumps(payload, ensure_ascii=False)), + ) + ) # add relationship between each resource based on their predicate in the co_annotation_report subcluster = URIRef(SUBCLUSTER_OF.get("iri")) diff --git a/pandasaurus_cxg/graph_generator/graph_generator_utils.py b/pandasaurus_cxg/graph_generator/graph_generator_utils.py index 739e295..d9e487c 100644 --- a/pandasaurus_cxg/graph_generator/graph_generator_utils.py +++ b/pandasaurus_cxg/graph_generator/graph_generator_utils.py @@ -147,8 +147,8 @@ def ncname_safe(term: str) -> str: """ term = term.replace(" ", "_") - term = re.sub(r'^[^A-Za-z_]+', '', term) - return re.sub(r'[^A-Za-z0-9_\-\.]', '_', term) + term = re.sub(r"^[^A-Za-z_]+", "", term) + return re.sub(r"[^A-Za-z0-9_\-\.]", "_", term) def parse_citation_field_into_dict(value: str) -> Dict[str, str]: diff --git a/test/graph_generator/test_graph_generator.py b/test/graph_generator/test_graph_generator.py index fa6c149..b94cef4 100644 --- a/test/graph_generator/test_graph_generator.py +++ b/test/graph_generator/test_graph_generator.py @@ -199,7 +199,7 @@ def test_generate_rdf_graph_with_merge(graph_generator_instance_for_kidney, expe ) == expected_stable_ids ) - assert len(graph_generator.graph) == 747 + assert len(graph_generator.graph) == 584 assert ( len([[s, p, o] for s, p, o in graph_generator.graph.triples((None, RDF.type, None))]) == 146 ) @@ -216,7 +216,7 @@ def test_generate_rdf_graph_with_merge(graph_generator_instance_for_kidney, expe ) ] ) - == 90 + == 77 ) assert ( len( @@ -241,7 +241,7 @@ def test_generate_rdf_graph_with_merge(graph_generator_instance_for_kidney, expe def test_generate_rdf_graph_without_merge(graph_generator_instance_for_kidney): graph_generator = graph_generator_instance_for_kidney graph_generator.generate_rdf_graph() - assert len(graph_generator.graph) == 2177 + assert len(graph_generator.graph) == 1398 assert ( len([[s, p, o] for s, p, o in graph_generator.graph.triples((None, RDF.type, None))]) == 312 ) @@ -270,11 +270,11 @@ def test_enrich_rdf_graph_with_merge(graph_generator_instance_for_kidney): graph_generator = graph_generator_instance_for_kidney graph_generator.generate_rdf_graph(merge=True) - assert len(graph_generator.graph) == 747 + assert len(graph_generator.graph) == 584 graph_generator.enrich_rdf_graph() - assert len(graph_generator.graph) == 1242 + assert len(graph_generator.graph) == 1081 assert ( URIRef(CONSIST_OF.get("iri")), RDFS.label, @@ -291,7 +291,7 @@ def test_enrich_rdf_graph_with_merge(graph_generator_instance_for_kidney): if str(s).startswith("http://purl.obolibrary.org/obo/CL_") ] ) - == 531 + == 529 ) @@ -299,11 +299,11 @@ def test_enrich_rdf_graph_without_merge(graph_generator_instance_for_kidney): graph_generator = graph_generator_instance_for_kidney graph_generator.generate_rdf_graph() - assert len(graph_generator.graph) == 2177 + assert len(graph_generator.graph) == 1398 graph_generator.enrich_rdf_graph() - assert len(graph_generator.graph) == 2674 + assert len(graph_generator.graph) == 1895 def test_save_rdf_graph(graph_generator_instance_for_kidney):