diff --git a/graphgen/bases/datatypes.py b/graphgen/bases/datatypes.py index 58dbda2e..cb3be345 100644 --- a/graphgen/bases/datatypes.py +++ b/graphgen/bases/datatypes.py @@ -15,7 +15,7 @@ def from_dict(key: str, data: dict) -> "Chunk": return Chunk( id=key, content=data.get("content", ""), - type=data.get("type", "unknown"), + type=data.get("type", "text"), metadata={k: v for k, v in data.items() if k != "content"}, ) diff --git a/graphgen/configs/protein_qa_config.yaml b/graphgen/configs/protein_qa_config.yaml new file mode 100644 index 00000000..d98f47c1 --- /dev/null +++ b/graphgen/configs/protein_qa_config.yaml @@ -0,0 +1,19 @@ +read: + input_file: resources/input_examples/protein_qa_demo.json # input file path, support json, jsonl, txt, pdf. See resources/input_examples for examples + anchor_type: protein # get protein information from chunks +split: + chunk_size: 1024 # chunk size for text splitting + chunk_overlap: 100 # chunk overlap for text splitting +search: # web search configuration + enabled: false # whether to enable web search + search_types: ["google"] # search engine types, support: google, bing, uniprot, wikipedia +quiz_and_judge: # quiz and test whether the LLM masters the knowledge points + enabled: false +partition: # graph partition configuration + method: anchor_bfs # partition method + method_params: + anchor_type: protein # node type to select anchor nodes + max_units_per_community: 10 # atomic partition, one node or edge per community +generate: + mode: protein_qa # atomic, aggregated, multi_hop, cot, vqa + data_format: ChatML # Alpaca, Sharegpt, ChatML diff --git a/graphgen/graphgen.py b/graphgen/graphgen.py index e8258829..4f9a8d8d 100644 --- a/graphgen/graphgen.py +++ b/graphgen/graphgen.py @@ -16,8 +16,7 @@ Tokenizer, ) from graphgen.operators import ( - build_mm_kg, - build_text_kg, + build_kg, chunk_documents, generate_qas, init_llm, @@ -96,109 +95,46 @@ async def insert(self, read_config: Dict, split_config: Dict): new_docs = {compute_mm_hash(doc, prefix="doc-"): doc for doc in data} _add_doc_keys = await self.full_docs_storage.filter_keys(list(new_docs.keys())) new_docs = {k: v for k, v in new_docs.items() if k in _add_doc_keys} - new_text_docs = {k: v for k, v in new_docs.items() if v.get("type") == "text"} - new_mm_docs = {k: v for k, v in new_docs.items() if v.get("type") != "text"} - - await self.full_docs_storage.upsert(new_docs) - - async def _insert_text_docs(text_docs): - if len(text_docs) == 0: - logger.warning("All text docs are already in the storage") - return - logger.info("[New Docs] inserting %d text docs", len(text_docs)) - # Step 2.1: Split chunks and filter existing ones - inserting_chunks = await chunk_documents( - text_docs, - split_config["chunk_size"], - split_config["chunk_overlap"], - self.tokenizer_instance, - self.progress_bar, - ) - _add_chunk_keys = await self.chunks_storage.filter_keys( - list(inserting_chunks.keys()) - ) - inserting_chunks = { - k: v for k, v in inserting_chunks.items() if k in _add_chunk_keys - } - - if len(inserting_chunks) == 0: - logger.warning("All text chunks are already in the storage") - return - - logger.info("[New Chunks] inserting %d text chunks", len(inserting_chunks)) - await self.chunks_storage.upsert(inserting_chunks) - - # Step 2.2: Extract entities and relations from text chunks - logger.info("[Text Entity and Relation Extraction] processing ...") - _add_entities_and_relations = await build_text_kg( - llm_client=self.synthesizer_llm_client, - kg_instance=self.graph_storage, - chunks=[ - 
Chunk(id=k, content=v["content"], type="text") - for k, v in inserting_chunks.items() - ], - progress_bar=self.progress_bar, - ) - if not _add_entities_and_relations: - logger.warning("No entities or relations extracted from text chunks") - return - - await self._insert_done() - return _add_entities_and_relations - - async def _insert_multi_modal_docs(mm_docs): - if len(mm_docs) == 0: - logger.warning("No multi-modal documents to insert") - return - - logger.info("[New Docs] inserting %d multi-modal docs", len(mm_docs)) - - # Step 3.1: Transform multi-modal documents into chunks and filter existing ones - inserting_chunks = await chunk_documents( - mm_docs, - split_config["chunk_size"], - split_config["chunk_overlap"], - self.tokenizer_instance, - self.progress_bar, - ) + if len(new_docs) == 0: + logger.warning("All documents are already in the storage") + return - _add_chunk_keys = await self.chunks_storage.filter_keys( - list(inserting_chunks.keys()) - ) - inserting_chunks = { - k: v for k, v in inserting_chunks.items() if k in _add_chunk_keys - } + inserting_chunks = await chunk_documents( + new_docs, + split_config["chunk_size"], + split_config["chunk_overlap"], + self.tokenizer_instance, + self.progress_bar, + ) - if len(inserting_chunks) == 0: - logger.warning("All multi-modal chunks are already in the storage") - return + _add_chunk_keys = await self.chunks_storage.filter_keys( + list(inserting_chunks.keys()) + ) + inserting_chunks = { + k: v for k, v in inserting_chunks.items() if k in _add_chunk_keys + } - logger.info( - "[New Chunks] inserting %d multimodal chunks", len(inserting_chunks) - ) - await self.chunks_storage.upsert(inserting_chunks) - - # Step 3.2: Extract multi-modal entities and relations from chunks - logger.info("[Multi-modal Entity and Relation Extraction] processing ...") - _add_entities_and_relations = await build_mm_kg( - llm_client=self.synthesizer_llm_client, - kg_instance=self.graph_storage, - chunks=[Chunk.from_dict(k, v) for k, v in inserting_chunks.items()], - progress_bar=self.progress_bar, - ) - if not _add_entities_and_relations: - logger.warning( - "No entities or relations extracted from multi-modal chunks" - ) - return - await self._insert_done() - return _add_entities_and_relations - - # Step 2: Insert text documents - await _insert_text_docs(new_text_docs) - # Step 3: Insert multi-modal documents - await _insert_multi_modal_docs(new_mm_docs) + if len(inserting_chunks) == 0: + logger.warning("All chunks are already in the storage") + return + + logger.info("[New Chunks] inserting %d chunks", len(inserting_chunks)) + await self.chunks_storage.upsert(inserting_chunks) + + _add_entities_and_relations = await build_kg( + llm_client=self.synthesizer_llm_client, + kg_instance=self.graph_storage, + chunks=[Chunk.from_dict(k, v) for k, v in inserting_chunks.items()], + anchor_type=read_config.get("anchor_type", None), + progress_bar=self.progress_bar, + ) + if not _add_entities_and_relations: + logger.warning("No entities or relations extracted from text chunks") + return + + await self._insert_done() + return _add_entities_and_relations async def _insert_done(self): tasks = [] diff --git a/graphgen/models/__init__.py b/graphgen/models/__init__.py index 08694166..7ae53799 100644 --- a/graphgen/models/__init__.py +++ b/graphgen/models/__init__.py @@ -6,7 +6,7 @@ MultiHopGenerator, VQAGenerator, ) -from .kg_builder import LightRAGKGBuilder, MMKGBuilder +from .kg_builder import LightRAGKGBuilder, MMKGBuilder, MOKGBuilder from .llm import HTTPClient, 
OllamaClient, OpenAIClient from .partitioner import ( AnchorBFSPartitioner, diff --git a/graphgen/models/kg_builder/__init__.py b/graphgen/models/kg_builder/__init__.py index 1e7e2c44..330fd44f 100644 --- a/graphgen/models/kg_builder/__init__.py +++ b/graphgen/models/kg_builder/__init__.py @@ -1,2 +1,3 @@ from .light_rag_kg_builder import LightRAGKGBuilder from .mm_kg_builder import MMKGBuilder +from .mo_kg_builder import MOKGBuilder diff --git a/graphgen/models/kg_builder/mo_kg_builder.py b/graphgen/models/kg_builder/mo_kg_builder.py new file mode 100644 index 00000000..da466164 --- /dev/null +++ b/graphgen/models/kg_builder/mo_kg_builder.py @@ -0,0 +1,100 @@ +import re +from collections import defaultdict +from typing import Dict, List, Tuple + +from graphgen.bases import Chunk +from graphgen.templates import PROTEIN_KG_EXTRACTION_PROMPT +from graphgen.utils import ( + detect_main_language, + handle_single_entity_extraction, + handle_single_relationship_extraction, + logger, + split_string_by_multi_markers, +) + +from .light_rag_kg_builder import LightRAGKGBuilder + + +class MOKGBuilder(LightRAGKGBuilder): + @staticmethod + async def scan_document_for_schema( + chunk: Chunk, schema: Dict[str, List[str]] + ) -> Tuple[Dict[str, List[dict]], Dict[Tuple[str, str], List[dict]]]: + """ + Scan the document chunk to extract entities and relationships based on the provided schema. + :param chunk: The document chunk to be scanned. + :param schema: A dictionary defining the entities and relationships to be extracted. + :return: A tuple containing two dictionaries - one for entities and one for relationships. + """ + # TODO: use hard-coded PROTEIN_KG_EXTRACTION_PROMPT for protein chunks, + # support schema for other chunk types later + print(chunk.id, schema) + return {}, {} + + async def extract( + self, chunk: Chunk + ) -> Tuple[Dict[str, List[dict]], Dict[Tuple[str, str], List[dict]]]: + """ + Multi-Omics Knowledge Graph Builder + Step1: Extract and output a JSON object containing protein information from the given chunk. + Step2: Get more details about the protein by querying external databases if necessary. + Step3: Construct entities and relationships for the protein knowledge graph. + Step4: Return the entities and relationships. + :param chunk + :return: Tuple containing entities and relationships. + """ + # TODO: Implement the multi-omics KG extraction logic here + chunk_id = chunk.id + chunk_type = chunk.type # genome | protein | ... 
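+        # A "protein" chunk carries its parsed attributes in
+        # metadata["protein_caption"]; for the bundled demo file this looks like
+        # {"protein name": "harpin", "source organism": "Erwinia amylovora", ...}.
+        # Below, the caption is flattened into "key: value" lines and fed to
+        # PROTEIN_KG_EXTRACTION_PROMPT.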
+        metadata = chunk.metadata
+
+        nodes = defaultdict(list)
+        edges = defaultdict(list)
+
+        # choose different extraction strategies based on chunk type
+        if chunk_type == "protein":
+            protein_caption = ""
+            for key, value in metadata["protein_caption"].items():
+                protein_caption += f"{key}: {value}\n"
+            logger.debug("Protein chunk caption: %s", protein_caption)
+
+            language = detect_main_language(protein_caption)
+            prompt_template = PROTEIN_KG_EXTRACTION_PROMPT[language].format(
+                **PROTEIN_KG_EXTRACTION_PROMPT["FORMAT"],
+                input_text=protein_caption,
+            )
+            result = await self.llm_client.generate_answer(prompt_template)
+            logger.debug("Protein chunk extraction result: %s", result)
+
+            # parse the result
+            records = split_string_by_multi_markers(
+                result,
+                [
+                    PROTEIN_KG_EXTRACTION_PROMPT["FORMAT"]["record_delimiter"],
+                    PROTEIN_KG_EXTRACTION_PROMPT["FORMAT"]["completion_delimiter"],
+                ],
+            )
+
+            for record in records:
+                match = re.search(r"\((.*)\)", record)
+                if not match:
+                    continue
+                inner = match.group(1)
+
+                attributes = split_string_by_multi_markers(
+                    inner, [PROTEIN_KG_EXTRACTION_PROMPT["FORMAT"]["tuple_delimiter"]]
+                )
+
+                entity = await handle_single_entity_extraction(attributes, chunk_id)
+                if entity is not None:
+                    nodes[entity["entity_name"]].append(entity)
+                    continue
+
+                relation = await handle_single_relationship_extraction(
+                    attributes, chunk_id
+                )
+                if relation is not None:
+                    key = (relation["src_id"], relation["tgt_id"])
+                    edges[key].append(relation)
+
+        return dict(nodes), dict(edges)
diff --git a/graphgen/models/search/db/uniprot_search.py b/graphgen/models/search/db/uniprot_search.py
index daf42246..6bbf3f84 100644
--- a/graphgen/models/search/db/uniprot_search.py
+++ b/graphgen/models/search/db/uniprot_search.py
@@ -1,61 +1,117 @@
-import requests
-from fastapi import HTTPException
+from io import StringIO
+from typing import Dict, Optional
 
-from graphgen.utils import logger
+from Bio import ExPASy, SeqIO, SwissProt, UniProt
+from Bio.Blast import NCBIWWW, NCBIXML
 
-UNIPROT_BASE = "https://rest.uniprot.org/uniprotkb/search"
+from graphgen.utils import logger
 
 
 class UniProtSearch:
     """
     UniProt Search client to search with UniProt.
     1) Get the protein by accession number.
-    2) Search with keywords or protein names.
+    2) Search with keywords or protein names (fuzzy search).
     """
 
-    def get_entry(self, accession: str) -> dict:
+    def get_by_accession(self, accession: str) -> Optional[dict]:
+        try:
+            handle = ExPASy.get_sprot_raw(accession)
+            record = SwissProt.read(handle)
+            handle.close()
+            return self._swissprot_to_dict(record)
+        except Exception as exc:  # pylint: disable=broad-except
+            logger.error("Accession %s not found: %s", accession, exc)
+            return None
+
+    @staticmethod
+    def _swissprot_to_dict(record: SwissProt.Record) -> dict:
+        """
+        Convert a SwissProt.Record to a dictionary.
         """
-        Get the UniProt entry by accession number(e.g., P04637).
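+        # record.comments holds free-text annotations such as (illustrative):
+        #   "FUNCTION: Elicits the hypersensitive response in plants."
+        # Keep only the description text after the 9-character "FUNCTION:" tag.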
+ functions = [] + for line in record.comments: + if line.startswith("FUNCTION:"): + functions.append(line[9:].strip()) + + return { + "molecule_type": "protein", + "database": "UniProt", + "id": record.accessions[0], + "entry_name": record.entry_name, + "gene_names": record.gene_name, + "protein_name": record.description.split(";")[0].split("=")[-1], + "organism": record.organism.split(" (")[0], + "sequence": str(record.sequence), + "function": functions, + "url": f"https://www.uniprot.org/uniprot/{record.accessions[0]}", + } + + def get_best_hit(self, keyword: str) -> Optional[Dict]: """ - url = f"{UNIPROT_BASE}/{accession}.json" - return self._safe_get(url).json() - - def search( - self, - query: str, - *, - size: int = 10, - cursor: str = None, - fields: list[str] = None, - ) -> dict: + Search UniProt with a keyword and return the best hit. + :param keyword: The search keyword. + :return: A dictionary containing the best hit information or None if not found. """ - Search UniProt with a query string. - :param query: The search query. - :param size: The number of results to return. - :param cursor: The cursor for pagination. - :param fields: The fields to return in the response. - :return: A dictionary containing the search results. + if not keyword.strip(): + return None + + try: + iterator = UniProt.search(keyword, fields=None, batch_size=1) + hit = next(iterator, None) + if hit is None: + return None + return self.get_by_accession(hit["primaryAccession"]) + + except Exception as e: # pylint: disable=broad-except + logger.error("Keyword %s not found: %s", keyword, e) + return None + + def get_by_fasta(self, fasta_sequence: str, threshold: float) -> Optional[Dict]: """ - params = { - "query": query, - "size": size, - } - if cursor: - params["cursor"] = cursor - if fields: - params["fields"] = ",".join(fields) - url = UNIPROT_BASE - return self._safe_get(url, params=params).json() + Search UniProt with a FASTA sequence and return the best hit. + :param fasta_sequence: The FASTA sequence. + :param threshold: E-value threshold for BLAST search. + :return: A dictionary containing the best hit information or None if not found. 
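+        Example (illustrative; runs a live NCBI BLAST query, so results depend
+        on the remote service):
+            >>> hit = UniProtSearch().get_by_fasta("MALWMRLLPLLALLALWGPD", threshold=1e-3)
+            >>> hit is None or hit["molecule_type"] == "protein"
+            True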
+ """ + try: + if fasta_sequence.startswith(">"): + seq = str(list(SeqIO.parse(StringIO(fasta_sequence), "fasta"))[0].seq) + else: + seq = fasta_sequence.strip() + except Exception as e: # pylint: disable=broad-except + logger.error("Invalid FASTA sequence: %s", e) + return None - @staticmethod - def _safe_get(url: str, params: dict = None) -> requests.Response: - r = requests.get( - url, - params=params, - headers={"Accept": "application/json"}, - timeout=10, - ) - if not r.ok: - logger.error("Search engine error: %s", r.text) - raise HTTPException(r.status_code, "Search engine error.") - return r + if not seq: + logger.error("Empty FASTA sequence provided.") + return None + + # UniProtKB/Swiss-Prot BLAST API + try: + result_handle = NCBIWWW.qblast( + program="blastp", + database="swissprot", + sequence=seq, + hitlist_size=1, + expect=threshold, + ) + blast_record = NCBIXML.read(result_handle) + except Exception as e: # pylint: disable=broad-except + logger.error("BLAST search failed: %s", e) + return None + + if not blast_record.alignments: + logger.info("No BLAST hits found for the given sequence.") + return None + + best_alignment = blast_record.alignments[0] + best_hsp = best_alignment.hsps[0] + if best_hsp.expect > threshold: + logger.info("No BLAST hits below the threshold E-value.") + return None + hit_id = best_alignment.hit_id + + # like sp|P01308.1|INS_HUMAN + accession = hit_id.split("|")[1].split(".")[0] if "|" in hit_id else hit_id + return self.get_by_accession(accession) diff --git a/graphgen/operators/__init__.py b/graphgen/operators/__init__.py index 3e8e7ba9..ace334d6 100644 --- a/graphgen/operators/__init__.py +++ b/graphgen/operators/__init__.py @@ -1,4 +1,4 @@ -from .build_kg import build_mm_kg, build_text_kg +from .build_kg import build_kg from .generate import generate_qas from .init import init_llm from .judge import judge_statement diff --git a/graphgen/operators/build_kg/__init__.py b/graphgen/operators/build_kg/__init__.py index 70dac51b..18766fe6 100644 --- a/graphgen/operators/build_kg/__init__.py +++ b/graphgen/operators/build_kg/__init__.py @@ -1,2 +1 @@ -from .build_mm_kg import build_mm_kg -from .build_text_kg import build_text_kg +from .build_kg import build_kg diff --git a/graphgen/operators/build_kg/build_kg.py b/graphgen/operators/build_kg/build_kg.py new file mode 100644 index 00000000..89458268 --- /dev/null +++ b/graphgen/operators/build_kg/build_kg.py @@ -0,0 +1,73 @@ +from typing import List, Optional + +import gradio as gr + +from graphgen.bases import BaseLLMWrapper +from graphgen.bases.base_storage import BaseGraphStorage +from graphgen.bases.datatypes import Chunk +from graphgen.utils import logger + +from .build_mm_kg import build_mm_kg +from .build_mo_kg import build_mo_kg +from .build_text_kg import build_text_kg + + +async def build_kg( + llm_client: BaseLLMWrapper, + kg_instance: BaseGraphStorage, + chunks: List[Chunk], + anchor_type: Optional[str] = None, + progress_bar: gr.Progress = None, +): + """ + Build knowledge graph (KG) and merge into kg_instance + :param llm_client: Synthesizer LLM model to extract entities and relationships + :param kg_instance + :param chunks + :param anchor_type: get this type of information from chunks + :param progress_bar: Gradio progress bar to show the progress of the extraction + :return: + """ + + text_chunks = [chunk for chunk in chunks if chunk.type == "text"] + mm_chunks = [ + chunk + for chunk in chunks + if chunk.type in ("image", "video", "table", "formula") + ] + mo_chunks = [chunk for chunk in 
chunks if chunk.type in ("genome", "protein")]
+
+    if len(text_chunks) == 0:
+        logger.info("No text chunks to process")
+    else:
+        logger.info("[Text Entity and Relation Extraction] processing ...")
+        await build_text_kg(
+            llm_client=llm_client,
+            kg_instance=kg_instance,
+            chunks=text_chunks,
+            progress_bar=progress_bar,
+        )
+
+    if len(mm_chunks) == 0:
+        logger.info("No multi-modal chunks to process")
+    else:
+        logger.info("[Multi-modal Entity and Relation Extraction] processing ...")
+        await build_mm_kg(
+            llm_client=llm_client,
+            kg_instance=kg_instance,
+            chunks=mm_chunks,
+            progress_bar=progress_bar,
+        )
+
+    if len(mo_chunks) == 0:
+        logger.info("No multi-omics chunks to process")
+    else:
+        logger.info("[Multi-omics Entity and Relation Extraction] processing ...")
+        await build_mo_kg(
+            llm_client=llm_client,
+            kg_instance=kg_instance,
+            chunks=mo_chunks,
+            progress_bar=progress_bar,
+        )
+
+    return kg_instance
diff --git a/graphgen/operators/build_kg/build_mo_kg.py b/graphgen/operators/build_kg/build_mo_kg.py
new file mode 100644
index 00000000..046b1739
--- /dev/null
+++ b/graphgen/operators/build_kg/build_mo_kg.py
@@ -0,0 +1,58 @@
+from collections import defaultdict
+from typing import List
+
+import gradio as gr
+
+from graphgen.bases import BaseLLMWrapper
+from graphgen.bases.base_storage import BaseGraphStorage
+from graphgen.bases.datatypes import Chunk
+from graphgen.models import MOKGBuilder
+from graphgen.utils import run_concurrent
+
+
+async def build_mo_kg(
+    llm_client: BaseLLMWrapper,
+    kg_instance: BaseGraphStorage,
+    chunks: List[Chunk],
+    progress_bar: gr.Progress = None,
+):
+    """
+    Build multi-omics KG and merge into kg_instance. (Multi-Omics: genomics, proteomics, metabolomics, etc.)
+    :param llm_client: Synthesizer LLM model to extract entities and relationships
+    :param kg_instance
+    :param chunks
+    :param progress_bar: Gradio progress bar to show the progress of the extraction
+    :return:
+    """
+
+    mo_builder = MOKGBuilder(llm_client=llm_client)
+
+    results = await run_concurrent(
+        mo_builder.extract,
+        chunks,
+        desc="[2/4] Extracting entities and relationships from multi-omics chunks",
+        unit="chunk",
+        progress_bar=progress_bar,
+    )
+
+    nodes = defaultdict(list)
+    edges = defaultdict(list)
+    for n, e in results:
+        for k, v in n.items():
+            nodes[k].extend(v)
+        for k, v in e.items():
+            edges[tuple(sorted(k))].extend(v)
+
+    await run_concurrent(
+        lambda kv: mo_builder.merge_nodes(kv, kg_instance=kg_instance),
+        list(nodes.items()),
+        desc="Inserting entities into storage",
+    )
+
+    await run_concurrent(
+        lambda kv: mo_builder.merge_edges(kv, kg_instance=kg_instance),
+        list(edges.items()),
+        desc="Inserting relationships into storage",
+    )
+
+    return kg_instance
diff --git a/graphgen/operators/search/db/__init__.py b/graphgen/operators/search/db/__init__.py
deleted file mode 100644
index e69de29b..00000000
diff --git a/graphgen/operators/search/db/search_uniprot.py b/graphgen/operators/search/db/search_uniprot.py
deleted file mode 100644
index e69de29b..00000000
diff --git a/graphgen/operators/search/multi_omics_search.py b/graphgen/operators/search/multi_omics_search.py
new file mode 100644
index 00000000..fbe10f06
--- /dev/null
+++ b/graphgen/operators/search/multi_omics_search.py
@@ -0,0 +1,29 @@
+import re
+from typing import Dict, Optional
+
+from graphgen.models import UniProtSearch
+
+
+def _fetch_uniprot(entry: str) -> Optional[Dict]:
+    entry = entry.strip()
+    client = UniProtSearch()
+
+    # 1.
first try accession search + if re.fullmatch( + r"[OPQ][0-9][A-Z0-9]{3}[0-9]|[A-NR-Z][0-9]([A-Z][A-Z0-9]{2}[0-9]){1,2}", entry + ): + return client.get_by_accession(entry) + + # 2. then try keyword search + return client.get_best_hit(entry) + + +def multi_omics_search(entry: str) -> Dict: + """ + Multi-omics search function that tries to fetch protein/gene information. + """ + # TODO: Extend this function to include more omics databases as needed. + result = _fetch_uniprot(entry) + if result: + return {"input": entry, "uniprot": result} + return {"input": entry, "uniprot": None} diff --git a/graphgen/templates/__init__.py b/graphgen/templates/__init__.py index ea28c4d0..d9040089 100644 --- a/graphgen/templates/__init__.py +++ b/graphgen/templates/__init__.py @@ -1,3 +1,4 @@ +from .anchor import PROTEIN_ANCHOR_PROMPT from .coreference_resolution import COREFERENCE_RESOLUTION_PROMPT from .description_rephrasing import DESCRIPTION_REPHRASING_PROMPT from .generation import ( @@ -5,9 +6,15 @@ ATOMIC_GENERATION_PROMPT, COT_GENERATION_PROMPT, MULTI_HOP_GENERATION_PROMPT, + PROTEIN_QA_GENERATION_PROMPT, VQA_GENERATION_PROMPT, ) -from .kg import KG_EXTRACTION_PROMPT, KG_SUMMARIZATION_PROMPT, MMKG_EXTRACTION_PROMPT +from .kg import ( + KG_EXTRACTION_PROMPT, + KG_SUMMARIZATION_PROMPT, + MMKG_EXTRACTION_PROMPT, + PROTEIN_KG_EXTRACTION_PROMPT, +) from .question_generation import QUESTION_GENERATION_PROMPT from .search_judgement import SEARCH_JUDGEMENT_PROMPT from .statement_judgement import STATEMENT_JUDGEMENT_PROMPT diff --git a/graphgen/templates/anchor/__init__.py b/graphgen/templates/anchor/__init__.py new file mode 100644 index 00000000..37279413 --- /dev/null +++ b/graphgen/templates/anchor/__init__.py @@ -0,0 +1 @@ +from .protein_anchor import PROTEIN_ANCHOR_PROMPT diff --git a/graphgen/templates/anchor/protein_anchor.py b/graphgen/templates/anchor/protein_anchor.py new file mode 100644 index 00000000..3ea01372 --- /dev/null +++ b/graphgen/templates/anchor/protein_anchor.py @@ -0,0 +1,70 @@ +TEMPLATE_EN = """You are a biomedical entity extraction engine. +Given the text below, output **only** a JSON object that matches the following schema: + +{{ + "Protein accession or ID": string | null, + "Protein Name": string | null, + "Gene Name from literature": string | null, + "Source organism": string | null, + "Primary biological role": string | null, + "Subcellular localization (active site)": string | null, + "Inducible by": string | null, + "Post-translational modifications": string | null, + "Literature_Name": string | null, + "Experimental evidence": string | null, + "Catalytic activity:": string | null, + "Amino acids length:": int | null, + "Protein family:": string | null, + "Protein sequence:": string | null +}} + +Rules: +1. If the field cannot be found, return `null` rather than guessing. +2. Copy the exact sentence or phrase from the text; do not rephrase. +3. For boolean/number fields, convert strictly (e.g., length → integer). +4. Output **only** the JSON object, no additional words. + +Text: +=== +{chunk} +=== +""" + +TEMPLATE_ZH = """你是一个生物医学实体抽取引擎。 +根据以下文本,输出**仅**符合以下模式的JSON对象: + +```json +{ + "蛋白质登录号或ID": string | null, + "蛋白质名称": string | null, + "文献中的基因名称": string | null, + "来源生物体": string | null, + "主要生物学功能": string | null, + "亚细胞定位(活性位点)": string | null, + "诱导物": string | null, + "翻译后修饰": string | null, + "文献名称": string | null, + "实验依据": string | null, + "催化活性": string | null, + "氨基酸长度": int | null, + "蛋白质家族": string | null, + "蛋白质序列": string | null +} +``` + +规则: +1. 
如果找不到该字段,返回`null`而不是猜测。 +2. 直接复制文本中的句子或短语;不要改写。 +3. 对于布尔值/数字字段,严格转换(例如,长度→整数)。 +4. 输出**仅**为JSON对象,不要添加其他文字。 + +文本: +=== +{chunk} +=== +""" + +PROTEIN_ANCHOR_PROMPT = { + "en": TEMPLATE_EN, + "zh": TEMPLATE_ZH, +} diff --git a/graphgen/templates/generation/__init__.py b/graphgen/templates/generation/__init__.py index b58c2b6c..356ef83d 100644 --- a/graphgen/templates/generation/__init__.py +++ b/graphgen/templates/generation/__init__.py @@ -2,4 +2,5 @@ from .atomic_generation import ATOMIC_GENERATION_PROMPT from .cot_generation import COT_GENERATION_PROMPT from .multi_hop_generation import MULTI_HOP_GENERATION_PROMPT +from .protein_qa_generation import PROTEIN_QA_GENERATION_PROMPT from .vqa_generation import VQA_GENERATION_PROMPT diff --git a/graphgen/templates/generation/protein_qa_generation.py b/graphgen/templates/generation/protein_qa_generation.py new file mode 100644 index 00000000..d7447be7 --- /dev/null +++ b/graphgen/templates/generation/protein_qa_generation.py @@ -0,0 +1,95 @@ +# pylint: disable=C0301 +PROTEIN_QA_TEMPLATE_EN: str = """You are a senior computational biologist specializing in structural bioinformatics. Your task is to generate logically coherent, verifiable and non-hallucinated question-answer pairs for the given protein sample described by the provided ENTITIES and RELATIONSHIPS. +Use English as the output language. + +---Objectives--- +Create multiple sets of protein-centric QA pairs that satisfy the following: +1. Only ask about objectively existing facts in the provided data (e.g., residue numbers, secondary-structure elements, binding sites, catalytic residues, domain boundaries, metal ions, experimental method, etc.). Avoid subjective or speculative questions. +2. Ensure that each question has a single, clear and verifiable answer that can be directly confirmed from the given entities/relationships. +3. Questions should cover diverse aspects: sequence, structure, function, interactions, dynamics, thermodynamics, experimental annotations, etc. +4. Avoid repetitive questions; each question must be unique and meaningful. +5. Use concise, unambiguous language; do not invent information beyond the provided data. + +---Instructions--- +1. Carefully analyse the supplied ENTITIES and RELATIONSHIPS to identify: + - macromolecular entities (protein chains, domains, motifs, ligands, ions) + - structural attributes (helices, strands, loops, cis-peptide, disulfide bonds) + - functional annotations (active site, binding site, post-translational modification) + - experimental metadata (PDB ID, resolution, method, temperature, pH) + - causal or sequential relations (e.g., "Mg2+ binding stabilises loop L3") +2. Organise information logically: + - start with sequence/primary structure + - proceed to secondary/tertiary structure + - end with function, mechanism, and experimental context +3. Maintain scientific accuracy and consistent nomenclature (standard residue numbering, atom names, etc.). +4. Review each QA pair to guarantee logical consistency and absence of hallucination. + +################ +-ENTITIES- +################ +{entities} + +################ +-RELATIONSHIPS- +################ +{relationships} +################ +Directly output the generated QA pairs below. Do NOT copy any example questions, and do NOT include extraneous text. + +Question: +Answer: + +Question: +Answer: + +""" + +PROTEIN_QA_TEMPLATE_ZH: str = """你是一位资深的结构生物信息学计算生物学家。你的任务是根据下述提供的实体与关系,为给定的蛋白质样本生成逻辑连贯、可验证、无幻觉的中英双语问答对(这里仅输出中文)。 +使用中文作为输出语言。 + +---目标--- +创建多组以蛋白质为中心的问答对,满足: +1. 
仅询问数据中客观存在的事实(如残基编号、二级结构元件、结合位点、催化残基、结构域边界、金属离子、实验方法等),避免主观或推测性问题。 +2. 每个问题必须有单一、明确且可直接验证的答案,答案必须能从给定实体/关系中直接确认。 +3. 问题需覆盖:序列、结构、功能、相互作用、动力学、热力学、实验注释等多个维度,确保多样性与全面性。 +4. 避免重复提问,每个问题都独特且有意义。 +5. 语言简洁、无歧义,严禁编造超出给定数据的信息。 + +---说明--- +1. 仔细分析提供的实体与关系,识别: + - 大分子实体(蛋白链、结构域、模体、配体、离子) + - 结构属性(螺旋、折叠、环区、顺式肽键、二硫键) + - 功能注释(活性位点、结合位点、翻译后修饰) + - 实验元数据(PDB 编号、分辨率、方法、温度、pH) + - 因果或顺序关系(如“Mg²⁺ 结合稳定了环区 L3”) +2. 按逻辑顺序组织信息: + - 从序列/一级结构入手 + - 再到二级/三级结构 + - 最后到功能、机制及实验背景 +3. 保持科学准确性,使用统一命名规范(标准残基编号、原子名等)。 +4. 检查每对问答,确保逻辑一致且无幻觉。 + +################ +-实体- +################ +{entities} + +################ +-关系- +################ +{relationships} +################ +请直接在下方输出生成的问答对,不要复制任何示例,不要输出无关内容。 + +问题: <问题1> +答案: <答案1> + +问题: <问题2> +答案: <答案2> + +""" + +PROTEIN_QA_GENERATION_PROMPT = { + "en": PROTEIN_QA_TEMPLATE_EN, + "zh": PROTEIN_QA_TEMPLATE_ZH, +} diff --git a/graphgen/templates/kg/__init__.py b/graphgen/templates/kg/__init__.py index ea865ce6..d61fdc0b 100644 --- a/graphgen/templates/kg/__init__.py +++ b/graphgen/templates/kg/__init__.py @@ -1,3 +1,4 @@ from .kg_extraction import KG_EXTRACTION_PROMPT from .kg_summarization import KG_SUMMARIZATION_PROMPT from .mm_kg_extraction import MMKG_EXTRACTION_PROMPT +from .protein_kg_extraction import PROTEIN_KG_EXTRACTION_PROMPT diff --git a/graphgen/templates/kg/protein_kg_extraction.py b/graphgen/templates/kg/protein_kg_extraction.py new file mode 100644 index 00000000..f67917f7 --- /dev/null +++ b/graphgen/templates/kg/protein_kg_extraction.py @@ -0,0 +1,144 @@ +# pylint: disable=C0301 +TEMPLATE_EN: str = """You are an expert in protein science and knowledge-graph construction. +Your task is to extract a star-shaped knowledge graph centered on **a single protein** mentioned in the given text. + +-Goal- +Given free-text that discusses one or more proteins, identify: +1. The **central protein** (the first-mentioned protein or the protein explicitly indicated by the user). +2. All entities that are **directly related** to this central protein. +3. All relationships that **directly link** those entities to the central protein (star edges). + +Use English as the output language. + +-Steps- +1. Identify the **central protein entity** and all **directly-related entities** from the text. + For the **central protein**, extract: + - entity_name: use the full name or UniProt ID if given; capitalized. + - entity_type: always `protein`. + - entity_summary: concise description of its main biological role, location, or significance in the text. + + For each **directly-related entity**, extract: + - entity_name: capitalized. + - entity_type: one of [{entity_types}]. + - entity_summary: comprehensive summary of its attributes/activities **as stated in the text**. + + Format each entity as + ("entity"{tuple_delimiter}{tuple_delimiter}{tuple_delimiter}) + +2. From the entities found in Step 1, list every **(central protein → related entity)** pair that is **clearly related**. + For each pair extract: + - source_entity: the **central protein** name. + - target_entity: the related entity name. + - relationship_summary: short explanation of how the central protein is connected to this entity **according to the text**. + + Format each relationship as + ("relationship"{tuple_delimiter}{tuple_delimiter}{tuple_delimiter}) + +3. Output a single list of all entities and relationships from Steps 1–2, using **{record_delimiter}** as the delimiter. + +4. 
Finish by printing {completion_delimiter} + +################ +-Example- +################ +Text: +################ +The tumor-suppressor protein p53 is a transcription factor that responds to DNA damage. +Phosphorylation of p53 by ATM kinase at serine-15 enhances its stability. +MDM2, an E3 ubiquitin ligase, negatively regulates p53 via ubiquitination. +################ +Output: +("entity"{tuple_delimiter}"p53"{tuple_delimiter}"protein"{tuple_delimiter}"Tumor-suppressor transcription factor that responds to DNA damage and is regulated by post-translational modifications."){record_delimiter} +("entity"{tuple_delimiter}"ATM kinase"{tuple_delimiter}"protein"{tuple_delimiter}"Protein kinase that phosphorylates p53 at serine-15, thereby enhancing p53 stability."){record_delimiter} +("entity"{tuple_delimiter}"serine-15"{tuple_delimiter}"site"{tuple_delimiter}"Phosphorylation site on p53 that is targeted by ATM kinase."){record_delimiter} +("entity"{tuple_delimiter}"MDM2"{tuple_delimiter}"protein"{tuple_delimiter}"E3 ubiquitin ligase that negatively regulates p53 through ubiquitination."){record_delimiter} +("entity"{tuple_delimiter}"DNA damage"{tuple_delimiter}"concept"{tuple_delimiter}"Cellular stress signal that activates p53-mediated transcriptional response."){record_delimiter} +("relationship"{tuple_delimiter}"p53"{tuple_delimiter}"ATM kinase"{tuple_delimiter}"ATM kinase phosphorylates p53, enhancing its stability."){record_delimiter} +("relationship"{tuple_delimiter}"p53"{tuple_delimiter}"serine-15"{tuple_delimiter}"p53 is phosphorylated at serine-15 by ATM kinase."){record_delimiter} +("relationship"{tuple_delimiter}"p53"{tuple_delimiter}"MDM2"{tuple_delimiter}"MDM2 ubiquitinates p53, negatively regulating its activity."){record_delimiter} +("relationship"{tuple_delimiter}"p53"{tuple_delimiter}"DNA damage"{tuple_delimiter}"p53 acts as a sensor-transcription factor in response to DNA damage."){completion_delimiter} + +################ +-Real Data- +Entity_types: {entity_types} +Text: {input_text} +################ +Output: +""" + + +TEMPLATE_ZH: str = """您是蛋白质科学与知识图谱构建专家。 +任务:从给定文本中抽取以**一个中心蛋白质**为核心的星型知识图谱。 + +-目标- +文本可能提及一个或多个蛋白质,请: +1. 确定**中心蛋白质**(文本首个提及或用户指定的蛋白)。 +2. 识别所有与中心蛋白**直接相关**的实体。 +3. 仅保留**中心蛋白→相关实体**的直接关系(星型边)。 + +使用中文输出。 + +-步骤- +1. 确定**中心蛋白质实体**及所有**直接相关实体**。 + 对于**中心蛋白质**: + - entity_name:全名或UniProt ID,首字母大写。 + - entity_type:固定为`protein`。 + - entity_summary:简述其在文中的生物学功能、定位或意义。 + + 对于每个**直接相关实体**: + - entity_name:首字母大写。 + - entity_type:可选类型[{entity_types}]。 + - entity_summary:全面总结其在文中与中心蛋白相关的属性/活动。 + + 格式:("entity"{tuple_delimiter}{tuple_delimiter}{tuple_delimiter}) + +2. 在步骤1的实体中,列出所有**(中心蛋白→相关实体)**的明显关系对。 + 每对提取: + - source_entity:中心蛋白名称。 + - target_entity:相关实体名称。 + - relationship_summary:简要说明文中二者如何直接关联。 + + 格式:("relationship"{tuple_delimiter}{tuple_delimiter}{tuple_delimiter}) + +3. 将步骤1–2的所有实体与关系合并为单列表,用**{record_delimiter}**分隔。 + +4. 
输出结束标记{completion_delimiter} + +################ +-示例- +################ +文本: +################ +肿瘤抑制蛋白p53是一种转录因子,可响应DNA损伤。ATM激酶在第15位丝氨酸磷酸化p53,增强其稳定性。E3泛素连接酶MDM2通过泛素化负调控p53。 +################ +输出: +("entity"{tuple_delimiter}"p53"{tuple_delimiter}"protein"{tuple_delimiter}"肿瘤抑制转录因子,能感知DNA损伤并通过翻译后修饰被调控。"){record_delimiter} +("entity"{tuple_delimiter}"ATM激酶"{tuple_delimiter}"protein"{tuple_delimiter}"蛋白激酶,在丝氨酸-15位点磷酸化p53,从而提高其稳定性。"){record_delimiter} +("entity"{tuple_delimiter}"丝氨酸-15"{tuple_delimiter}"site"{tuple_delimiter}"p53上被ATM激酶靶向的磷酸化位点。"){record_delimiter} +("entity"{tuple_delimiter}"MDM2"{tuple_delimiter}"protein"{tuple_delimiter}"E3泛素连接酶,通过泛素化负调控p53。"){record_delimiter} +("entity"{tuple_delimiter}"DNA损伤"{tuple_delimiter}"concept"{tuple_delimiter}"细胞内应激信号,可激活p53介导的转录应答。"){record_delimiter} +("relationship"{tuple_delimiter}"p53"{tuple_delimiter}"ATM激酶"{tuple_delimiter}"ATM激酶磷酸化p53,增强其稳定性。"){record_delimiter} +("relationship"{tuple_delimiter}"p53"{tuple_delimiter}"丝氨酸-15"{tuple_delimiter}"p53在该位点被ATM激酶磷酸化。"){record_delimiter} +("relationship"{tuple_delimiter}"p53"{tuple_delimiter}"MDM2"{tuple_delimiter}"MDM2对p53进行泛素化,负向调控其活性。"){record_delimiter} +("relationship"{tuple_delimiter}"p53"{tuple_delimiter}"DNA损伤"{tuple_delimiter}"p53作为感受器-转录因子响应DNA损伤。"){completion_delimiter} + +################ +-真实数据- +实体类型:{entity_types} +文本:{input_text} +################ +输出: +""" + + +PROTEIN_KG_EXTRACTION_PROMPT: dict = { + "en": TEMPLATE_EN, + "zh": TEMPLATE_ZH, + "FORMAT": { + "tuple_delimiter": "<|>", + "record_delimiter": "##", + "completion_delimiter": "<|COMPLETE|>", + "entity_types": "protein, gene, site, modification, pathway, disease, drug, organism, tissue, cell_line, " + "experiment, technology, concept, location, organization, person, mission, science", + }, +} diff --git a/requirements.txt b/requirements.txt index 82740f03..223dfb52 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,28 +1,37 @@ -tqdm -openai -python-dotenv -numpy -networkx +tqdm~=4.67.1 +openai~=1.99.1 +python-dotenv~=1.0.1 +numpy~=2.2.6 +networkx~=3.4.2 graspologic -tiktoken -pyecharts -wikipedia -tenacity -nltk -jieba -plotly -pandas +tiktoken~=0.8.0 +pyecharts~=2.0.7 +wikipedia~=1.4.0 +tenacity~=9.0.0 +nltk~=3.9.1 +jieba~=0.42.1 +plotly~=5.24.1 +pandas~=2.2.3 gradio>=5.44.1 kaleido -pyyaml -langcodes -requests -fastapi -trafilatura +pyyaml~=6.0.2 +langcodes~=3.4.1 +requests~=2.32.4 +fastapi~=0.115.6 +trafilatura~=2.0.0 +gensim~=4.4.3 -leidenalg -igraph +leidenalg~=0.10.2 +igraph~=0.11.9 python-louvain # For visualization -matplotlib +matplotlib~=3.10.7 + +pytest~=8.4.1 +rich~=13.9.4 +aiohttp~=3.12.9 +biopython~=1.85 +transformers~=4.57.1 +torch~=2.8.0 +setuptools~=75.1.0 \ No newline at end of file diff --git a/resources/input_examples/protein_qa_demo.json b/resources/input_examples/protein_qa_demo.json new file mode 100644 index 00000000..49199f37 --- /dev/null +++ b/resources/input_examples/protein_qa_demo.json @@ -0,0 +1,64 @@ +[ + { + "type": "text", + "content": "The $4 4 - \\mathbf { k D }$ protein, named harpin, was electroeluted from a preparative SDS-polyacrylamide gel (12). At concentrations ${ \\ge } 5 0 0 \\mathbf { n } \\mathbf { M }$ $( \\geq 2 5 ~ | \\mathbf { \\mu } \\mathbf { g } / \\mathbf { m l } )$ , harpin elicited HR in leaves of tobacco (Fig. 2, sectors 6 and " + }, + { + "type": "text", + "content": "Because supernatants from E. amylovora Ea321(pCPP430) or E. coli DH5α (pCPP430) did not elicit HR, we postulated that harpin was not secreted but rather was present in or on the bacteria. 
Whole bacteria treated with protease failed to elicit HR, whereas bacteria incubated with protease together with $0 . 5 ~ \\mathrm { m M }$ phenylmethylsulfonyl fluoride (PMSF, a protease inhibitor) did (Table 1). Treatment of bacteria with increasing amounts of protease resulted in a decreased ability to elicit HR that correlated with the disappearance of harpin detectable in SDS-polyacrylamide gels (Table 1). After centrifugation of CFEP at $_ { 1 0 5 , 0 0 0 g }$ for 1 hour, most HR-eliciting activity was found in the supernatant. However, when the cell suspension was brought to $3 0 \\mathrm { \\ m M \\ M g C l } _ { 2 }$ ,before sonication, most activity was associated with the sedimented membrane fraction. Gel-permeation chromatography of unheated CFEP also indicated association of the elicitor with a high molecular weight $( > 1 0 ^ { 6 }$ daltons) fraction, probably membrane vesicles (14). Only the membrane fraction of E. amylovora Ea321(pCPP430) reacted with an antiserum raised in response to harpin (15), further supporting the cell-envelope location of harpin (Fig. 4). " + }, + { + "type": "text", + "content": "HR-eliciting activity, harpin was not detected. However, when the protease inhibitor PMSF $( 0 . 5 \\mathrm { \\ m M } )$ was included, the bacteria retained HR-eliciting activity and possessed detectable harpin for more than 2 hours. More protease was required per cell to destroy harpin produced by E. coli $\\mathsf { D H S } \\alpha ( \\mathsf { p C P P } 4 3 0 )$ than by Ea321(pCPP430), suggesting that E. coli $\\mathsf { D H } 5 \\alpha ( \\mathsf { p C P P } 4 3 0 )$ produces more harpin or degrades it more slowly, or both. " + }, + { + "type": "text", + "content": "The ability of bacterial strains to elicit the HR in intact tobacco leaves is related genetically to their ability to elicit a $\\mathbf { K } ^ { + } / \\mathbf { H } ^ { + }$ exchange reaction (XR) in tobacco cell suspension cultures (TCSCs) (16); both reactions require the hrp gene cluster (17). " + }, + { + "type": "text", + "content": "We tested the ability of harpin to raise the pH of TCSC bathing solution, an indicator of the XR (Fig. 5). Cells of E. amylovora, grown in rich medium and added to TCSCs caused an increase in pH of the bathing solution after 2 to 3 hours. Addition of purified harpin caused an increase in pH within 1 hour. Erwinia amylovora mutant Ea321K49, which did not produce harpin in culture, and strains of E. coli containing mutated hrp gene clusters failed to elicit the XR. " + }, + { + "type": "text", + "content": "Table 1. Protease sensitivity of the HR-eliciting activity of whole cells of E. amylovora Ea321(pCPP430). Cells were grown in LB medium, harvested'by centrifugation, and resuspended in 0.1 volume of $5 m M$ potassium phosphate $( \\mathsf { p H } \\thinspace 6 . 5 )$ containing tetracycline (40 $\\mu { \\sf g } / { \\sf m } 1 )$ . After incubation with protease (Sigma P5147), as indicated, at $\\mathfrak { s } 7 ^ { \\circ } \\mathfrak { C }$ for 5 min, $1 0 0 ~ \\mu !$ of each cell suspension was infiltrated into tobacco leaves. Leaf sector collapse was assayed at 24 hours. At the time of infiltration, portions of protease-treated cell mixtures were iysed, held'in boiling water for 10 min, centrifuged for 10 min at $1 2 . 0 0 0 g .$ and electrophoresed on a $10 \\%$ SDS-polyacrylamide gel to detect harpin. Electrophoresis was done for 2 hours at $1 5 m \\mathsf { A }$ followed by staining with Coomassie blue R-250. 
Cell-free supernatant, produced from the LB culture, was filter-sterilized and then concentrated with the Centriprep-10 (Amicon, Danvers, Massachusetts). " + }, + { + "type": "table", + "img_path": "resources/input_examples/images/0f25783fdfa99042db274ba9f6b3064cf17c5435814edfbee42ae6b19aac37d2.jpg", + "table_caption": [], + "table_footnote": [], + "table_body": "
<table><tr><td>Protease per milliliter</td><td>Tissue collapse</td><td>Harpin detected</td></tr><tr><td>0</td><td>+</td><td>+</td></tr><tr><td>5 μg</td><td>+</td><td>+</td></tr><tr><td>10 μg</td><td>+</td><td>+</td></tr><tr><td>20 μg</td><td>Weak</td><td>+</td></tr><tr><td>40 μg</td><td>-</td><td></td></tr><tr><td>80 μg</td><td></td><td></td></tr><tr><td>80 μg + 0.5 mM PMSF</td><td>+</td><td>+</td></tr><tr><td>Cell-free supernatant</td><td></td><td></td></tr></table>
" + }, + { + "type": "text", + "content": "expressed fom pCPP1084 in the T7RNA (20). Insertions of Tn5tac1 in hrpN (21) (Fig. 1) abolished the ability of E. coli $\\mathsf { D H } 5 \\alpha ( \\mathsf { p C P P } 4 3 0 )$ to elicit HR on tobacco or produce harpin detectable on Western blots. Ea321T5, a derivative of E. amylo" + }, + { + "type": "text", + "content": "at $\\pmb { 1 0 0 ^ { \\circ } } \\pmb { \\mathbb { C } }$ for 10 min; 8, CFEP $( 5 ~ \\mu 9 )$ from E. coli DH5a(pCPP430) treated at $1 0 0 ^ { \\circ } \\mathsf { C }$ for 10 min; 9, CFEP $( 5 ~ \\mu 9 )$ from E. amylovora Ea321K49 treated at $_ { 1 0 0 ^ { \\circ } \\mathbb { C } }$ for 10 min. Samples from the preparations in lanes 3, 4, 7, and 8 elicited HR in tobacco leaves. Samples were prepared as described (8) and brought to 125 mM tris-HCI $( \\mathsf { p H } 6 . 8 )$ $4 \\%$ SDS, $20 \\%$ glycerol, boiled for 3 min, then electrophoresed through a $10 \\%$ (w/v) polyacrylamide gel with $0 . 1 \\%$ SDS at $1 5 m A$ for 2 hours in a Mighty Small apparatus according to instructions (Hoefer Scientific Instruments, San Francisco, California). The gel was stained with $0 . 0 2 5 \\%$ Coomassie Blue R-250. Low-range prestained molecular weight standards (Bio-Rad 161-0305) were used and calibrated with an unstained protein marker (Bio-Rad 161-0304). Arrow indicates region corresponding to $4 4 \\ k \\mathsf$ " + }, + { + "type": "text", + "content": "DNA sequence data from the $1 . 3 – \\mathbf { k } \\mathbf { b }$ Hind II fragment revealed that hrpN is 1155 base pairs long, and it encodes a 385–amino acid protein (Fig. 1). The 15 $\\mathrm { N H } _ { 2 }$ -terminal residues revealed by amino acid sequencing corresponded to those deduced from the DNA' sequence (Fig. 1). The deduced amino acid sequence of harpin (Fig. 1), which corresponded closely with the analyzed amino acid composition, reveals a glycine-rich protein with a high degree of hydrophilicity. It appears to have an open structure, which may explain its heat stability and sensitivity to proteases. A FASTA search (23) of GenBank for similar proteins revealed similarity only with other glycine-rich proteins, such as several plant cell wall proteins and keratins. " + }, + { + "type": "protein", + "protein_caption": { + "protein name": "harpin", + "gene name from literature": "hrpN", + "source organism": "Erwinia amylovora", + "primary biological role": "elicitor of the plant defense reaction known as the hypersensitive response", + "subcellular localization (active site)": "cell-envelope-associated", + "inducible by": null, + "post-translational modifications": null, + "literature_name": "Harpin, Elicitor of the Hypersensitive Response Produced by the Plant Pathogen Erwinia amylovora", + "experimental evidence": "Harpin caused tobacco leaf lamina to collapse and caused an increase in the pH of bathing solutions of suspension-cultured tobacco cells.", + "gene names from uniprot database:": null, + "catalytic activity:": null, + "amino acids length:": 385, + "protein family:": null, + "protein sequence:": null + } + } +] diff --git a/scripts/generate/generate_protein_qa.sh b/scripts/generate/generate_protein_qa.sh new file mode 100644 index 00000000..2c1a7384 --- /dev/null +++ b/scripts/generate/generate_protein_qa.sh @@ -0,0 +1,3 @@ +python3 -m graphgen.generate \ +--config_file graphgen/configs/protein_qa_config.yaml \ +--output_dir cache/