From c8ada4cdbe396cfac377ec9a173219bf2ffb69be Mon Sep 17 00:00:00 2001 From: chenzihong-gavin Date: Thu, 23 Oct 2025 20:52:57 +0800 Subject: [PATCH 1/8] wip: add protein_qa generation --- graphgen/configs/protein_qa_config.yaml | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) create mode 100644 graphgen/configs/protein_qa_config.yaml diff --git a/graphgen/configs/protein_qa_config.yaml b/graphgen/configs/protein_qa_config.yaml new file mode 100644 index 0000000..69d08b3 --- /dev/null +++ b/graphgen/configs/protein_qa_config.yaml @@ -0,0 +1,18 @@ +read: + input_file: resources/input_examples/protein_demo.json # input file path, support json, jsonl, txt, pdf. See resources/input_examples for examples +split: + chunk_size: 1024 # chunk size for text splitting + chunk_overlap: 100 # chunk overlap for text splitting +search: # web search configuration + enabled: false # whether to enable web search + search_types: ["google"] # search engine types, support: google, bing, uniprot, wikipedia +quiz_and_judge: # quiz and test whether the LLM masters the knowledge points + enabled: false +partition: # graph partition configuration + method: anchor_bfs # partition method + method_params: + anchor_type: protein # node type to select anchor nodes + max_units_per_community: 10 # atomic partition, one node or edge per community +generate: + mode: protein_qa # atomic, aggregated, multi_hop, cot, vqa + data_format: ChatML # Alpaca, Sharegpt, ChatML From 5d5012a65c8c4130ee4d93d5be326dad659bffb0 Mon Sep 17 00:00:00 2001 From: chenzihong-gavin Date: Fri, 24 Oct 2025 13:50:01 +0800 Subject: [PATCH 2/8] refactor: refactor build_kg process --- graphgen/graphgen.py | 173 +++++++------------- graphgen/models/kg_builder/mo_kg_builder.py | 23 +++ graphgen/operators/__init__.py | 2 +- graphgen/operators/build_kg/__init__.py | 3 +- graphgen/operators/build_kg/build_kg.py | 52 ++++++ graphgen/operators/build_kg/build_mo_kg.py | 29 ++++ 6 files changed, 161 insertions(+), 121 deletions(-) create mode 100644 graphgen/models/kg_builder/mo_kg_builder.py create mode 100644 graphgen/operators/build_kg/build_kg.py create mode 100644 graphgen/operators/build_kg/build_mo_kg.py diff --git a/graphgen/graphgen.py b/graphgen/graphgen.py index 8b0559d..2000601 100644 --- a/graphgen/graphgen.py +++ b/graphgen/graphgen.py @@ -1,7 +1,6 @@ import asyncio import os import time -from dataclasses import dataclass from typing import Dict, cast import gradio as gr @@ -16,8 +15,7 @@ Tokenizer, ) from graphgen.operators import ( - build_mm_kg, - build_text_kg, + build_kg, chunk_documents, generate_qas, judge_statement, @@ -31,26 +29,26 @@ sys_path = os.path.abspath(os.path.join(os.path.dirname(__file__), "..")) -@dataclass class GraphGen: - unique_id: int = int(time.time()) - working_dir: str = os.path.join(sys_path, "cache") - - # llm - tokenizer_instance: Tokenizer = None - synthesizer_llm_client: OpenAIClient = None - trainee_llm_client: OpenAIClient = None - - # webui - progress_bar: gr.Progress = None - - def __post_init__(self): - self.tokenizer_instance: Tokenizer = self.tokenizer_instance or Tokenizer( + def __init__( + self, + unique_id: int = int(time.time()), + working_dir: str = os.path.join(sys_path, "cache"), + tokenizer_instance: Tokenizer = None, + synthesizer_llm_client: OpenAIClient = None, + trainee_llm_client: OpenAIClient = None, + progress_bar: gr.Progress = None, + ): + self.unique_id = unique_id + self.working_dir = working_dir + + # llm + self.tokenizer_instance: Tokenizer = tokenizer_instance or Tokenizer( 
model_name=os.getenv("TOKENIZER_MODEL") ) self.synthesizer_llm_client: OpenAIClient = ( - self.synthesizer_llm_client + synthesizer_llm_client or OpenAIClient( model_name=os.getenv("SYNTHESIZER_MODEL"), api_key=os.getenv("SYNTHESIZER_API_KEY"), @@ -59,7 +57,7 @@ def __post_init__(self): ) ) - self.trainee_llm_client: OpenAIClient = self.trainee_llm_client or OpenAIClient( + self.trainee_llm_client: OpenAIClient = trainee_llm_client or OpenAIClient( model_name=os.getenv("TRAINEE_MODEL"), api_key=os.getenv("TRAINEE_API_KEY"), base_url=os.getenv("TRAINEE_BASE_URL"), @@ -86,6 +84,9 @@ def __post_init__(self): namespace="qa", ) + # webui + self.progress_bar: gr.Progress = progress_bar + @async_to_sync_method async def insert(self, read_config: Dict, split_config: Dict): """ @@ -104,109 +105,45 @@ async def insert(self, read_config: Dict, split_config: Dict): new_docs = {compute_mm_hash(doc, prefix="doc-"): doc for doc in data} _add_doc_keys = await self.full_docs_storage.filter_keys(list(new_docs.keys())) new_docs = {k: v for k, v in new_docs.items() if k in _add_doc_keys} - new_text_docs = {k: v for k, v in new_docs.items() if v.get("type") == "text"} - new_mm_docs = {k: v for k, v in new_docs.items() if v.get("type") != "text"} - - await self.full_docs_storage.upsert(new_docs) - - async def _insert_text_docs(text_docs): - if len(text_docs) == 0: - logger.warning("All text docs are already in the storage") - return - logger.info("[New Docs] inserting %d text docs", len(text_docs)) - # Step 2.1: Split chunks and filter existing ones - inserting_chunks = await chunk_documents( - text_docs, - split_config["chunk_size"], - split_config["chunk_overlap"], - self.tokenizer_instance, - self.progress_bar, - ) - _add_chunk_keys = await self.chunks_storage.filter_keys( - list(inserting_chunks.keys()) - ) - inserting_chunks = { - k: v for k, v in inserting_chunks.items() if k in _add_chunk_keys - } - - if len(inserting_chunks) == 0: - logger.warning("All text chunks are already in the storage") - return - - logger.info("[New Chunks] inserting %d text chunks", len(inserting_chunks)) - await self.chunks_storage.upsert(inserting_chunks) - - # Step 2.2: Extract entities and relations from text chunks - logger.info("[Text Entity and Relation Extraction] processing ...") - _add_entities_and_relations = await build_text_kg( - llm_client=self.synthesizer_llm_client, - kg_instance=self.graph_storage, - chunks=[ - Chunk(id=k, content=v["content"], type="text") - for k, v in inserting_chunks.items() - ], - progress_bar=self.progress_bar, - ) - if not _add_entities_and_relations: - logger.warning("No entities or relations extracted from text chunks") - return - - await self._insert_done() - return _add_entities_and_relations - - async def _insert_multi_modal_docs(mm_docs): - if len(mm_docs) == 0: - logger.warning("No multi-modal documents to insert") - return - - logger.info("[New Docs] inserting %d multi-modal docs", len(mm_docs)) - - # Step 3.1: Transform multi-modal documents into chunks and filter existing ones - inserting_chunks = await chunk_documents( - mm_docs, - split_config["chunk_size"], - split_config["chunk_overlap"], - self.tokenizer_instance, - self.progress_bar, - ) + if len(new_docs) == 0: + logger.warning("All documents are already in the storage") + return - _add_chunk_keys = await self.chunks_storage.filter_keys( - list(inserting_chunks.keys()) - ) - inserting_chunks = { - k: v for k, v in inserting_chunks.items() if k in _add_chunk_keys - } + inserting_chunks = await chunk_documents( + new_docs, 
+ split_config["chunk_size"], + split_config["chunk_overlap"], + self.tokenizer_instance, + self.progress_bar, + ) - if len(inserting_chunks) == 0: - logger.warning("All multi-modal chunks are already in the storage") - return + _add_chunk_keys = await self.chunks_storage.filter_keys( + list(inserting_chunks.keys()) + ) + inserting_chunks = { + k: v for k, v in inserting_chunks.items() if k in _add_chunk_keys + } - logger.info( - "[New Chunks] inserting %d multimodal chunks", len(inserting_chunks) - ) - await self.chunks_storage.upsert(inserting_chunks) - - # Step 3.2: Extract multi-modal entities and relations from chunks - logger.info("[Multi-modal Entity and Relation Extraction] processing ...") - _add_entities_and_relations = await build_mm_kg( - llm_client=self.synthesizer_llm_client, - kg_instance=self.graph_storage, - chunks=[Chunk.from_dict(k, v) for k, v in inserting_chunks.items()], - progress_bar=self.progress_bar, - ) - if not _add_entities_and_relations: - logger.warning( - "No entities or relations extracted from multi-modal chunks" - ) - return - await self._insert_done() - return _add_entities_and_relations - - # Step 2: Insert text documents - await _insert_text_docs(new_text_docs) - # Step 3: Insert multi-modal documents - await _insert_multi_modal_docs(new_mm_docs) + if len(inserting_chunks) == 0: + logger.warning("All chunks are already in the storage") + return + + logger.info("[New Chunks] inserting %d chunks", len(inserting_chunks)) + await self.chunks_storage.upsert(inserting_chunks) + + _add_entities_and_relations = await build_kg( + llm_client=self.synthesizer_llm_client, + kg_instance=self.graph_storage, + chunks=[Chunk.from_dict(k, v) for k, v in inserting_chunks.items()], + progress_bar=self.progress_bar, + ) + if not _add_entities_and_relations: + logger.warning("No entities or relations extracted from text chunks") + return + + await self._insert_done() + return _add_entities_and_relations async def _insert_done(self): tasks = [] diff --git a/graphgen/models/kg_builder/mo_kg_builder.py b/graphgen/models/kg_builder/mo_kg_builder.py new file mode 100644 index 0000000..dfbb44c --- /dev/null +++ b/graphgen/models/kg_builder/mo_kg_builder.py @@ -0,0 +1,23 @@ +from typing import Dict, List, Tuple + +from graphgen.bases import Chunk + +from .light_rag_kg_builder import LightRAGKGBuilder + + +class MOKGBuilder(LightRAGKGBuilder): + async def extract( + self, chunk: Chunk + ) -> Tuple[Dict[str, List[dict]], Dict[Tuple[str, str], List[dict]]]: + """ + Multi-Omics Knowledge Graph Builder + Step1: Extract and output a JSON object containing protein information from the given chunk. + Step2: Get more details about the protein by querying external databases if necessary. + Step3: Construct entities and relationships for the protein knowledge graph. + Step4: Return the entities and relationships. + :param chunk + :return: Tuple containing entities and relationships. 
+ """ + # TODO: Implement the multi-omics KG extraction logic here + print(chunk) + return {}, {} diff --git a/graphgen/operators/__init__.py b/graphgen/operators/__init__.py index 2ad37e6..88c3149 100644 --- a/graphgen/operators/__init__.py +++ b/graphgen/operators/__init__.py @@ -1,4 +1,4 @@ -from .build_kg import build_mm_kg, build_text_kg +from .build_kg import build_kg from .generate import generate_qas from .judge import judge_statement from .partition import partition_kg diff --git a/graphgen/operators/build_kg/__init__.py b/graphgen/operators/build_kg/__init__.py index 70dac51..18766fe 100644 --- a/graphgen/operators/build_kg/__init__.py +++ b/graphgen/operators/build_kg/__init__.py @@ -1,2 +1 @@ -from .build_mm_kg import build_mm_kg -from .build_text_kg import build_text_kg +from .build_kg import build_kg diff --git a/graphgen/operators/build_kg/build_kg.py b/graphgen/operators/build_kg/build_kg.py new file mode 100644 index 0000000..bce28df --- /dev/null +++ b/graphgen/operators/build_kg/build_kg.py @@ -0,0 +1,52 @@ +from typing import List + +import gradio as gr + +from graphgen.bases.base_storage import BaseGraphStorage +from graphgen.bases.datatypes import Chunk +from graphgen.models import OpenAIClient +from graphgen.utils import logger + +from .build_mm_kg import build_mm_kg +from .build_text_kg import build_text_kg + + +async def build_kg( + llm_client: OpenAIClient, + kg_instance: BaseGraphStorage, + chunks: List[Chunk], + progress_bar: gr.Progress = None, +): + """ + Build knowledge graph (KG) and merge into kg_instance + :param llm_client: Synthesizer LLM model to extract entities and relationships + :param kg_instance + :param chunks + :param progress_bar: Gradio progress bar to show the progress of the extraction + :return: + """ + + text_chunks = [chunk for chunk in chunks if chunk.type == "text"] + mm_chunks = [chunk for chunk in chunks if chunk.type != "text"] + + if len(text_chunks) == 0: + logger.info("All text chunks are already in the storage") + else: + logger.info("[Text Entity and Relation Extraction] processing ...") + await build_text_kg( + llm_client=llm_client, + kg_instance=kg_instance, + chunks=text_chunks, + progress_bar=progress_bar, + ) + if len(mm_chunks) == 0: + logger.info("All multi-modal chunks are already in the storage") + else: + logger.info("[Multi-modal Entity and Relation Extraction] processing ...") + await build_mm_kg( + llm_client=llm_client, + kg_instance=kg_instance, + chunks=mm_chunks, + progress_bar=progress_bar, + ) + return kg_instance diff --git a/graphgen/operators/build_kg/build_mo_kg.py b/graphgen/operators/build_kg/build_mo_kg.py new file mode 100644 index 0000000..4698599 --- /dev/null +++ b/graphgen/operators/build_kg/build_mo_kg.py @@ -0,0 +1,29 @@ +from typing import List + +import gradio as gr + +from graphgen.bases.base_storage import BaseGraphStorage +from graphgen.bases.datatypes import Chunk +from graphgen.models import OpenAIClient + + +async def build_mo_kg( + llm_client: OpenAIClient, + kg_instance: BaseGraphStorage, + chunks: List[Chunk], + progress_bar: gr.Progress = None, +): + """ + Build multi-omics KG and merge into kg_instance. (Multi-Omics: genomics, proteomics, metabolomics, etc.) 
+ :param llm_client: Synthesizer LLM model to extract entities and relationships + :param kg_instance + :param chunks + :param progress_bar: Gradio progress bar to show the progress of the extraction + :return: + """ + # TODO: implement multi-omics KG building logic here + print("llm_client:", llm_client) + print("kg_instance:", kg_instance) + print("chunks:", chunks) + print("progress_bar:", progress_bar) + return kg_instance From e783736f0c2889e4a858cfa74287c68c3e484a2f Mon Sep 17 00:00:00 2001 From: chenzihong-gavin Date: Fri, 24 Oct 2025 19:11:46 +0800 Subject: [PATCH 3/8] wip: add protein qa pipeline --- graphgen/bases/datatypes.py | 2 +- graphgen/configs/protein_qa_config.yaml | 3 +- graphgen/graphgen.py | 21 +- graphgen/operators/build_kg/build_kg.py | 18 +- graphgen/operators/build_kg/build_mo_kg.py | 103 +++++++++- graphgen/operators/build_kg/search_mo.py | 182 ++++++++++++++++++ graphgen/templates/__init__.py | 9 +- graphgen/templates/anchor/__init__.py | 1 + graphgen/templates/anchor/protein_anchor.py | 70 +++++++ graphgen/templates/generation/__init__.py | 1 + .../generation/protein_qa_generation.py | 95 +++++++++ graphgen/templates/kg/__init__.py | 1 + .../templates/kg/protein_kg_extraction.py | 144 ++++++++++++++ resources/input_examples/protein_qa_demo.txt | 53 +++++ scripts/generate/generate_protein_qa.sh | 3 + 15 files changed, 686 insertions(+), 20 deletions(-) create mode 100644 graphgen/operators/build_kg/search_mo.py create mode 100644 graphgen/templates/anchor/__init__.py create mode 100644 graphgen/templates/anchor/protein_anchor.py create mode 100644 graphgen/templates/generation/protein_qa_generation.py create mode 100644 graphgen/templates/kg/protein_kg_extraction.py create mode 100644 resources/input_examples/protein_qa_demo.txt create mode 100644 scripts/generate/generate_protein_qa.sh diff --git a/graphgen/bases/datatypes.py b/graphgen/bases/datatypes.py index 58dbda2..cb3be34 100644 --- a/graphgen/bases/datatypes.py +++ b/graphgen/bases/datatypes.py @@ -15,7 +15,7 @@ def from_dict(key: str, data: dict) -> "Chunk": return Chunk( id=key, content=data.get("content", ""), - type=data.get("type", "unknown"), + type=data.get("type", "text"), metadata={k: v for k, v in data.items() if k != "content"}, ) diff --git a/graphgen/configs/protein_qa_config.yaml b/graphgen/configs/protein_qa_config.yaml index 69d08b3..4f1f237 100644 --- a/graphgen/configs/protein_qa_config.yaml +++ b/graphgen/configs/protein_qa_config.yaml @@ -1,5 +1,6 @@ read: - input_file: resources/input_examples/protein_demo.json # input file path, support json, jsonl, txt, pdf. See resources/input_examples for examples + input_file: resources/input_examples/protein_qa_demo.txt # input file path, support json, jsonl, txt, pdf. 
See resources/input_examples for examples + anchor_type: protein # get protein information from chunks split: chunk_size: 1024 # chunk size for text splitting chunk_overlap: 100 # chunk overlap for text splitting diff --git a/graphgen/graphgen.py b/graphgen/graphgen.py index 2000601..a615eee 100644 --- a/graphgen/graphgen.py +++ b/graphgen/graphgen.py @@ -118,16 +118,16 @@ async def insert(self, read_config: Dict, split_config: Dict): self.progress_bar, ) - _add_chunk_keys = await self.chunks_storage.filter_keys( - list(inserting_chunks.keys()) - ) - inserting_chunks = { - k: v for k, v in inserting_chunks.items() if k in _add_chunk_keys - } - - if len(inserting_chunks) == 0: - logger.warning("All chunks are already in the storage") - return + # _add_chunk_keys = await self.chunks_storage.filter_keys( + # list(inserting_chunks.keys()) + # ) + # inserting_chunks = { + # k: v for k, v in inserting_chunks.items() if k in _add_chunk_keys + # } + # + # if len(inserting_chunks) == 0: + # logger.warning("All chunks are already in the storage") + # return logger.info("[New Chunks] inserting %d chunks", len(inserting_chunks)) await self.chunks_storage.upsert(inserting_chunks) @@ -136,6 +136,7 @@ async def insert(self, read_config: Dict, split_config: Dict): llm_client=self.synthesizer_llm_client, kg_instance=self.graph_storage, chunks=[Chunk.from_dict(k, v) for k, v in inserting_chunks.items()], + anchor_type=read_config.get("anchor_type", None), progress_bar=self.progress_bar, ) if not _add_entities_and_relations: diff --git a/graphgen/operators/build_kg/build_kg.py b/graphgen/operators/build_kg/build_kg.py index bce28df..762e798 100644 --- a/graphgen/operators/build_kg/build_kg.py +++ b/graphgen/operators/build_kg/build_kg.py @@ -1,4 +1,4 @@ -from typing import List +from typing import List, Optional import gradio as gr @@ -8,6 +8,7 @@ from graphgen.utils import logger from .build_mm_kg import build_mm_kg +from .build_mo_kg import build_mo_kg from .build_text_kg import build_text_kg @@ -15,6 +16,7 @@ async def build_kg( llm_client: OpenAIClient, kg_instance: BaseGraphStorage, chunks: List[Chunk], + anchor_type: Optional[str] = None, progress_bar: gr.Progress = None, ): """ @@ -22,6 +24,7 @@ async def build_kg( :param llm_client: Synthesizer LLM model to extract entities and relationships :param kg_instance :param chunks + :param anchor_type: get this type of information from chunks :param progress_bar: Gradio progress bar to show the progress of the extraction :return: """ @@ -49,4 +52,17 @@ async def build_kg( chunks=mm_chunks, progress_bar=progress_bar, ) + + if anchor_type is not None: + logger.info("Anchoring data based on %s ...", anchor_type) + if anchor_type == "protein": + await build_mo_kg( + llm_client=llm_client, + kg_instance=kg_instance, + chunks=text_chunks, + progress_bar=progress_bar, + ) + else: + logger.error("Anchor type %s is not supported yet.", anchor_type) + return kg_instance diff --git a/graphgen/operators/build_kg/build_mo_kg.py b/graphgen/operators/build_kg/build_mo_kg.py index 4698599..31d479d 100644 --- a/graphgen/operators/build_kg/build_mo_kg.py +++ b/graphgen/operators/build_kg/build_mo_kg.py @@ -1,3 +1,5 @@ +import json +import re from typing import List import gradio as gr @@ -5,6 +7,15 @@ from graphgen.bases.base_storage import BaseGraphStorage from graphgen.bases.datatypes import Chunk from graphgen.models import OpenAIClient +from graphgen.templates import PROTEIN_ANCHOR_PROMPT, PROTEIN_KG_EXTRACTION_PROMPT +from graphgen.utils import ( + detect_main_language, 
+ handle_single_entity_extraction, + handle_single_relationship_extraction, + logger, + run_concurrent, + split_string_by_multi_markers, +) async def build_mo_kg( @@ -21,9 +32,89 @@ async def build_mo_kg( :param progress_bar: Gradio progress bar to show the progress of the extraction :return: """ - # TODO: implement multi-omics KG building logic here - print("llm_client:", llm_client) - print("kg_instance:", kg_instance) - print("chunks:", chunks) - print("progress_bar:", progress_bar) - return kg_instance + + async def extract_mo_info(chunk: Chunk): + content = chunk.content + language = detect_main_language(content) + prompt = PROTEIN_ANCHOR_PROMPT[language].format(chunk=content) + result = await llm_client.generate_answer(prompt) + try: + json_result = json.loads(result) + return json_result + except json.JSONDecodeError: + logger.warning("Failed to parse JSON from LLM response: %s", result) + return {} + + results = await run_concurrent( + extract_mo_info, + chunks, + desc="Extracting multi-omics anchoring information from chunks", + unit="chunk", + progress_bar=progress_bar, + ) + # Merge results + from collections import defaultdict + + bags = defaultdict(set) + for item in results: + for k, v in item.items(): + if v is not None and str(v).strip(): + bags[k].add(str(v).strip()) + + merged = { + k: " | ".join(sorted(v)) if len(v) > 1 else next(iter(v)) + for k, v in bags.items() + } + + # TODO: search database for more info + # try: + # search_results = await search(merged["Protein accession or ID"]) + # except Exception as e: + # logger.warning("Failed to search for protein info: %s", e) + # search_results = {} + + # 组织成文本 + mo_text = "\n".join([f"{k}: {v}" for k, v in merged.items()]) + lang = detect_main_language(mo_text) + prompt = PROTEIN_KG_EXTRACTION_PROMPT[lang].format( + input_text=mo_text, + **PROTEIN_KG_EXTRACTION_PROMPT["FORMAT"], + ) + kg_output = await llm_client.generate_answer(prompt) + + logger.debug("Image chunk extraction result: %s", kg_output) + + # parse the result + records = split_string_by_multi_markers( + kg_output, + [ + PROTEIN_KG_EXTRACTION_PROMPT["FORMAT"]["record_delimiter"], + PROTEIN_KG_EXTRACTION_PROMPT["FORMAT"]["completion_delimiter"], + ], + ) + + print(records) + raise NotImplementedError + + nodes = defaultdict(list) + edges = defaultdict(list) + + for record in records: + match = re.search(r"\((.*)\)", record) + if not match: + continue + inner = match.group(1) + + attributes = split_string_by_multi_markers( + inner, [PROTEIN_KG_EXTRACTION_PROMPT["FORMAT"]["tuple_delimiter"]] + ) + + entity = await handle_single_entity_extraction(attributes, "temp") + if entity is not None: + nodes[entity["entity_name"]].append(entity) + continue + + relation = await handle_single_relationship_extraction(attributes, "temp") + if relation is not None: + key = (relation["src_id"], relation["tgt_id"]) + edges[key].append(relation) diff --git a/graphgen/operators/build_kg/search_mo.py b/graphgen/operators/build_kg/search_mo.py new file mode 100644 index 0000000..21164e3 --- /dev/null +++ b/graphgen/operators/build_kg/search_mo.py @@ -0,0 +1,182 @@ +# multi_omics_search.py +import logging +import re +import time +from typing import Dict, Optional + +import requests +from Bio import SeqIO +from Bio.Blast import NCBIWWW, NCBIXML +from requests import Session, adapters +from urllib3.util.retry import Retry + +# ---------- 底层工具 ---------- +_SESSION: Optional[Session] = None + + +def _get_session() -> Session: + global _SESSION + if _SESSION is None: + _SESSION = 
Session() + retry = Retry( + total=3, backoff_factor=0.5, status_forcelist=[500, 502, 503, 504] + ) + _SESSION.mount("https://", adapters.HTTPAdapter(max_retries=retry)) + _SESSION.headers.update({"User-Agent": "MultiOmicsQuery/1.0"}) + return _SESSION + + +# ---------- 数据抓取 ---------- +def _fetch_uniprot(entry: str) -> Optional[Dict]: + entry = entry.strip() + if re.fullmatch( + r"[OPQ][0-9][A-Z0-9]{3}[0-9]|[A-NR-Z][0-9]([A-Z][A-Z0-9]{2}[0-9]){1,2}", entry + ): + url = f"https://www.uniprot.org/uniprot/{entry}.json" + r = _get_session().get(url, timeout=15) + if r.ok: + return _parse_uniprot(r.json()) + # 模糊搜索 + kw = entry.upper().replace("-", "") + if kw == "INTERLEUKIN6": + kw = "IL6" + r = _get_session().get( + "https://www.uniprot.org/uniprot/", + params={ + "query": f"gene:{kw} OR name:{kw} OR {kw}", + "format": "json", + "limit": 1, + }, + timeout=15, + ) + if not r.ok or not r.json().get("results"): + return None + acc = r.json()["results"][0]["primaryAccession"] + return _fetch_uniprot(acc) # 递归拿详情 + + +def _parse_uniprot(data: dict) -> dict: + return { + "molecule_type": "protein", + "database": "UniProt", + "id": data["primaryAccession"], + "entry_name": data["uniProtkbId"], + "gene_names": [ + g["geneName"]["value"] for g in data.get("genes", []) if "geneName" in g + ], + "protein_name": ( + data["proteinDescription"]["recommendedName"]["fullName"]["value"] + if "recommendedName" in data["proteinDescription"] + else data["proteinDescription"]["submissionNames"][0]["fullName"]["value"] + ), + "organism": data["organism"]["scientificName"], + "sequence": data["sequence"]["value"], + "function": " | ".join( + [ + c["texts"][0]["value"] + for c in data.get("comments", []) + if c["commentType"] == "FUNCTION" + ] + ), + "url": f"https://www.uniprot.org/uniprot/{data['primaryAccession']}", + } + + +def _fetch_ncbi_gene(gene_id: str) -> Optional[Dict]: + url = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esummary.fcgi" + params = {"db": "gene", "id": gene_id, "retmode": "json"} + r = requests.get(url, params=params, timeout=15) + if not r.ok: + return None + data = r.json()["result"][gene_id] + return { + "molecule_type": "gene", + "database": "NCBI Gene", + "id": gene_id, + "symbol": data["name"], + "description": data["description"], + "organism": data["organism"]["scientificname"], + "url": f"https://www.ncbi.nlm.nih.gov/gene/{gene_id}", + } + + +def _blast(fasta: str, mol_type: str) -> Optional[Dict]: + program = "blastp" if mol_type == "protein" else "blastn" + try: + result = NCBIWWW.qblast(program, "nr", fasta, hitlist_size=1, format_type="XML") + rec = next(NCBIXML.parse(result)) + if not rec.alignments: + return None + best = rec.alignments[0] + hit_id = best.hit_id.split("|")[3] if "|" in best.hit_id else best.hit_id + return { + "molecule_type": mol_type, + "database": "NCBI BLAST", + "hit_id": hit_id, + "hit_title": best.hit_def, + "hit_score": best.hsps[0].score, + "hit_evalue": best.hsps[0].expect, + "url": f"https://www.ncbi.nlm.nih.gov/protein/{hit_id}", + } + except Exception as e: + logging.warning("BLAST 失败: %s", e) + return None + + +# ---------- 唯一对外接口 ---------- +def search(entry: str) -> dict: + """ + 万能入口: + - UniProt AC → UniProt 记录 + - 纯数字 → NCBI Gene + - FASTA → 自动判断蛋白/核酸并 BLAST + - 其余 → 先当蛋白名搜 UniProt + 返回统一字典;找不到时 error 字段给出原因。 + """ + entry = entry.strip() + if not entry: + return {"input": entry, "error": "empty query"} + + # 1. FASTA? 
+ if entry.startswith(">"): + record = SeqIO.read(entry.splitlines(), "fasta") + mol_type = ( + "protein" + if all(c in "ACDEFGHIKLMNPQRSTVWY" for c in str(record.seq).upper()) + else "dna" + ) + blast_res = _blast(entry, mol_type) + if blast_res is None: + return {"input": entry, "error": "BLAST 无显著匹配"} + return {"input": entry, "blast": blast_res} + + # 2. UniProt AC? + if re.fullmatch( + r"[OPQ][0-9][A-Z0-9]{3}[0-9]|[A-NR-Z][0-9]([A-Z][A-Z0-9]{2}[0-9]){1,2}", entry + ): + uni = _fetch_uniprot(entry) + if uni is None: + return {"input": entry, "error": "UniProt 未找到"} + return {"input": entry, "uniprot": uni} + + # 3. NCBI Gene ID? + if entry.isdigit(): + gene = _fetch_ncbi_gene(entry) + if gene is None: + return {"input": entry, "error": "NCBI Gene 未找到"} + return {"input": entry, "gene": gene} + + # 4. 默认按名称搜 UniProt + uni = _fetch_uniprot(entry) + if uni: + return {"input": entry, "uniprot": uni} + return {"input": entry, "error": "未找到匹配记录"} + + +# ---------- 使用示例 ---------- +if __name__ == "__main__": + logging.basicConfig(level=logging.DEBUG, format="%(levelname)s | %(message)s") + print(search("P69905")) # UniProt AC + print(search("7157")) # NCBI Gene ID + print(search(">seq\nMAAAAA")) # FASTA + print(search("interleukin-6")) # 名称 diff --git a/graphgen/templates/__init__.py b/graphgen/templates/__init__.py index ea28c4d..d904008 100644 --- a/graphgen/templates/__init__.py +++ b/graphgen/templates/__init__.py @@ -1,3 +1,4 @@ +from .anchor import PROTEIN_ANCHOR_PROMPT from .coreference_resolution import COREFERENCE_RESOLUTION_PROMPT from .description_rephrasing import DESCRIPTION_REPHRASING_PROMPT from .generation import ( @@ -5,9 +6,15 @@ ATOMIC_GENERATION_PROMPT, COT_GENERATION_PROMPT, MULTI_HOP_GENERATION_PROMPT, + PROTEIN_QA_GENERATION_PROMPT, VQA_GENERATION_PROMPT, ) -from .kg import KG_EXTRACTION_PROMPT, KG_SUMMARIZATION_PROMPT, MMKG_EXTRACTION_PROMPT +from .kg import ( + KG_EXTRACTION_PROMPT, + KG_SUMMARIZATION_PROMPT, + MMKG_EXTRACTION_PROMPT, + PROTEIN_KG_EXTRACTION_PROMPT, +) from .question_generation import QUESTION_GENERATION_PROMPT from .search_judgement import SEARCH_JUDGEMENT_PROMPT from .statement_judgement import STATEMENT_JUDGEMENT_PROMPT diff --git a/graphgen/templates/anchor/__init__.py b/graphgen/templates/anchor/__init__.py new file mode 100644 index 0000000..3727941 --- /dev/null +++ b/graphgen/templates/anchor/__init__.py @@ -0,0 +1 @@ +from .protein_anchor import PROTEIN_ANCHOR_PROMPT diff --git a/graphgen/templates/anchor/protein_anchor.py b/graphgen/templates/anchor/protein_anchor.py new file mode 100644 index 0000000..3ea0137 --- /dev/null +++ b/graphgen/templates/anchor/protein_anchor.py @@ -0,0 +1,70 @@ +TEMPLATE_EN = """You are a biomedical entity extraction engine. +Given the text below, output **only** a JSON object that matches the following schema: + +{{ + "Protein accession or ID": string | null, + "Protein Name": string | null, + "Gene Name from literature": string | null, + "Source organism": string | null, + "Primary biological role": string | null, + "Subcellular localization (active site)": string | null, + "Inducible by": string | null, + "Post-translational modifications": string | null, + "Literature_Name": string | null, + "Experimental evidence": string | null, + "Catalytic activity:": string | null, + "Amino acids length:": int | null, + "Protein family:": string | null, + "Protein sequence:": string | null +}} + +Rules: +1. If the field cannot be found, return `null` rather than guessing. +2. 
Copy the exact sentence or phrase from the text; do not rephrase. +3. For boolean/number fields, convert strictly (e.g., length → integer). +4. Output **only** the JSON object, no additional words. + +Text: +=== +{chunk} +=== +""" + +TEMPLATE_ZH = """你是一个生物医学实体抽取引擎。 +根据以下文本,输出**仅**符合以下模式的JSON对象: + +```json +{ + "蛋白质登录号或ID": string | null, + "蛋白质名称": string | null, + "文献中的基因名称": string | null, + "来源生物体": string | null, + "主要生物学功能": string | null, + "亚细胞定位(活性位点)": string | null, + "诱导物": string | null, + "翻译后修饰": string | null, + "文献名称": string | null, + "实验依据": string | null, + "催化活性": string | null, + "氨基酸长度": int | null, + "蛋白质家族": string | null, + "蛋白质序列": string | null +} +``` + +规则: +1. 如果找不到该字段,返回`null`而不是猜测。 +2. 直接复制文本中的句子或短语;不要改写。 +3. 对于布尔值/数字字段,严格转换(例如,长度→整数)。 +4. 输出**仅**为JSON对象,不要添加其他文字。 + +文本: +=== +{chunk} +=== +""" + +PROTEIN_ANCHOR_PROMPT = { + "en": TEMPLATE_EN, + "zh": TEMPLATE_ZH, +} diff --git a/graphgen/templates/generation/__init__.py b/graphgen/templates/generation/__init__.py index b58c2b6..356ef83 100644 --- a/graphgen/templates/generation/__init__.py +++ b/graphgen/templates/generation/__init__.py @@ -2,4 +2,5 @@ from .atomic_generation import ATOMIC_GENERATION_PROMPT from .cot_generation import COT_GENERATION_PROMPT from .multi_hop_generation import MULTI_HOP_GENERATION_PROMPT +from .protein_qa_generation import PROTEIN_QA_GENERATION_PROMPT from .vqa_generation import VQA_GENERATION_PROMPT diff --git a/graphgen/templates/generation/protein_qa_generation.py b/graphgen/templates/generation/protein_qa_generation.py new file mode 100644 index 0000000..d7447be --- /dev/null +++ b/graphgen/templates/generation/protein_qa_generation.py @@ -0,0 +1,95 @@ +# pylint: disable=C0301 +PROTEIN_QA_TEMPLATE_EN: str = """You are a senior computational biologist specializing in structural bioinformatics. Your task is to generate logically coherent, verifiable and non-hallucinated question-answer pairs for the given protein sample described by the provided ENTITIES and RELATIONSHIPS. +Use English as the output language. + +---Objectives--- +Create multiple sets of protein-centric QA pairs that satisfy the following: +1. Only ask about objectively existing facts in the provided data (e.g., residue numbers, secondary-structure elements, binding sites, catalytic residues, domain boundaries, metal ions, experimental method, etc.). Avoid subjective or speculative questions. +2. Ensure that each question has a single, clear and verifiable answer that can be directly confirmed from the given entities/relationships. +3. Questions should cover diverse aspects: sequence, structure, function, interactions, dynamics, thermodynamics, experimental annotations, etc. +4. Avoid repetitive questions; each question must be unique and meaningful. +5. Use concise, unambiguous language; do not invent information beyond the provided data. + +---Instructions--- +1. Carefully analyse the supplied ENTITIES and RELATIONSHIPS to identify: + - macromolecular entities (protein chains, domains, motifs, ligands, ions) + - structural attributes (helices, strands, loops, cis-peptide, disulfide bonds) + - functional annotations (active site, binding site, post-translational modification) + - experimental metadata (PDB ID, resolution, method, temperature, pH) + - causal or sequential relations (e.g., "Mg2+ binding stabilises loop L3") +2. Organise information logically: + - start with sequence/primary structure + - proceed to secondary/tertiary structure + - end with function, mechanism, and experimental context +3. 
Maintain scientific accuracy and consistent nomenclature (standard residue numbering, atom names, etc.). +4. Review each QA pair to guarantee logical consistency and absence of hallucination. + +################ +-ENTITIES- +################ +{entities} + +################ +-RELATIONSHIPS- +################ +{relationships} +################ +Directly output the generated QA pairs below. Do NOT copy any example questions, and do NOT include extraneous text. + +Question: +Answer: + +Question: +Answer: + +""" + +PROTEIN_QA_TEMPLATE_ZH: str = """你是一位资深的结构生物信息学计算生物学家。你的任务是根据下述提供的实体与关系,为给定的蛋白质样本生成逻辑连贯、可验证、无幻觉的中英双语问答对(这里仅输出中文)。 +使用中文作为输出语言。 + +---目标--- +创建多组以蛋白质为中心的问答对,满足: +1. 仅询问数据中客观存在的事实(如残基编号、二级结构元件、结合位点、催化残基、结构域边界、金属离子、实验方法等),避免主观或推测性问题。 +2. 每个问题必须有单一、明确且可直接验证的答案,答案必须能从给定实体/关系中直接确认。 +3. 问题需覆盖:序列、结构、功能、相互作用、动力学、热力学、实验注释等多个维度,确保多样性与全面性。 +4. 避免重复提问,每个问题都独特且有意义。 +5. 语言简洁、无歧义,严禁编造超出给定数据的信息。 + +---说明--- +1. 仔细分析提供的实体与关系,识别: + - 大分子实体(蛋白链、结构域、模体、配体、离子) + - 结构属性(螺旋、折叠、环区、顺式肽键、二硫键) + - 功能注释(活性位点、结合位点、翻译后修饰) + - 实验元数据(PDB 编号、分辨率、方法、温度、pH) + - 因果或顺序关系(如“Mg²⁺ 结合稳定了环区 L3”) +2. 按逻辑顺序组织信息: + - 从序列/一级结构入手 + - 再到二级/三级结构 + - 最后到功能、机制及实验背景 +3. 保持科学准确性,使用统一命名规范(标准残基编号、原子名等)。 +4. 检查每对问答,确保逻辑一致且无幻觉。 + +################ +-实体- +################ +{entities} + +################ +-关系- +################ +{relationships} +################ +请直接在下方输出生成的问答对,不要复制任何示例,不要输出无关内容。 + +问题: <问题1> +答案: <答案1> + +问题: <问题2> +答案: <答案2> + +""" + +PROTEIN_QA_GENERATION_PROMPT = { + "en": PROTEIN_QA_TEMPLATE_EN, + "zh": PROTEIN_QA_TEMPLATE_ZH, +} diff --git a/graphgen/templates/kg/__init__.py b/graphgen/templates/kg/__init__.py index ea865ce..d61fdc0 100644 --- a/graphgen/templates/kg/__init__.py +++ b/graphgen/templates/kg/__init__.py @@ -1,3 +1,4 @@ from .kg_extraction import KG_EXTRACTION_PROMPT from .kg_summarization import KG_SUMMARIZATION_PROMPT from .mm_kg_extraction import MMKG_EXTRACTION_PROMPT +from .protein_kg_extraction import PROTEIN_KG_EXTRACTION_PROMPT diff --git a/graphgen/templates/kg/protein_kg_extraction.py b/graphgen/templates/kg/protein_kg_extraction.py new file mode 100644 index 0000000..f67917f --- /dev/null +++ b/graphgen/templates/kg/protein_kg_extraction.py @@ -0,0 +1,144 @@ +# pylint: disable=C0301 +TEMPLATE_EN: str = """You are an expert in protein science and knowledge-graph construction. +Your task is to extract a star-shaped knowledge graph centered on **a single protein** mentioned in the given text. + +-Goal- +Given free-text that discusses one or more proteins, identify: +1. The **central protein** (the first-mentioned protein or the protein explicitly indicated by the user). +2. All entities that are **directly related** to this central protein. +3. All relationships that **directly link** those entities to the central protein (star edges). + +Use English as the output language. + +-Steps- +1. Identify the **central protein entity** and all **directly-related entities** from the text. + For the **central protein**, extract: + - entity_name: use the full name or UniProt ID if given; capitalized. + - entity_type: always `protein`. + - entity_summary: concise description of its main biological role, location, or significance in the text. + + For each **directly-related entity**, extract: + - entity_name: capitalized. + - entity_type: one of [{entity_types}]. + - entity_summary: comprehensive summary of its attributes/activities **as stated in the text**. + + Format each entity as + ("entity"{tuple_delimiter}{tuple_delimiter}{tuple_delimiter}) + +2. 
From the entities found in Step 1, list every **(central protein → related entity)** pair that is **clearly related**. + For each pair extract: + - source_entity: the **central protein** name. + - target_entity: the related entity name. + - relationship_summary: short explanation of how the central protein is connected to this entity **according to the text**. + + Format each relationship as + ("relationship"{tuple_delimiter}{tuple_delimiter}{tuple_delimiter}) + +3. Output a single list of all entities and relationships from Steps 1–2, using **{record_delimiter}** as the delimiter. + +4. Finish by printing {completion_delimiter} + +################ +-Example- +################ +Text: +################ +The tumor-suppressor protein p53 is a transcription factor that responds to DNA damage. +Phosphorylation of p53 by ATM kinase at serine-15 enhances its stability. +MDM2, an E3 ubiquitin ligase, negatively regulates p53 via ubiquitination. +################ +Output: +("entity"{tuple_delimiter}"p53"{tuple_delimiter}"protein"{tuple_delimiter}"Tumor-suppressor transcription factor that responds to DNA damage and is regulated by post-translational modifications."){record_delimiter} +("entity"{tuple_delimiter}"ATM kinase"{tuple_delimiter}"protein"{tuple_delimiter}"Protein kinase that phosphorylates p53 at serine-15, thereby enhancing p53 stability."){record_delimiter} +("entity"{tuple_delimiter}"serine-15"{tuple_delimiter}"site"{tuple_delimiter}"Phosphorylation site on p53 that is targeted by ATM kinase."){record_delimiter} +("entity"{tuple_delimiter}"MDM2"{tuple_delimiter}"protein"{tuple_delimiter}"E3 ubiquitin ligase that negatively regulates p53 through ubiquitination."){record_delimiter} +("entity"{tuple_delimiter}"DNA damage"{tuple_delimiter}"concept"{tuple_delimiter}"Cellular stress signal that activates p53-mediated transcriptional response."){record_delimiter} +("relationship"{tuple_delimiter}"p53"{tuple_delimiter}"ATM kinase"{tuple_delimiter}"ATM kinase phosphorylates p53, enhancing its stability."){record_delimiter} +("relationship"{tuple_delimiter}"p53"{tuple_delimiter}"serine-15"{tuple_delimiter}"p53 is phosphorylated at serine-15 by ATM kinase."){record_delimiter} +("relationship"{tuple_delimiter}"p53"{tuple_delimiter}"MDM2"{tuple_delimiter}"MDM2 ubiquitinates p53, negatively regulating its activity."){record_delimiter} +("relationship"{tuple_delimiter}"p53"{tuple_delimiter}"DNA damage"{tuple_delimiter}"p53 acts as a sensor-transcription factor in response to DNA damage."){completion_delimiter} + +################ +-Real Data- +Entity_types: {entity_types} +Text: {input_text} +################ +Output: +""" + + +TEMPLATE_ZH: str = """您是蛋白质科学与知识图谱构建专家。 +任务:从给定文本中抽取以**一个中心蛋白质**为核心的星型知识图谱。 + +-目标- +文本可能提及一个或多个蛋白质,请: +1. 确定**中心蛋白质**(文本首个提及或用户指定的蛋白)。 +2. 识别所有与中心蛋白**直接相关**的实体。 +3. 仅保留**中心蛋白→相关实体**的直接关系(星型边)。 + +使用中文输出。 + +-步骤- +1. 确定**中心蛋白质实体**及所有**直接相关实体**。 + 对于**中心蛋白质**: + - entity_name:全名或UniProt ID,首字母大写。 + - entity_type:固定为`protein`。 + - entity_summary:简述其在文中的生物学功能、定位或意义。 + + 对于每个**直接相关实体**: + - entity_name:首字母大写。 + - entity_type:可选类型[{entity_types}]。 + - entity_summary:全面总结其在文中与中心蛋白相关的属性/活动。 + + 格式:("entity"{tuple_delimiter}{tuple_delimiter}{tuple_delimiter}) + +2. 在步骤1的实体中,列出所有**(中心蛋白→相关实体)**的明显关系对。 + 每对提取: + - source_entity:中心蛋白名称。 + - target_entity:相关实体名称。 + - relationship_summary:简要说明文中二者如何直接关联。 + + 格式:("relationship"{tuple_delimiter}{tuple_delimiter}{tuple_delimiter}) + +3. 将步骤1–2的所有实体与关系合并为单列表,用**{record_delimiter}**分隔。 + +4. 
输出结束标记{completion_delimiter} + +################ +-示例- +################ +文本: +################ +肿瘤抑制蛋白p53是一种转录因子,可响应DNA损伤。ATM激酶在第15位丝氨酸磷酸化p53,增强其稳定性。E3泛素连接酶MDM2通过泛素化负调控p53。 +################ +输出: +("entity"{tuple_delimiter}"p53"{tuple_delimiter}"protein"{tuple_delimiter}"肿瘤抑制转录因子,能感知DNA损伤并通过翻译后修饰被调控。"){record_delimiter} +("entity"{tuple_delimiter}"ATM激酶"{tuple_delimiter}"protein"{tuple_delimiter}"蛋白激酶,在丝氨酸-15位点磷酸化p53,从而提高其稳定性。"){record_delimiter} +("entity"{tuple_delimiter}"丝氨酸-15"{tuple_delimiter}"site"{tuple_delimiter}"p53上被ATM激酶靶向的磷酸化位点。"){record_delimiter} +("entity"{tuple_delimiter}"MDM2"{tuple_delimiter}"protein"{tuple_delimiter}"E3泛素连接酶,通过泛素化负调控p53。"){record_delimiter} +("entity"{tuple_delimiter}"DNA损伤"{tuple_delimiter}"concept"{tuple_delimiter}"细胞内应激信号,可激活p53介导的转录应答。"){record_delimiter} +("relationship"{tuple_delimiter}"p53"{tuple_delimiter}"ATM激酶"{tuple_delimiter}"ATM激酶磷酸化p53,增强其稳定性。"){record_delimiter} +("relationship"{tuple_delimiter}"p53"{tuple_delimiter}"丝氨酸-15"{tuple_delimiter}"p53在该位点被ATM激酶磷酸化。"){record_delimiter} +("relationship"{tuple_delimiter}"p53"{tuple_delimiter}"MDM2"{tuple_delimiter}"MDM2对p53进行泛素化,负向调控其活性。"){record_delimiter} +("relationship"{tuple_delimiter}"p53"{tuple_delimiter}"DNA损伤"{tuple_delimiter}"p53作为感受器-转录因子响应DNA损伤。"){completion_delimiter} + +################ +-真实数据- +实体类型:{entity_types} +文本:{input_text} +################ +输出: +""" + + +PROTEIN_KG_EXTRACTION_PROMPT: dict = { + "en": TEMPLATE_EN, + "zh": TEMPLATE_ZH, + "FORMAT": { + "tuple_delimiter": "<|>", + "record_delimiter": "##", + "completion_delimiter": "<|COMPLETE|>", + "entity_types": "protein, gene, site, modification, pathway, disease, drug, organism, tissue, cell_line, " + "experiment, technology, concept, location, organization, person, mission, science", + }, +} diff --git a/resources/input_examples/protein_qa_demo.txt b/resources/input_examples/protein_qa_demo.txt new file mode 100644 index 0000000..cbc60df --- /dev/null +++ b/resources/input_examples/protein_qa_demo.txt @@ -0,0 +1,53 @@ +# Harpin, Elicitor of the Hypersensitive Response Produced by the Plant Pathogen Erwinia amylovora + +Zhong-Min Wei, Ron J. Laby, Cathy H. Zumoff, David W. Bauer, Sheng Yang He, Alan Collmer, Steven V. Beer\* + +A proteinaceous elicitor of the plant defense reaction known as the hypersensitive response was isolated from Erwinia amylovora, the bacterium that causes fire blight of pear, apple, and other rosaceous plants. The elicitor, named harpin, is an acidic, heat-stable, cell-envelope-associated protein with an apparent molecular weight of 44 kilodaltons. Harpin caused tobacco leaf lamina to collapse and caused an increase in the pH of bathing solutions of suspension-cultured tobacco cells. The gene encoding harpin (hrpN) was located in the 40-kilobase hrp gene cluster of E. amylovora, sequenced, and mutated with Tn5tac1. The hrpN mutants were not pathogenic to pear, did not elicit the hypersensitive response, and did not produce harpin. + +The hypersensitive response (HR) of higher plants is characterized by the rapid, localized death of tissues containing an incompatible pathogen (a microorganism that is pathogenic only on other plants). It is associated with the defense of plants against many bacteria, fungi, nematodes, and viruses (1). The molecular basis for HR is unknown, but physiological and genetic observations with bacteria suggest that the same factor that elicits HR in nonhosts also is required for pathogenicity in hosts (1, 2). 
Production of this factor is controlled by hrp genes, which are highly conserved among many species of plant pathogenic bacteria (2, 3). Functional clusters of hrp genes have been cloned from Erwinia amylovora and Pseudomonas syringae and have been shown to confer on nonpathogenic bacteria the ability to elicit HR after infiltration of bacterial suspensions into the intercellular spaces of leaves of tobacco and other plants (4, 5). + +The hrp gene cluster from E. amylovora (Fig. 1), contained in the cosmid pCPP430, is expressed particularly well in Escherichia coli (4). We report here the isolation of a proteinaceous elicitor of the HR from Escherichia coli DH5α(pCPP430) and from E. amylovora, the bacterium that causes the devastating disease of apple, pear, and other rosaceous plants known as fire blight (6). We propose the name harpin for the HR-elicitor from E. amylovora and hrpN for the gene that encodes it. + +Tobacco leaves infiltrated (7) with cell-free culture supernatants of E. amylovora Ea321, Ea321(pCPP430) or E. coli DH5α(pCPP430) showed no HR, whereas leaves infiltrated with a centrifuged and filter-sterilized preparation from sonicated cells of E. coli DH5α(pCPP430) showed a strong HR within 12 hours (Fig. 2). The HR-eliciting activity of this cell-free elicitor preparation (CFEP) (8) was heat-stable but highly sensitive to protease (9). + +Further purification was achieved by holding the CFEP in a boiling-water bath for 10 min and removing the insoluble material by centrifugation. The remaining soluble material was electrophoresed on an SDS-polyacrylamide gel (Fig. 3). A protein band corresponding to 44 kD was uniquely present in all preparations with HR-eliciting activity (Fig. 3). Similarly, after resolution of the CFEP on an isoelectric-focusing (IEF) granulated gel bed (10) or by ion-exchange chromatography (11), the fractions with HR-eliciting activity always contained a 44-kD protein. + +The 44-kD protein, named harpin, was electroeluted from a preparative SDS-polyacrylamide gel (12). At concentrations ≥500 nM (≥25 μg/ml), harpin elicited HR in leaves of tobacco (Fig. 2, sectors 6 and 13). Harpin also elicited HR in tomato and Arabidopsis thaliana. Purified harpin was protease-sensitive, heat-stable, and had a pI of 4.3 on thin-layer IEF gels (13). + +Because supernatants from E. amylovora Ea321(pCPP430) or E. coli DH5α(pCPP430) did not elicit HR, we postulated that harpin was not secreted but rather was present in or on the bacteria. Whole bacteria treated with protease failed to elicit HR, whereas bacteria incubated with protease together with 0.5 mM phenylmethylsulfonyl fluoride (PMSF, a protease inhibitor) did (Table 1). Treatment of bacteria with increasing amounts of protease resulted in a decreased ability to elicit HR that correlated with the disappearance of harpin detectable in SDS-polyacrylamide gels (Table 1). After centrifugation of CFEP at 105,000g for 1 hour, most HR-eliciting activity was found in the supernatant. However, when the cell suspension was brought to 30 mM MgCl2 before sonication, most activity was associated with the sedimented membrane fraction.
Gel-permeation chromatography of unheated CFEP also indicated association of the elicitor with a high molecular weight (>10^6 daltons) fraction, probably membrane vesicles (14). Only the membrane fraction of E. amylovora Ea321(pCPP430) reacted with an antiserum raised in response to harpin (15), further supporting the cell-envelope location of harpin (Fig. 4). + +Cell suspensions of Ea321(pCPP430) or DH5α(pCPP430) in log-phase maintained their HR-eliciting activity for less than 0.5 hour and 1 hour, respectively, in the presence of tetracycline (40 μg/ml), an inhibitor of protein synthesis. Once cells lost HR-eliciting activity, harpin was not detected. However, when the protease inhibitor PMSF (0.5 mM) was included, the bacteria retained HR-eliciting activity and possessed detectable harpin for more than 2 hours. More protease was required per cell to destroy harpin produced by E. coli DH5α(pCPP430) than by Ea321(pCPP430), suggesting that E. coli DH5α(pCPP430) produces more harpin or degrades it more slowly, or both. + +[Fig. 1 legend, continued: insertions to the left of hrpN abolish pathogenicity to pear and eliminate or greatly disrupt HR-elicitation. Flags indicate the orientation of the outward-facing Ptac promoter of Tn5tac1 (28) in the tac1-1 and tac1-2 insertions. The two tac1 and K49 mutations were marker-exchanged into the E. amylovora 321 genome. The mutations abolished the ability of E. amylovora to cause disease in immature pear fruit, to elicit HR in tobacco leaves, and to produce harpin; similarly, they abolished the ability of E. coli DH5α(pCPP430) to elicit HR and to produce harpin. The effect of the K49 mutation on harpin production probably is regulatory. (B) Deduced amino acid sequence of harpin. The DNA sequence is accessioned in GenBank (#M92994). Amino acids confirmed by NH2-terminal sequence analysis of the purified protein are underlined.] + +The ability of bacterial strains to elicit the HR in intact tobacco leaves is related genetically to their ability to elicit a K+/H+ exchange reaction (XR) in tobacco cell suspension cultures (TCSCs) (16); both reactions require the hrp gene cluster (17). + +We tested the ability of harpin to raise the pH of TCSC bathing solution, an indicator of the XR (Fig. 5). Cells of E. amylovora, grown in rich medium and added to TCSCs, caused an increase in pH of the bathing solution after 2 to 3 hours. Addition of purified harpin caused an increase in pH within 1 hour. Erwinia amylovora mutant Ea321K49, which did not produce harpin in culture, and strains of E. coli containing mutated hrp gene clusters failed to elicit the XR. + +Harpin also was isolated from E. amylovora Ea321, which had been preincubated for 5 hours in an HR-inducing medium (18). Harpin from E. amylovora Ea321 was identical in physical and biological properties to that isolated from E. coli DH5α(pCPP430). No protease-sensitive, heat-stable, HR-eliciting activity, associated with a 44-kD protein, was seen in cell-free extracts from E. coli DH5α(pCPP9), which harbors the vector of pCPP430 (Fig. 3, lane 5).
On the basis of the visual intensities of the 44-kD bands on SDS-polyacrylamide gels (Fig. 3, lanes 7 and 8), we estimate that Ea321 produces about 10% as much harpin as does E. coli DH5α(pCPP430). + +The structural gene encoding harpin, designated hrpN, was located within the hrp gene cluster (Fig. 1) by hybridization with an oligonucleotide probe corresponding to the ninth to the fifteenth amino acid residues from the NH2-terminus of harpin (19). The 1.3-kb HindIII fragment that hybridized was cloned into pBluescript M13+ (Stratagene, La Jolla, California) and designated pCPP1084. A unique 44-kD protein, which was immunoprecipitated by antiserum raised against harpin (15), was + +Table 1. Protease sensitivity of the HR-eliciting activity of whole cells of E. amylovora Ea321(pCPP430). Cells were grown in LB medium, harvested by centrifugation, and resuspended in 0.1 volume of 5 mM potassium phosphate (pH 6.5) containing tetracycline (40 μg/ml). After incubation with protease (Sigma P5147), as indicated, at 37°C for 5 min, 100 μl of each cell suspension was infiltrated into tobacco leaves. Leaf sector collapse was assayed at 24 hours. At the time of infiltration, portions of protease-treated cell mixtures were lysed, held in boiling water for 10 min, centrifuged for 10 min at 12,000g, and electrophoresed on a 10% SDS-polyacrylamide gel to detect harpin. Electrophoresis was done for 2 hours at 15 mA followed by staining with Coomassie blue R-250. Cell-free supernatant, produced from the LB culture, was filter-sterilized and then concentrated with the Centriprep-10 (Amicon, Danvers, Massachusetts).
| Protease per milliliter | Tissue collapse | Harpin detected |
| --- | --- | --- |
| 0 | + | + |
| 5 μg | + | + |
| 10 μg | + | + |
| 20 μg | Weak | + |
| 40 μg | - | |
| 80 μg | | |
| 80 μg + 0.5 mM PMSF | + | + |
| Cell-free supernatant | | |
+ +expressed fom pCPP1084 in the T7RNA (20). Insertions of Tn5tac1 in hrpN (21) (Fig. 1) abolished the ability of E. coli $\mathsf { D H } 5 \alpha ( \mathsf { p C P P } 4 3 0 )$ to elicit HR on tobacco or produce harpin detectable on Western blots. Ea321T5, a derivative of E. amylo + +at $\pmb { 1 0 0 ^ { \circ } } \pmb { \mathbb { C } }$ for 10 min; 8, CFEP $( 5 ~ \mu 9 )$ from E. coli DH5a(pCPP430) treated at $1 0 0 ^ { \circ } \mathsf { C }$ for 10 min; 9, CFEP $( 5 ~ \mu 9 )$ from E. amylovora Ea321K49 treated at $_ { 1 0 0 ^ { \circ } \mathbb { C } }$ for 10 min. Samples from the preparations in lanes 3, 4, 7, and 8 elicited HR in tobacco leaves. Samples were prepared as described (8) and brought to 125 mM tris-HCI $( \mathsf { p H } 6 . 8 )$ $4 \%$ SDS, $20 \%$ glycerol, boiled for 3 min, then electrophoresed through a $10 \%$ (w/v) polyacrylamide gel with $0 . 1 \%$ SDS at $1 5 m A$ for 2 hours in a Mighty Small apparatus according to instructions (Hoefer Scientific Instruments, San Francisco, California). The gel was stained with $0 . 0 2 5 \%$ Coomassie Blue R-250. Low-range prestained molecular weight standards (Bio-Rad 161-0305) were used and calibrated with an unstained protein marker (Bio-Rad 161-0304). Arrow indicates region corresponding to $4 4 \ k \mathsf$ + +HR in tobacco leaves or to cause fire blight in highly susceptible immature pear fruits (22). Both pathogenic and HR-eliciting abilities were restored to Ea321T5 by pCPP1084, in trans, and the resulting strains produced harpin. + +DNA sequence data from the $1 . 3 – \mathbf { k } \mathbf { b }$ Hind II fragment revealed that hrpN is 1155 base pairs long, and it encodes a 385–amino acid protein (Fig. 1). The 15 $\mathrm { N H } _ { 2 }$ -terminal residues revealed by amino acid sequencing corresponded to those deduced from the DNA' sequence (Fig. 1). The deduced amino acid sequence of harpin (Fig. 1), which corresponded closely with the analyzed amino acid composition, reveals a glycine-rich protein with a high degree of hydrophilicity. It appears to have an open structure, which may explain its heat stability and sensitivity to proteases. A FASTA search (23) of GenBank for similar proteins revealed similarity only with other glycine-rich proteins, such as several plant cell wall proteins and keratins. + +The properties of the E. amylovora harpin protein are consistent with numerous physiological observations that were made after the discovery in 1963 that bacteria can elicit HR (24). These have indicated that elicitation of HR requires de novo protein synthesis and production of a labile factor by bacteria in close proximity to plant cells, and that each bacterial cell typically kills only one plant cell, thus explaining the requirement for high numbers of bacteria to cause death of enough plant cells to produce macroscopically visible collapse of the tissue (1, 25). + +The nonpathogenic phenotype of the hrpN mutation in Ea321T5 also is characteristic of hrp mutations in other phytopathogenic bacteria (2) and indicates that harpin is a primary determinant of pathogenicity in E. amylovora. That harpin has an essential role in both disease and plant defense reactions is puzzling but may be based on differential proteolysis or differential expression of hrp genes in host and nonhost plants (18). 
+ +Toxins, plant cell wall-degrading enzymes, and phytohormones contribute to the virulence (degree of pathogenicity) of certain members of the important group of phytopathogenic bacteria that possess limited host ranges and produce necrotic lesions after multiplication in compatible hosts (26). The hrp genes, in contrast, are absolutely required for pathogenicity by these bacteria, which include species of Erwinia, Pseudomonas, and Xanthomonas. The conservation of the hrp genes suggests that the E. amylovora harpin may be the archetypical disease determinant for these pathogens. diff --git a/scripts/generate/generate_protein_qa.sh b/scripts/generate/generate_protein_qa.sh new file mode 100644 index 0000000..2c1a738 --- /dev/null +++ b/scripts/generate/generate_protein_qa.sh @@ -0,0 +1,3 @@ +python3 -m graphgen.generate \ +--config_file graphgen/configs/protein_qa_config.yaml \ +--output_dir cache/ From 2192ee8a69bac6acbf6cd332a1f5f09499103dc2 Mon Sep 17 00:00:00 2001 From: chenzihong-gavin Date: Thu, 30 Oct 2025 19:25:35 +0800 Subject: [PATCH 4/8] fix: fix lint errors --- graphgen/graphgen.py | 20 ++++---- graphgen/operators/build_kg/build_kg.py | 4 +- graphgen/operators/build_kg/build_mo_kg.py | 55 +++------------------- 3 files changed, 18 insertions(+), 61 deletions(-) diff --git a/graphgen/graphgen.py b/graphgen/graphgen.py index 35772d5..4f9a8d8 100644 --- a/graphgen/graphgen.py +++ b/graphgen/graphgen.py @@ -108,16 +108,16 @@ async def insert(self, read_config: Dict, split_config: Dict): self.progress_bar, ) - # _add_chunk_keys = await self.chunks_storage.filter_keys( - # list(inserting_chunks.keys()) - # ) - # inserting_chunks = { - # k: v for k, v in inserting_chunks.items() if k in _add_chunk_keys - # } - # - # if len(inserting_chunks) == 0: - # logger.warning("All chunks are already in the storage") - # return + _add_chunk_keys = await self.chunks_storage.filter_keys( + list(inserting_chunks.keys()) + ) + inserting_chunks = { + k: v for k, v in inserting_chunks.items() if k in _add_chunk_keys + } + + if len(inserting_chunks) == 0: + logger.warning("All chunks are already in the storage") + return logger.info("[New Chunks] inserting %d chunks", len(inserting_chunks)) await self.chunks_storage.upsert(inserting_chunks) diff --git a/graphgen/operators/build_kg/build_kg.py b/graphgen/operators/build_kg/build_kg.py index 762e798..1f94d2e 100644 --- a/graphgen/operators/build_kg/build_kg.py +++ b/graphgen/operators/build_kg/build_kg.py @@ -2,9 +2,9 @@ import gradio as gr +from graphgen.bases import BaseLLMWrapper from graphgen.bases.base_storage import BaseGraphStorage from graphgen.bases.datatypes import Chunk -from graphgen.models import OpenAIClient from graphgen.utils import logger from .build_mm_kg import build_mm_kg @@ -13,7 +13,7 @@ async def build_kg( - llm_client: OpenAIClient, + llm_client: BaseLLMWrapper, kg_instance: BaseGraphStorage, chunks: List[Chunk], anchor_type: Optional[str] = None, diff --git a/graphgen/operators/build_kg/build_mo_kg.py b/graphgen/operators/build_kg/build_mo_kg.py index 31d479d..a98ee54 100644 --- a/graphgen/operators/build_kg/build_mo_kg.py +++ b/graphgen/operators/build_kg/build_mo_kg.py @@ -1,25 +1,17 @@ import json -import re from typing import List import gradio as gr +from graphgen.bases import BaseLLMWrapper from graphgen.bases.base_storage import BaseGraphStorage from graphgen.bases.datatypes import Chunk -from graphgen.models import OpenAIClient from graphgen.templates import PROTEIN_ANCHOR_PROMPT, PROTEIN_KG_EXTRACTION_PROMPT -from 
graphgen.utils import ( - detect_main_language, - handle_single_entity_extraction, - handle_single_relationship_extraction, - logger, - run_concurrent, - split_string_by_multi_markers, -) +from graphgen.utils import detect_main_language, logger, run_concurrent async def build_mo_kg( - llm_client: OpenAIClient, + llm_client: BaseLLMWrapper, kg_instance: BaseGraphStorage, chunks: List[Chunk], progress_bar: gr.Progress = None, @@ -73,7 +65,6 @@ async def extract_mo_info(chunk: Chunk): # logger.warning("Failed to search for protein info: %s", e) # search_results = {} - # 组织成文本 mo_text = "\n".join([f"{k}: {v}" for k, v in merged.items()]) lang = detect_main_language(mo_text) prompt = PROTEIN_KG_EXTRACTION_PROMPT[lang].format( @@ -81,40 +72,6 @@ async def extract_mo_info(chunk: Chunk): **PROTEIN_KG_EXTRACTION_PROMPT["FORMAT"], ) kg_output = await llm_client.generate_answer(prompt) - - logger.debug("Image chunk extraction result: %s", kg_output) - - # parse the result - records = split_string_by_multi_markers( - kg_output, - [ - PROTEIN_KG_EXTRACTION_PROMPT["FORMAT"]["record_delimiter"], - PROTEIN_KG_EXTRACTION_PROMPT["FORMAT"]["completion_delimiter"], - ], - ) - - print(records) - raise NotImplementedError - - nodes = defaultdict(list) - edges = defaultdict(list) - - for record in records: - match = re.search(r"\((.*)\)", record) - if not match: - continue - inner = match.group(1) - - attributes = split_string_by_multi_markers( - inner, [PROTEIN_KG_EXTRACTION_PROMPT["FORMAT"]["tuple_delimiter"]] - ) - - entity = await handle_single_entity_extraction(attributes, "temp") - if entity is not None: - nodes[entity["entity_name"]].append(entity) - continue - - relation = await handle_single_relationship_extraction(attributes, "temp") - if relation is not None: - key = (relation["src_id"], relation["tgt_id"]) - edges[key].append(relation) + print(kg_output) + # TODO: parse kg_output and insert into kg_instance + return kg_instance From 96be73a2de2e4501526468d1246e5a54c582645f Mon Sep 17 00:00:00 2001 From: chenzihong-gavin Date: Thu, 30 Oct 2025 19:53:41 +0800 Subject: [PATCH 5/8] delete search_mo --- graphgen/operators/build_kg/search_mo.py | 182 ----------------------- 1 file changed, 182 deletions(-) delete mode 100644 graphgen/operators/build_kg/search_mo.py diff --git a/graphgen/operators/build_kg/search_mo.py b/graphgen/operators/build_kg/search_mo.py deleted file mode 100644 index 21164e3..0000000 --- a/graphgen/operators/build_kg/search_mo.py +++ /dev/null @@ -1,182 +0,0 @@ -# multi_omics_search.py -import logging -import re -import time -from typing import Dict, Optional - -import requests -from Bio import SeqIO -from Bio.Blast import NCBIWWW, NCBIXML -from requests import Session, adapters -from urllib3.util.retry import Retry - -# ---------- 底层工具 ---------- -_SESSION: Optional[Session] = None - - -def _get_session() -> Session: - global _SESSION - if _SESSION is None: - _SESSION = Session() - retry = Retry( - total=3, backoff_factor=0.5, status_forcelist=[500, 502, 503, 504] - ) - _SESSION.mount("https://", adapters.HTTPAdapter(max_retries=retry)) - _SESSION.headers.update({"User-Agent": "MultiOmicsQuery/1.0"}) - return _SESSION - - -# ---------- 数据抓取 ---------- -def _fetch_uniprot(entry: str) -> Optional[Dict]: - entry = entry.strip() - if re.fullmatch( - r"[OPQ][0-9][A-Z0-9]{3}[0-9]|[A-NR-Z][0-9]([A-Z][A-Z0-9]{2}[0-9]){1,2}", entry - ): - url = f"https://www.uniprot.org/uniprot/{entry}.json" - r = _get_session().get(url, timeout=15) - if r.ok: - return _parse_uniprot(r.json()) - # 模糊搜索 - kw 
= entry.upper().replace("-", "") - if kw == "INTERLEUKIN6": - kw = "IL6" - r = _get_session().get( - "https://www.uniprot.org/uniprot/", - params={ - "query": f"gene:{kw} OR name:{kw} OR {kw}", - "format": "json", - "limit": 1, - }, - timeout=15, - ) - if not r.ok or not r.json().get("results"): - return None - acc = r.json()["results"][0]["primaryAccession"] - return _fetch_uniprot(acc) # 递归拿详情 - - -def _parse_uniprot(data: dict) -> dict: - return { - "molecule_type": "protein", - "database": "UniProt", - "id": data["primaryAccession"], - "entry_name": data["uniProtkbId"], - "gene_names": [ - g["geneName"]["value"] for g in data.get("genes", []) if "geneName" in g - ], - "protein_name": ( - data["proteinDescription"]["recommendedName"]["fullName"]["value"] - if "recommendedName" in data["proteinDescription"] - else data["proteinDescription"]["submissionNames"][0]["fullName"]["value"] - ), - "organism": data["organism"]["scientificName"], - "sequence": data["sequence"]["value"], - "function": " | ".join( - [ - c["texts"][0]["value"] - for c in data.get("comments", []) - if c["commentType"] == "FUNCTION" - ] - ), - "url": f"https://www.uniprot.org/uniprot/{data['primaryAccession']}", - } - - -def _fetch_ncbi_gene(gene_id: str) -> Optional[Dict]: - url = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esummary.fcgi" - params = {"db": "gene", "id": gene_id, "retmode": "json"} - r = requests.get(url, params=params, timeout=15) - if not r.ok: - return None - data = r.json()["result"][gene_id] - return { - "molecule_type": "gene", - "database": "NCBI Gene", - "id": gene_id, - "symbol": data["name"], - "description": data["description"], - "organism": data["organism"]["scientificname"], - "url": f"https://www.ncbi.nlm.nih.gov/gene/{gene_id}", - } - - -def _blast(fasta: str, mol_type: str) -> Optional[Dict]: - program = "blastp" if mol_type == "protein" else "blastn" - try: - result = NCBIWWW.qblast(program, "nr", fasta, hitlist_size=1, format_type="XML") - rec = next(NCBIXML.parse(result)) - if not rec.alignments: - return None - best = rec.alignments[0] - hit_id = best.hit_id.split("|")[3] if "|" in best.hit_id else best.hit_id - return { - "molecule_type": mol_type, - "database": "NCBI BLAST", - "hit_id": hit_id, - "hit_title": best.hit_def, - "hit_score": best.hsps[0].score, - "hit_evalue": best.hsps[0].expect, - "url": f"https://www.ncbi.nlm.nih.gov/protein/{hit_id}", - } - except Exception as e: - logging.warning("BLAST 失败: %s", e) - return None - - -# ---------- 唯一对外接口 ---------- -def search(entry: str) -> dict: - """ - 万能入口: - - UniProt AC → UniProt 记录 - - 纯数字 → NCBI Gene - - FASTA → 自动判断蛋白/核酸并 BLAST - - 其余 → 先当蛋白名搜 UniProt - 返回统一字典;找不到时 error 字段给出原因。 - """ - entry = entry.strip() - if not entry: - return {"input": entry, "error": "empty query"} - - # 1. FASTA? - if entry.startswith(">"): - record = SeqIO.read(entry.splitlines(), "fasta") - mol_type = ( - "protein" - if all(c in "ACDEFGHIKLMNPQRSTVWY" for c in str(record.seq).upper()) - else "dna" - ) - blast_res = _blast(entry, mol_type) - if blast_res is None: - return {"input": entry, "error": "BLAST 无显著匹配"} - return {"input": entry, "blast": blast_res} - - # 2. UniProt AC? - if re.fullmatch( - r"[OPQ][0-9][A-Z0-9]{3}[0-9]|[A-NR-Z][0-9]([A-Z][A-Z0-9]{2}[0-9]){1,2}", entry - ): - uni = _fetch_uniprot(entry) - if uni is None: - return {"input": entry, "error": "UniProt 未找到"} - return {"input": entry, "uniprot": uni} - - # 3. NCBI Gene ID? 
- if entry.isdigit(): - gene = _fetch_ncbi_gene(entry) - if gene is None: - return {"input": entry, "error": "NCBI Gene 未找到"} - return {"input": entry, "gene": gene} - - # 4. 默认按名称搜 UniProt - uni = _fetch_uniprot(entry) - if uni: - return {"input": entry, "uniprot": uni} - return {"input": entry, "error": "未找到匹配记录"} - - -# ---------- 使用示例 ---------- -if __name__ == "__main__": - logging.basicConfig(level=logging.DEBUG, format="%(levelname)s | %(message)s") - print(search("P69905")) # UniProt AC - print(search("7157")) # NCBI Gene ID - print(search(">seq\nMAAAAA")) # FASTA - print(search("interleukin-6")) # 名称 From 256acc11ea87df39121c004823af260f48a5bea6 Mon Sep 17 00:00:00 2001 From: chenzihong-gavin Date: Mon, 3 Nov 2025 15:32:11 +0800 Subject: [PATCH 6/8] feat: add mo_kg_builder --- graphgen/configs/protein_qa_config.yaml | 2 +- graphgen/models/__init__.py | 2 +- graphgen/models/kg_builder/__init__.py | 1 + graphgen/models/kg_builder/mo_kg_builder.py | 81 +++++++++- graphgen/models/search/db/uniprot_search.py | 148 ++++++++++++------ graphgen/operators/build_kg/build_kg.py | 29 ++-- graphgen/operators/build_kg/build_mo_kg.py | 67 +++----- graphgen/operators/search/db/__init__.py | 0 .../operators/search/db/search_uniprot.py | 0 .../operators/search/multi_omics_search.py | 29 ++++ requirements.txt | 50 +++--- resources/input_examples/protein_qa_demo.json | 64 ++++++++ resources/input_examples/protein_qa_demo.txt | 53 ------- 13 files changed, 347 insertions(+), 179 deletions(-) delete mode 100644 graphgen/operators/search/db/__init__.py delete mode 100644 graphgen/operators/search/db/search_uniprot.py create mode 100644 graphgen/operators/search/multi_omics_search.py create mode 100644 resources/input_examples/protein_qa_demo.json delete mode 100644 resources/input_examples/protein_qa_demo.txt diff --git a/graphgen/configs/protein_qa_config.yaml b/graphgen/configs/protein_qa_config.yaml index 4f1f237..d98f47c 100644 --- a/graphgen/configs/protein_qa_config.yaml +++ b/graphgen/configs/protein_qa_config.yaml @@ -1,5 +1,5 @@ read: - input_file: resources/input_examples/protein_qa_demo.txt # input file path, support json, jsonl, txt, pdf. See resources/input_examples for examples + input_file: resources/input_examples/protein_qa_demo.json # input file path, support json, jsonl, txt, pdf. 
See resources/input_examples for examples anchor_type: protein # get protein information from chunks split: chunk_size: 1024 # chunk size for text splitting diff --git a/graphgen/models/__init__.py b/graphgen/models/__init__.py index 0869416..7ae5379 100644 --- a/graphgen/models/__init__.py +++ b/graphgen/models/__init__.py @@ -6,7 +6,7 @@ MultiHopGenerator, VQAGenerator, ) -from .kg_builder import LightRAGKGBuilder, MMKGBuilder +from .kg_builder import LightRAGKGBuilder, MMKGBuilder, MOKGBuilder from .llm import HTTPClient, OllamaClient, OpenAIClient from .partitioner import ( AnchorBFSPartitioner, diff --git a/graphgen/models/kg_builder/__init__.py b/graphgen/models/kg_builder/__init__.py index 1e7e2c4..330fd44 100644 --- a/graphgen/models/kg_builder/__init__.py +++ b/graphgen/models/kg_builder/__init__.py @@ -1,2 +1,3 @@ from .light_rag_kg_builder import LightRAGKGBuilder from .mm_kg_builder import MMKGBuilder +from .mo_kg_builder import MOKGBuilder diff --git a/graphgen/models/kg_builder/mo_kg_builder.py b/graphgen/models/kg_builder/mo_kg_builder.py index dfbb44c..da46616 100644 --- a/graphgen/models/kg_builder/mo_kg_builder.py +++ b/graphgen/models/kg_builder/mo_kg_builder.py @@ -1,11 +1,36 @@ +import re +from collections import defaultdict from typing import Dict, List, Tuple from graphgen.bases import Chunk +from graphgen.templates import PROTEIN_KG_EXTRACTION_PROMPT +from graphgen.utils import ( + detect_main_language, + handle_single_entity_extraction, + handle_single_relationship_extraction, + logger, + split_string_by_multi_markers, +) from .light_rag_kg_builder import LightRAGKGBuilder class MOKGBuilder(LightRAGKGBuilder): + @staticmethod + async def scan_document_for_schema( + chunk: Chunk, schema: Dict[str, List[str]] + ) -> Tuple[Dict[str, List[dict]], Dict[Tuple[str, str], List[dict]]]: + """ + Scan the document chunk to extract entities and relationships based on the provided schema. + :param chunk: The document chunk to be scanned. + :param schema: A dictionary defining the entities and relationships to be extracted. + :return: A tuple containing two dictionaries - one for entities and one for relationships. + """ + # TODO: use hard-coded PROTEIN_KG_EXTRACTION_PROMPT for protein chunks, + # support schema for other chunk types later + print(chunk.id, schema) + return {}, {} + async def extract( self, chunk: Chunk ) -> Tuple[Dict[str, List[dict]], Dict[Tuple[str, str], List[dict]]]: @@ -19,5 +44,57 @@ async def extract( :return: Tuple containing entities and relationships. """ # TODO: Implement the multi-omics KG extraction logic here - print(chunk) - return {}, {} + chunk_id = chunk.id + chunk_type = chunk.type # genome | protein | ... 
+ metadata = chunk.metadata + + # choose different extraction strategies based on chunk type + if chunk_type == "protein": + protein_caption = "" + for key, value in metadata["protein_caption"].items(): + protein_caption += f"{key}: {value}\n" + logger.debug("Protein chunk caption: %s", protein_caption) + + language = detect_main_language(protein_caption) + prompt_template = PROTEIN_KG_EXTRACTION_PROMPT[language].format( + **PROTEIN_KG_EXTRACTION_PROMPT["FORMAT"], + input_text=protein_caption, + ) + result = await self.llm_client.generate_answer(prompt_template) + logger.debug("Protein chunk extraction result: %s", result) + + # parse the result + records = split_string_by_multi_markers( + result, + [ + PROTEIN_KG_EXTRACTION_PROMPT["FORMAT"]["record_delimiter"], + PROTEIN_KG_EXTRACTION_PROMPT["FORMAT"]["completion_delimiter"], + ], + ) + + nodes = defaultdict(list) + edges = defaultdict(list) + + for record in records: + match = re.search(r"\((.*)\)", record) + if not match: + continue + inner = match.group(1) + + attributes = split_string_by_multi_markers( + inner, [PROTEIN_KG_EXTRACTION_PROMPT["FORMAT"]["tuple_delimiter"]] + ) + + entity = await handle_single_entity_extraction(attributes, chunk_id) + if entity is not None: + nodes[entity["entity_name"]].append(entity) + continue + + relation = await handle_single_relationship_extraction( + attributes, chunk_id + ) + if relation is not None: + key = (relation["src_id"], relation["tgt_id"]) + edges[key].append(relation) + + return dict(nodes), dict(edges) diff --git a/graphgen/models/search/db/uniprot_search.py b/graphgen/models/search/db/uniprot_search.py index daf4224..6bbf3f8 100644 --- a/graphgen/models/search/db/uniprot_search.py +++ b/graphgen/models/search/db/uniprot_search.py @@ -1,61 +1,117 @@ -import requests -from fastapi import HTTPException +from io import StringIO +from typing import Dict, Optional -from graphgen.utils import logger +from Bio import ExPASy, SeqIO, SwissProt, UniProt +from Bio.Blast import NCBIWWW, NCBIXML -UNIPROT_BASE = "https://rest.uniprot.org/uniprotkb/search" +from graphgen.utils import logger class UniProtSearch: """ UniProt Search client to search with UniProt. 1) Get the protein by accession number. - 2) Search with keywords or protein names. + 2) Search with keywords or protein names (fuzzy search). """ - def get_entry(self, accession: str) -> dict: + def get_by_accession(self, accession: str) -> Optional[dict]: + try: + handle = ExPASy.get_sprot_raw(accession) + record = SwissProt.read(handle) + handle.close() + return self._swissprot_to_dict(record) + except Exception as exc: # pylint: disable=broad-except + logger.error("Accession %s not found: %s", accession, exc) + return None + + @staticmethod + def _swissprot_to_dict(record: SwissProt.Record) -> dict: + """error + Convert a SwissProt.Record to a dictionary. """ - Get the UniProt entry by accession number(e.g., P04637). 
+ functions = [] + for line in record.comments: + if line.startswith("FUNCTION:"): + functions.append(line[9:].strip()) + + return { + "molecule_type": "protein", + "database": "UniProt", + "id": record.accessions[0], + "entry_name": record.entry_name, + "gene_names": record.gene_name, + "protein_name": record.description.split(";")[0].split("=")[-1], + "organism": record.organism.split(" (")[0], + "sequence": str(record.sequence), + "function": functions, + "url": f"https://www.uniprot.org/uniprot/{record.accessions[0]}", + } + + def get_best_hit(self, keyword: str) -> Optional[Dict]: """ - url = f"{UNIPROT_BASE}/{accession}.json" - return self._safe_get(url).json() - - def search( - self, - query: str, - *, - size: int = 10, - cursor: str = None, - fields: list[str] = None, - ) -> dict: + Search UniProt with a keyword and return the best hit. + :param keyword: The search keyword. + :return: A dictionary containing the best hit information or None if not found. """ - Search UniProt with a query string. - :param query: The search query. - :param size: The number of results to return. - :param cursor: The cursor for pagination. - :param fields: The fields to return in the response. - :return: A dictionary containing the search results. + if not keyword.strip(): + return None + + try: + iterator = UniProt.search(keyword, fields=None, batch_size=1) + hit = next(iterator, None) + if hit is None: + return None + return self.get_by_accession(hit["primaryAccession"]) + + except Exception as e: # pylint: disable=broad-except + logger.error("Keyword %s not found: %s", keyword, e) + return None + + def get_by_fasta(self, fasta_sequence: str, threshold: float) -> Optional[Dict]: """ - params = { - "query": query, - "size": size, - } - if cursor: - params["cursor"] = cursor - if fields: - params["fields"] = ",".join(fields) - url = UNIPROT_BASE - return self._safe_get(url, params=params).json() + Search UniProt with a FASTA sequence and return the best hit. + :param fasta_sequence: The FASTA sequence. + :param threshold: E-value threshold for BLAST search. + :return: A dictionary containing the best hit information or None if not found. 
+ """ + try: + if fasta_sequence.startswith(">"): + seq = str(list(SeqIO.parse(StringIO(fasta_sequence), "fasta"))[0].seq) + else: + seq = fasta_sequence.strip() + except Exception as e: # pylint: disable=broad-except + logger.error("Invalid FASTA sequence: %s", e) + return None - @staticmethod - def _safe_get(url: str, params: dict = None) -> requests.Response: - r = requests.get( - url, - params=params, - headers={"Accept": "application/json"}, - timeout=10, - ) - if not r.ok: - logger.error("Search engine error: %s", r.text) - raise HTTPException(r.status_code, "Search engine error.") - return r + if not seq: + logger.error("Empty FASTA sequence provided.") + return None + + # UniProtKB/Swiss-Prot BLAST API + try: + result_handle = NCBIWWW.qblast( + program="blastp", + database="swissprot", + sequence=seq, + hitlist_size=1, + expect=threshold, + ) + blast_record = NCBIXML.read(result_handle) + except Exception as e: # pylint: disable=broad-except + logger.error("BLAST search failed: %s", e) + return None + + if not blast_record.alignments: + logger.info("No BLAST hits found for the given sequence.") + return None + + best_alignment = blast_record.alignments[0] + best_hsp = best_alignment.hsps[0] + if best_hsp.expect > threshold: + logger.info("No BLAST hits below the threshold E-value.") + return None + hit_id = best_alignment.hit_id + + # like sp|P01308.1|INS_HUMAN + accession = hit_id.split("|")[1].split(".")[0] if "|" in hit_id else hit_id + return self.get_by_accession(accession) diff --git a/graphgen/operators/build_kg/build_kg.py b/graphgen/operators/build_kg/build_kg.py index 1f94d2e..8945826 100644 --- a/graphgen/operators/build_kg/build_kg.py +++ b/graphgen/operators/build_kg/build_kg.py @@ -30,7 +30,12 @@ async def build_kg( """ text_chunks = [chunk for chunk in chunks if chunk.type == "text"] - mm_chunks = [chunk for chunk in chunks if chunk.type != "text"] + mm_chunks = [ + chunk + for chunk in chunks + if chunk.type in ("image", "video", "table", "formula") + ] + mo_chunks = [chunk for chunk in chunks if chunk.type in ("genome", "protein")] if len(text_chunks) == 0: logger.info("All text chunks are already in the storage") @@ -42,6 +47,7 @@ async def build_kg( chunks=text_chunks, progress_bar=progress_bar, ) + if len(mm_chunks) == 0: logger.info("All multi-modal chunks are already in the storage") else: @@ -53,16 +59,15 @@ async def build_kg( progress_bar=progress_bar, ) - if anchor_type is not None: - logger.info("Anchoring data based on %s ...", anchor_type) - if anchor_type == "protein": - await build_mo_kg( - llm_client=llm_client, - kg_instance=kg_instance, - chunks=text_chunks, - progress_bar=progress_bar, - ) - else: - logger.error("Anchor type %s is not supported yet.", anchor_type) + if len(mo_chunks) == 0: + logger.info("All multi-omics chunks are already in the storage") + else: + logger.info("[Multi-omics Entity and Relation Extraction] processing ...") + await build_mo_kg( + llm_client=llm_client, + kg_instance=kg_instance, + chunks=mo_chunks, + progress_bar=progress_bar, + ) return kg_instance diff --git a/graphgen/operators/build_kg/build_mo_kg.py b/graphgen/operators/build_kg/build_mo_kg.py index a98ee54..046b173 100644 --- a/graphgen/operators/build_kg/build_mo_kg.py +++ b/graphgen/operators/build_kg/build_mo_kg.py @@ -1,4 +1,4 @@ -import json +from collections import defaultdict from typing import List import gradio as gr @@ -6,8 +6,8 @@ from graphgen.bases import BaseLLMWrapper from graphgen.bases.base_storage import BaseGraphStorage from 
graphgen.bases.datatypes import Chunk -from graphgen.templates import PROTEIN_ANCHOR_PROMPT, PROTEIN_KG_EXTRACTION_PROMPT -from graphgen.utils import detect_main_language, logger, run_concurrent +from graphgen.models import MOKGBuilder +from graphgen.utils import run_concurrent async def build_mo_kg( @@ -25,53 +25,34 @@ async def build_mo_kg( :return: """ - async def extract_mo_info(chunk: Chunk): - content = chunk.content - language = detect_main_language(content) - prompt = PROTEIN_ANCHOR_PROMPT[language].format(chunk=content) - result = await llm_client.generate_answer(prompt) - try: - json_result = json.loads(result) - return json_result - except json.JSONDecodeError: - logger.warning("Failed to parse JSON from LLM response: %s", result) - return {} + mo_builder = MOKGBuilder(llm_client=llm_client) results = await run_concurrent( - extract_mo_info, + mo_builder.extract, chunks, - desc="Extracting multi-omics anchoring information from chunks", + desc="[2/4] Extracting entities and relationships from multi-omics chunks", unit="chunk", progress_bar=progress_bar, ) - # Merge results - from collections import defaultdict - bags = defaultdict(set) - for item in results: - for k, v in item.items(): - if v is not None and str(v).strip(): - bags[k].add(str(v).strip()) - - merged = { - k: " | ".join(sorted(v)) if len(v) > 1 else next(iter(v)) - for k, v in bags.items() - } - - # TODO: search database for more info - # try: - # search_results = await search(merged["Protein accession or ID"]) - # except Exception as e: - # logger.warning("Failed to search for protein info: %s", e) - # search_results = {} + nodes = defaultdict(list) + edges = defaultdict(list) + for n, e in results: + for k, v in n.items(): + nodes[k].extend(v) + for k, v in e.items(): + edges[tuple(sorted(k))].extend(v) + + await run_concurrent( + lambda kv: mo_builder.merge_nodes(kv, kg_instance=kg_instance), + list(nodes.items()), + desc="Inserting entities into storage", + ) - mo_text = "\n".join([f"{k}: {v}" for k, v in merged.items()]) - lang = detect_main_language(mo_text) - prompt = PROTEIN_KG_EXTRACTION_PROMPT[lang].format( - input_text=mo_text, - **PROTEIN_KG_EXTRACTION_PROMPT["FORMAT"], + await run_concurrent( + lambda kv: mo_builder.merge_edges(kv, kg_instance=kg_instance), + list(edges.items()), + desc="Inserting relationships into storage", ) - kg_output = await llm_client.generate_answer(prompt) - print(kg_output) - # TODO: parse kg_output and insert into kg_instance + return kg_instance diff --git a/graphgen/operators/search/db/__init__.py b/graphgen/operators/search/db/__init__.py deleted file mode 100644 index e69de29..0000000 diff --git a/graphgen/operators/search/db/search_uniprot.py b/graphgen/operators/search/db/search_uniprot.py deleted file mode 100644 index e69de29..0000000 diff --git a/graphgen/operators/search/multi_omics_search.py b/graphgen/operators/search/multi_omics_search.py new file mode 100644 index 0000000..fbe10f0 --- /dev/null +++ b/graphgen/operators/search/multi_omics_search.py @@ -0,0 +1,29 @@ +import re +from typing import Dict, Optional + +from graphgen.models import UniProtSearch + + +def _fetch_uniprot(entry: str) -> Optional[Dict]: + entry = entry.strip() + client = UniProtSearch() + + # 1. first try accession search + if re.fullmatch( + r"[OPQ][0-9][A-Z0-9]{3}[0-9]|[A-NR-Z][0-9]([A-Z][A-Z0-9]{2}[0-9]){1,2}", entry + ): + return client.get_by_accession(entry) + + # 2. 
then try keyword search + return client.get_best_hit(entry) + + +def multi_omics_search(entry: str) -> Dict: + """ + Multi-omics search function that tries to fetch protein/gene information. + """ + # TODO: Extend this function to include more omics databases as needed. + result = _fetch_uniprot(entry) + if result: + return {"input": entry, "uniprot": result} + return {"input": entry, "uniprot": None} diff --git a/requirements.txt b/requirements.txt index 82740f0..2572808 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,28 +1,36 @@ -tqdm -openai -python-dotenv -numpy -networkx +tqdm~=4.67.1 +openai~=1.99.1 +python-dotenv~=1.0.1 +numpy~=2.2.6 +networkx~=3.4.2 graspologic -tiktoken -pyecharts -wikipedia -tenacity -nltk -jieba -plotly -pandas +tiktoken~=0.8.0 +pyecharts~=2.0.7 +wikipedia~=1.4.0 +tenacity~=9.0.0 +nltk~=3.9.1 +jieba~=0.42.1 +plotly~=5.24.1 +pandas~=2.2.3 gradio>=5.44.1 kaleido -pyyaml -langcodes -requests -fastapi -trafilatura +pyyaml~=6.0.2 +langcodes~=3.4.1 +requests~=2.32.4 +fastapi~=0.115.6 +trafilatura~=2.0.0 -leidenalg -igraph +leidenalg~=0.10.2 +igraph~=0.11.9 python-louvain # For visualization -matplotlib +matplotlib~=3.10.7 + +pytest~=8.4.1 +rich~=13.9.4 +aiohttp~=3.12.9 +biopython~=1.85 +transformers~=4.57.1 +torch~=2.8.0 +setuptools~=75.1.0 \ No newline at end of file diff --git a/resources/input_examples/protein_qa_demo.json b/resources/input_examples/protein_qa_demo.json new file mode 100644 index 0000000..49199f3 --- /dev/null +++ b/resources/input_examples/protein_qa_demo.json @@ -0,0 +1,64 @@ +[ + { + "type": "text", + "content": "The $4 4 - \\mathbf { k D }$ protein, named harpin, was electroeluted from a preparative SDS-polyacrylamide gel (12). At concentrations ${ \\ge } 5 0 0 \\mathbf { n } \\mathbf { M }$ $( \\geq 2 5 ~ | \\mathbf { \\mu } \\mathbf { g } / \\mathbf { m l } )$ , harpin elicited HR in leaves of tobacco (Fig. 2, sectors 6 and " + }, + { + "type": "text", + "content": "Because supernatants from E. amylovora Ea321(pCPP430) or E. coli DH5α (pCPP430) did not elicit HR, we postulated that harpin was not secreted but rather was present in or on the bacteria. Whole bacteria treated with protease failed to elicit HR, whereas bacteria incubated with protease together with $0 . 5 ~ \\mathrm { m M }$ phenylmethylsulfonyl fluoride (PMSF, a protease inhibitor) did (Table 1). Treatment of bacteria with increasing amounts of protease resulted in a decreased ability to elicit HR that correlated with the disappearance of harpin detectable in SDS-polyacrylamide gels (Table 1). After centrifugation of CFEP at $_ { 1 0 5 , 0 0 0 g }$ for 1 hour, most HR-eliciting activity was found in the supernatant. However, when the cell suspension was brought to $3 0 \\mathrm { \\ m M \\ M g C l } _ { 2 }$ ,before sonication, most activity was associated with the sedimented membrane fraction. Gel-permeation chromatography of unheated CFEP also indicated association of the elicitor with a high molecular weight $( > 1 0 ^ { 6 }$ daltons) fraction, probably membrane vesicles (14). Only the membrane fraction of E. amylovora Ea321(pCPP430) reacted with an antiserum raised in response to harpin (15), further supporting the cell-envelope location of harpin (Fig. 4). " + }, + { + "type": "text", + "content": "HR-eliciting activity, harpin was not detected. However, when the protease inhibitor PMSF $( 0 . 5 \\mathrm { \\ m M } )$ was included, the bacteria retained HR-eliciting activity and possessed detectable harpin for more than 2 hours. 
More protease was required per cell to destroy harpin produced by E. coli $\\mathsf { D H S } \\alpha ( \\mathsf { p C P P } 4 3 0 )$ than by Ea321(pCPP430), suggesting that E. coli $\\mathsf { D H } 5 \\alpha ( \\mathsf { p C P P } 4 3 0 )$ produces more harpin or degrades it more slowly, or both. " + }, + { + "type": "text", + "content": "The ability of bacterial strains to elicit the HR in intact tobacco leaves is related genetically to their ability to elicit a $\\mathbf { K } ^ { + } / \\mathbf { H } ^ { + }$ exchange reaction (XR) in tobacco cell suspension cultures (TCSCs) (16); both reactions require the hrp gene cluster (17). " + }, + { + "type": "text", + "content": "We tested the ability of harpin to raise the pH of TCSC bathing solution, an indicator of the XR (Fig. 5). Cells of E. amylovora, grown in rich medium and added to TCSCs caused an increase in pH of the bathing solution after 2 to 3 hours. Addition of purified harpin caused an increase in pH within 1 hour. Erwinia amylovora mutant Ea321K49, which did not produce harpin in culture, and strains of E. coli containing mutated hrp gene clusters failed to elicit the XR. " + }, + { + "type": "text", + "content": "Table 1. Protease sensitivity of the HR-eliciting activity of whole cells of E. amylovora Ea321(pCPP430). Cells were grown in LB medium, harvested'by centrifugation, and resuspended in 0.1 volume of $5 m M$ potassium phosphate $( \\mathsf { p H } \\thinspace 6 . 5 )$ containing tetracycline (40 $\\mu { \\sf g } / { \\sf m } 1 )$ . After incubation with protease (Sigma P5147), as indicated, at $\\mathfrak { s } 7 ^ { \\circ } \\mathfrak { C }$ for 5 min, $1 0 0 ~ \\mu !$ of each cell suspension was infiltrated into tobacco leaves. Leaf sector collapse was assayed at 24 hours. At the time of infiltration, portions of protease-treated cell mixtures were iysed, held'in boiling water for 10 min, centrifuged for 10 min at $1 2 . 0 0 0 g .$ and electrophoresed on a $10 \\%$ SDS-polyacrylamide gel to detect harpin. Electrophoresis was done for 2 hours at $1 5 m \\mathsf { A }$ followed by staining with Coomassie blue R-250. Cell-free supernatant, produced from the LB culture, was filter-sterilized and then concentrated with the Centriprep-10 (Amicon, Danvers, Massachusetts). " + }, + { + "type": "table", + "img_path": "resources/input_examples/images/0f25783fdfa99042db274ba9f6b3064cf17c5435814edfbee42ae6b19aac37d2.jpg", + "table_caption": [], + "table_footnote": [], + "table_body": "
Protease per milliterTissue collapseHarpin detected
0++
5μg++
10μg++
20 μgWeak+
40 μg-
80μg
80μg + 0.5 mM PMSF++
Cell-free supernatant
" + }, + { + "type": "text", + "content": "expressed fom pCPP1084 in the T7RNA (20). Insertions of Tn5tac1 in hrpN (21) (Fig. 1) abolished the ability of E. coli $\\mathsf { D H } 5 \\alpha ( \\mathsf { p C P P } 4 3 0 )$ to elicit HR on tobacco or produce harpin detectable on Western blots. Ea321T5, a derivative of E. amylo" + }, + { + "type": "text", + "content": "at $\\pmb { 1 0 0 ^ { \\circ } } \\pmb { \\mathbb { C } }$ for 10 min; 8, CFEP $( 5 ~ \\mu 9 )$ from E. coli DH5a(pCPP430) treated at $1 0 0 ^ { \\circ } \\mathsf { C }$ for 10 min; 9, CFEP $( 5 ~ \\mu 9 )$ from E. amylovora Ea321K49 treated at $_ { 1 0 0 ^ { \\circ } \\mathbb { C } }$ for 10 min. Samples from the preparations in lanes 3, 4, 7, and 8 elicited HR in tobacco leaves. Samples were prepared as described (8) and brought to 125 mM tris-HCI $( \\mathsf { p H } 6 . 8 )$ $4 \\%$ SDS, $20 \\%$ glycerol, boiled for 3 min, then electrophoresed through a $10 \\%$ (w/v) polyacrylamide gel with $0 . 1 \\%$ SDS at $1 5 m A$ for 2 hours in a Mighty Small apparatus according to instructions (Hoefer Scientific Instruments, San Francisco, California). The gel was stained with $0 . 0 2 5 \\%$ Coomassie Blue R-250. Low-range prestained molecular weight standards (Bio-Rad 161-0305) were used and calibrated with an unstained protein marker (Bio-Rad 161-0304). Arrow indicates region corresponding to $4 4 \\ k \\mathsf$ " + }, + { + "type": "text", + "content": "DNA sequence data from the $1 . 3 – \\mathbf { k } \\mathbf { b }$ Hind II fragment revealed that hrpN is 1155 base pairs long, and it encodes a 385–amino acid protein (Fig. 1). The 15 $\\mathrm { N H } _ { 2 }$ -terminal residues revealed by amino acid sequencing corresponded to those deduced from the DNA' sequence (Fig. 1). The deduced amino acid sequence of harpin (Fig. 1), which corresponded closely with the analyzed amino acid composition, reveals a glycine-rich protein with a high degree of hydrophilicity. It appears to have an open structure, which may explain its heat stability and sensitivity to proteases. A FASTA search (23) of GenBank for similar proteins revealed similarity only with other glycine-rich proteins, such as several plant cell wall proteins and keratins. " + }, + { + "type": "protein", + "protein_caption": { + "protein name": "harpin", + "gene name from literature": "hrpN", + "source organism": "Erwinia amylovora", + "primary biological role": "elicitor of the plant defense reaction known as the hypersensitive response", + "subcellular localization (active site)": "cell-envelope-associated", + "inducible by": null, + "post-translational modifications": null, + "literature_name": "Harpin, Elicitor of the Hypersensitive Response Produced by the Plant Pathogen Erwinia amylovora", + "experimental evidence": "Harpin caused tobacco leaf lamina to collapse and caused an increase in the pH of bathing solutions of suspension-cultured tobacco cells.", + "gene names from uniprot database:": null, + "catalytic activity:": null, + "amino acids length:": 385, + "protein family:": null, + "protein sequence:": null + } + } +] diff --git a/resources/input_examples/protein_qa_demo.txt b/resources/input_examples/protein_qa_demo.txt deleted file mode 100644 index cbc60df..0000000 --- a/resources/input_examples/protein_qa_demo.txt +++ /dev/null @@ -1,53 +0,0 @@ -# Harpin, Elicitor of the Hypersensitive Response Produced by the Plant Pathogen Erwinia amylovora - -Zhong-Min Wei, Ron J. Laby, Cathy H. Zumoff, David W. Bauer, Sheng Yang He, Alan Collmer, Steven V. 
Beer\* - -A proteinaceous elicitor of the plant defense reaction known as the hypersensitive response was isolated from Erwinia amylovora, the bacterium that causes fire blight of pear, apple, and other rosaceous plants. The elicitor, named harpin, is an acidic, heat-stable, cell-envelope-associated protein with an apparent molecular weight of 44 kilodaltons. Harpin caused tobacco leaf lamina to collapse and caused an increase in the pH of bathing solutions of suspension-cultured tobacco cells. The gene encoding harpin (hrpN) was located in the 40-kilobase hrp gene cluster of E. amylovora, sequenced, and mutated with Tn5tac1. The hrpN mutants were not pathogenic to pear, did not elicit the hypersensitive response, and did not produce harpin. - -The hypersensitive response (HR) of higher plants is characterized by the rapid, localized death of tissues containing an incompatible pathogen (a microorganism that is pathogenic only on other plants). It is associated with the defense of plants against many bacteria, fungi, nematodes, and viruses (1). The molecular basis for HR is unknown, but physiological and genetic observations with bacteria suggest that the same factor that elicits HR in nonhosts also is required for pathogenicity in hosts (1, 2). Production of this factor is controlled by hrp genes, which are highly conserved among many species of plant pathogenic bacteria (2, 3). Functional clusters of hrp genes have been cloned from Erwinia amylovora and Pseudomonas syringae and have been shown to confer on nonpathogenic bacteria the ability to elicit HR after infltration of bacterial suspensions into the intercellular spaces of leaves of tobacco and other plants (4, 5). - -The hrp gene cluster from E. amylovora (Fig: 1), contained in the cosmid pCPP430, is expressed particularly well in Escherichia coli (4). We report here the isolation of a proteinaceous elicitor of the HR from Escherichia coli $\mathsf { D H } 5 \alpha ( \mathsf { p C P P } 4 3 0 )$ and from E. amylovora, the bacterium that causes the devastating disease of apple, pear, and other rosaceous plants known as fire blight (6). We propose the name harpin for the HR-elicitor from E. amylovora and hrpN for the gene that encodes it. - -Tobacco leaves infiltrated (7) with cellfree culture supernatants of E. amylovora Ea321, Ea321(pCPP430) or E. coli $\mathrm { \bar { \ D H } } 5 \mathbf { \alpha }$ (pCPP430) showed no HR, whereas leaves infiltrated with a centrifuged and filtersterilized preparation from sonicated cells of E. coli $\bar { \mathsf { D H } } \bar { 5 } \alpha ( \mathsf { p C P P } 4 3 0 )$ showed a strong HR within 12 hours (Fig. 2). The HReliciting activity of this cell-free elicitor - -13). Harpin also elicited HR in tomato and Arabidopsis thaliana. Purified harpin was protease-sensitive, heat-stable, and had a pI of 4.3 on thin-layer IEF gels (13). - -preparation (CFEP) (8) was heat-stable but highly sensitive to protease (9). - -Further purifcation was achieved by holding the CFEP in a boiling-water bath for $1 0 ~ \mathrm { \ m i n }$ and removing the insoluble material by centrifugation. The remaining soluble material was electrophoresed on an SDS-polyacrylamide gel (Fig. 3). A protein band corresponding to ${ 4 4 \ k D }$ was uniquely present in all preparations with HR-eliciting activity (Fig. 3). 
Similarly, after resolution of the CFEP on an isoelectric-focusing (IEF) granulated gel bed (10) or by ionexchange chromatography (1 1), the fractions with HR-eliciting activity always contained a $4 4 - \mathbf { k D }$ protein. - -The $4 4 - \mathbf { k D }$ protein, named harpin, was electroeluted from a preparative SDS-polyacrylamide gel (12). At concentrations ${ \ge } 5 0 0 \mathbf { n } \mathbf { M }$ $( \geq 2 5 ~ | \mathbf { \mu } \mathbf { g } / \mathbf { m l } )$ , harpin elicited HR in leaves of tobacco (Fig. 2, sectors 6 and - -Because supernatants from E. amylovora Ea321(pCPP430) or E. coli DH5α (pCPP430) did not elicit HR, we postulated that harpin was not secreted but rather was present in or on the bacteria. Whole bacteria treated with protease failed to elicit HR, whereas bacteria incubated with protease together with $0 . 5 ~ \mathrm { m M }$ phenylmethylsulfonyl fluoride (PMSF, a protease inhibitor) did (Table 1). Treatment of bacteria with increasing amounts of protease resulted in a decreased ability to elicit HR that correlated with the disappearance of harpin detectable in SDS-polyacrylamide gels (Table 1). After centrifugation of CFEP at $_ { 1 0 5 , 0 0 0 g }$ for 1 hour, most HR-eliciting activity was found in the supernatant. However, when the cell suspension was brought to $3 0 \mathrm { \ m M \ M g C l } _ { 2 }$ ,before sonication, most activity was associated with the sedimented membrane fraction. Gel-permeation chromatography of unheated CFEP also indicated association of the elicitor with a high molecular weight $( > 1 0 ^ { 6 }$ daltons) fraction, probably membrane vesicles (14). Only the membrane fraction of E. amylovora Ea321(pCPP430) reacted with an antiserum raised in response to harpin (15), further supporting the cell-envelope location of harpin (Fig. 4). - -Cell suspensions of $\mathtt { E a } 3 2 1 ( \mathsf { p C P P 4 } 3 0 )$ or $\mathsf { D H } 5 \alpha ( \mathsf { p C P P } 4 3 0 )$ in log-phase maintained their HR-eliciting activity for less than 0.5 hour and 1 hour, respectively, in the presence of tetracycline $( 4 0 ~ \mu \mathrm { g } / \mathrm { m l } )$ , an inhibitor of protein synthesis. Once cells lost - -insertions to the left of hrpN abolish pathogenicity to pear and eliminate or greatly disrupt HR-elicitation. Flags indicate the orientation of the outward-facing $P _ { \mathrm { t a c } }$ promoter of Tn5tac1 $( 2 8 )$ in the tac1-1 and tac1-2 insertions. The two tac1 and K49 mutations were marker-exchanged into the E. amylovora 321 genome. The mutations abolished the ability of E. amylovora to cayse disease in immature pear fruit, to elicit HR in tobacco leaves, and to produce harpin; similarly, they abolished the ability of E. coli DH5a(pCPP43o) to elicit HR and to produce harpin. The effect of the K49 mutation on harpin production probably is regulatory. (B) Deduced amino acid sequence of harpin. The DNA sequence is accessioned in GenBank (#M92994). Amino acids confirmed by $N H _ { 2 } -$ terminal sequence analysis of the purified protein are underlined. - -HR-eliciting activity, harpin was not detected. However, when the protease inhibitor PMSF $( 0 . 5 \mathrm { \ m M } )$ was included, the bacteria retained HR-eliciting activity and possessed detectable harpin for more than 2 hours. More protease was required per cell to destroy harpin produced by E. coli $\mathsf { D H S } \alpha ( \mathsf { p C P P } 4 3 0 )$ than by Ea321(pCPP430), suggesting that E. 
coli $\mathsf { D H } 5 \alpha ( \mathsf { p C P P } 4 3 0 )$ produces more harpin or degrades it more slowly, or both. - -The ability of bacterial strains to elicit the HR in intact tobacco leaves is related genetically to their ability to elicit a $\mathbf { K } ^ { + } / \mathbf { H } ^ { + }$ exchange reaction (XR) in tobacco cell suspension cultures (TCSCs) (16); both reactions require the hrp gene cluster (17). - -We tested the ability of harpin to raise the pH of TCSC bathing solution, an indicator of the XR (Fig. 5). Cells of E. amylovora, grown in rich medium and added to TCSCs caused an increase in pH of the bathing solution after 2 to 3 hours. Addition of purified harpin caused an increase in pH within 1 hour. Erwinia amylovora mutant Ea321K49, which did not produce harpin in culture, and strains of E. coli containing mutated hrp gene clusters failed to elicit the XR. - -Harpin also was isolated from E. amylovora Ea321, which had been preincubated for 5 hours in an HR-inducing medium (18). Harpin from E. amylovora Ea321 was identical in physical and biological properties to that isolated from E. coli $\mathrm { \bar { D H } } 5 \mathbf { \alpha }$ (pCPP430). No protease-sensitive, heatstable, HR-eliciting activity, associated with a 44-kD protein, was seen in cell-free extracts from E. coli $ { \mathbf Ḋ \mathrm { H } } 5 { \mathbf Ḋ \mathrm { d } } ( { \mathbf Ḋ \mathrm { C P P } } 9 ) $ , which harbors the vector of pCPP430 (Fig. 3, lane 5). On the basis of the visual intensities of the 44-kD bands on SDS-polyacrylamide gels (Fig. 3, lanes 7 and 8), we estimate that Ea321 produces about $10 \%$ as much harpin as does E. coli $\mathsf { D H } 5 \alpha ( \mathsf { p C P P } 4 3 0 )$ . - -The structural gene encoding harpin, designated hrpN, was located within the hrp gene cluster (Fig. 1) by hybridization with an oligonucleotide probe corresponding to the ninth to the fifteenth amino acid residues from the $\mathrm { N H } _ { 2 }$ -terminus of harpin (19) . The 1.3-kb Hind 'II fragment that hybridized was cloned into pBluescript $\ M \ 1 3 +$ (Stratagene, La Jolla, California) and designated pCPP1084. A unique $4 4 - \mathbf { k D }$ protein, which was immunoprecipitated by antiserum raised against harpin (15), was - -Table 1. Protease sensitivity of the HR-eliciting activity of whole cells of E. amylovora Ea321(pCPP430). Cells were grown in LB medium, harvested'by centrifugation, and resuspended in 0.1 volume of $5 m M$ potassium phosphate $( \mathsf { p H } \thinspace 6 . 5 )$ containing tetracycline (40 $\mu { \sf g } / { \sf m } 1 )$ . After incubation with protease (Sigma P5147), as indicated, at $\mathfrak { s } 7 ^ { \circ } \mathfrak { C }$ for 5 min, $1 0 0 ~ \mu !$ of each cell suspension was infiltrated into tobacco leaves. Leaf sector collapse was assayed at 24 hours. At the time of infiltration, portions of protease-treated cell mixtures were iysed, held'in boiling water for 10 min, centrifuged for 10 min at $1 2 . 0 0 0 g .$ and electrophoresed on a $10 \%$ SDS-polyacrylamide gel to detect harpin. Electrophoresis was done for 2 hours at $1 5 m \mathsf { A }$ followed by staining with Coomassie blue R-250. Cell-free supernatant, produced from the LB culture, was filter-sterilized and then concentrated with the Centriprep-10 (Amicon, Danvers, Massachusetts). - -
Protease per milliterTissue collapseHarpin detected
0++
5μg++
10μg++
20 μgWeak+
40 μg-
80μg
80μg + 0.5 mM PMSF++
Cell-free supernatant
- -expressed fom pCPP1084 in the T7RNA (20). Insertions of Tn5tac1 in hrpN (21) (Fig. 1) abolished the ability of E. coli $\mathsf { D H } 5 \alpha ( \mathsf { p C P P } 4 3 0 )$ to elicit HR on tobacco or produce harpin detectable on Western blots. Ea321T5, a derivative of E. amylo - -at $\pmb { 1 0 0 ^ { \circ } } \pmb { \mathbb { C } }$ for 10 min; 8, CFEP $( 5 ~ \mu 9 )$ from E. coli DH5a(pCPP430) treated at $1 0 0 ^ { \circ } \mathsf { C }$ for 10 min; 9, CFEP $( 5 ~ \mu 9 )$ from E. amylovora Ea321K49 treated at $_ { 1 0 0 ^ { \circ } \mathbb { C } }$ for 10 min. Samples from the preparations in lanes 3, 4, 7, and 8 elicited HR in tobacco leaves. Samples were prepared as described (8) and brought to 125 mM tris-HCI $( \mathsf { p H } 6 . 8 )$ $4 \%$ SDS, $20 \%$ glycerol, boiled for 3 min, then electrophoresed through a $10 \%$ (w/v) polyacrylamide gel with $0 . 1 \%$ SDS at $1 5 m A$ for 2 hours in a Mighty Small apparatus according to instructions (Hoefer Scientific Instruments, San Francisco, California). The gel was stained with $0 . 0 2 5 \%$ Coomassie Blue R-250. Low-range prestained molecular weight standards (Bio-Rad 161-0305) were used and calibrated with an unstained protein marker (Bio-Rad 161-0304). Arrow indicates region corresponding to $4 4 \ k \mathsf$ - -HR in tobacco leaves or to cause fire blight in highly susceptible immature pear fruits (22). Both pathogenic and HR-eliciting abilities were restored to Ea321T5 by pCPP1084, in trans, and the resulting strains produced harpin. - -DNA sequence data from the $1 . 3 – \mathbf { k } \mathbf { b }$ Hind II fragment revealed that hrpN is 1155 base pairs long, and it encodes a 385–amino acid protein (Fig. 1). The 15 $\mathrm { N H } _ { 2 }$ -terminal residues revealed by amino acid sequencing corresponded to those deduced from the DNA' sequence (Fig. 1). The deduced amino acid sequence of harpin (Fig. 1), which corresponded closely with the analyzed amino acid composition, reveals a glycine-rich protein with a high degree of hydrophilicity. It appears to have an open structure, which may explain its heat stability and sensitivity to proteases. A FASTA search (23) of GenBank for similar proteins revealed similarity only with other glycine-rich proteins, such as several plant cell wall proteins and keratins. - -The properties of the E. amylovora harpin protein are consistent with numerous physiological observations that were made after the discovery in 1963 that bacteria can elicit HR (24). These have indicated that elicitation of HR requires de novo protein synthesis and production of a labile factor by bacteria in close proximity to plant cells, and that each bacterial cell typically kills only one plant cell, thus explaining the requirement for high numbers of bacteria to cause death of enough plant cells to produce macroscopically visible collapse of the tissue (1, 25). - -The nonpathogenic phenotype of the hrpN mutation in Ea321T5 also is characteristic of hrp mutations in other phytopathogenic bacteria (2) and indicates that harpin is a primary determinant of pathogenicity in E. amylovora. That harpin has an essential role in both disease and plant defense reactions is puzzling but may be based on differential proteolysis or differential expression of hrp genes in host and nonhost plants (18). 
- -Toxins, plant cell wall-degrading enzymes, and phytohormones contribute to the virulence (degree of pathogenicity) of certain members of the important group of phytopathogenic bacteria that possess limited host ranges and produce necrotic lesions after multiplication in compatible hosts (26). The hrp genes, in contrast, are absolutely required for pathogenicity by these bacteria, which include species of Erwinia, Pseudomonas, and Xanthomonas. The conservation of the hrp genes suggests that the E. amylovora harpin may be the archetypical disease determinant for these pathogens. From 51c12ce8fb04e5eacbd2697a389eec43e53c1450 Mon Sep 17 00:00:00 2001 From: chenzihong-gavin Date: Mon, 3 Nov 2025 15:39:59 +0800 Subject: [PATCH 7/8] chore: downgrade numpy in requirements.txt --- requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements.txt b/requirements.txt index 2572808..cf19eeb 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,7 +1,7 @@ tqdm~=4.67.1 openai~=1.99.1 python-dotenv~=1.0.1 -numpy~=2.2.6 +numpy<1.26 networkx~=3.4.2 graspologic tiktoken~=0.8.0 From fa6e32a08ea5b5c9a0c9a3045e7fba374f6bf3f7 Mon Sep 17 00:00:00 2001 From: chenzihong-gavin Date: Mon, 3 Nov 2025 15:48:03 +0800 Subject: [PATCH 8/8] fix: fix dependencies --- requirements.txt | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/requirements.txt b/requirements.txt index cf19eeb..223dfb5 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,7 +1,7 @@ tqdm~=4.67.1 openai~=1.99.1 python-dotenv~=1.0.1 -numpy<1.26 +numpy~=2.2.6 networkx~=3.4.2 graspologic tiktoken~=0.8.0 @@ -19,6 +19,7 @@ langcodes~=3.4.1 requests~=2.32.4 fastapi~=0.115.6 trafilatura~=2.0.0 +gensim~=4.4.3 leidenalg~=0.10.2 igraph~=0.11.9