diff --git a/Cargo.lock b/Cargo.lock index b13bf243ae..e49dd0c7b2 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -4913,6 +4913,7 @@ version = "0.17.0" dependencies = [ "ahash", "arrow", + "arrow-csv", "arrow-json", "arroy", "async-openai", diff --git a/Cargo.toml b/Cargo.toml index e4495c644f..c9477dab29 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -162,6 +162,7 @@ parquet = { version = "56.2.0" } arrow-json = { version = "56.2.0" } arrow-buffer = { version = "56.2.0" } arrow-schema = { version = "56.2.0" } +arrow-csv = { version = "56.2.0" } arrow-array = { version = "56.2.0", features = ["chrono-tz"] } arrow-cast = { version = "56.2.0" } arrow-ipc = { version = "56.2.0" } diff --git a/python/pyproject.toml b/python/pyproject.toml index 27e1952a0a..5d8310936a 100644 --- a/python/pyproject.toml +++ b/python/pyproject.toml @@ -33,7 +33,7 @@ pyvis = ["pyvis >= 0.3.2"] networkx = ["networkx >= 2.6.3"] export = ["raphtory[pyvis,networkx]"] all = ["raphtory[export,plot]"] -dev = ["docstring_parser >= 0.16", "pandas-stubs", "maturin>=1.8.3", "tox>=4.25"] +dev = ["docstring_parser >= 0.16", "pandas-stubs", "pyarrow-stubs", "maturin>=1.8.3", "tox>=4.25"] test = ["raphtory[all]", "requests >= 2.31.0", "pyjwt[crypto] >= 2.10.1", "pytest >= 8", "pytest-benchmark >= 5.1.0", "polars >= 1.35.2", "fireducks; sys_platform != 'win32' and python_version < '3.14'", "duckdb >= 1.4.2"] tox = ["nbmake"] diff --git a/python/python/raphtory/__init__.pyi b/python/python/raphtory/__init__.pyi index 41e385b8bc..9b0a253104 100644 --- a/python/python/raphtory/__init__.pyi +++ b/python/python/raphtory/__init__.pyi @@ -1,7 +1,6 @@ """ Raphtory graph analytics library """ - from __future__ import annotations ############################################################################### @@ -24,56 +23,14 @@ import numpy as np from numpy.typing import NDArray from datetime import datetime from pandas import DataFrame +from pyarrow import DataType from os import PathLike import networkx as nx # type: ignore import pyvis # type: ignore from raphtory.iterables import * -__all__ = [ - "GraphView", - "Graph", - "PersistentGraph", - "Node", - "NodeFilterBuilder", - "Nodes", - "PathFromNode", - "PathFromGraph", - "MutableNode", - "Edge", - "Edges", - "NestedEdges", - "MutableEdge", - "Properties", - "PyPropValueList", - "Metadata", - "MetadataView", - "TemporalProperties", - "PropertiesView", - "TemporalProperty", - "EventTime", - "OptionalEventTime", - "History", - "HistoryTimestamp", - "HistoryDateTime", - "HistoryEventId", - "Intervals", - "WindowSet", - "IndexSpecBuilder", - "IndexSpec", - "version", - "graphql", - "algorithms", - "graph_loader", - "graph_gen", - "vectors", - "node_state", - "filter", - "iterables", - "nullmodels", - "plottingutils", -] - -class GraphView(object): +__all__ = ['GraphView', 'Graph', 'PersistentGraph', 'Node', 'NodeFilterBuilder', 'Nodes', 'PathFromNode', 'PathFromGraph', 'MutableNode', 'Edge', 'Edges', 'NestedEdges', 'MutableEdge', 'Properties', 'PyPropValueList', 'PropType', 'Metadata', 'MetadataView', 'TemporalProperties', 'PropertiesView', 'TemporalProperty', 'EventTime', 'OptionalEventTime', 'History', 'HistoryTimestamp', 'HistoryDateTime', 'HistoryEventId', 'Intervals', 'WindowSet', 'IndexSpecBuilder', 'IndexSpec', 'version', 'graphql', 'algorithms', 'graph_loader', 'graph_gen', 'vectors', 'node_state', 'filter', 'iterables', 'nullmodels', 'plottingutils'] +class GraphView(object): """Graph view is a read-only version of a graph at a certain point in time.""" def __eq__(self, value): @@ -265,9 
+222,7 @@ class GraphView(object): GraphView: The layered view """ - def expanding( - self, step: int | str, alignment_unit: str | None = None - ) -> WindowSet: + def expanding(self, step: int | str, alignment_unit: str | None = None) -> WindowSet: """ Creates a `WindowSet` with the given `step` size using an expanding window. @@ -468,12 +423,7 @@ class GraphView(object): Properties: Properties paired with their names """ - def rolling( - self, - window: int | str, - step: int | str | None = None, - alignment_unit: str | None = None, - ) -> WindowSet: + def rolling(self, window: int | str, step: int | str | None = None, alignment_unit: str | None = None) -> WindowSet: """ Creates a `WindowSet` with the given `window` size and optional `step` using a rolling window. If `alignment_unit` is not "unaligned" and a `step` larger than `window` is provided, some time entries @@ -610,14 +560,7 @@ class GraphView(object): GraphView: Returns the subgraph """ - def to_networkx( - self, - explode_edges: bool = False, - include_node_properties: bool = True, - include_edge_properties: bool = True, - include_update_history: bool = True, - include_property_history: bool = True, - ) -> nx.MultiDiGraph: + def to_networkx(self, explode_edges: bool = False, include_node_properties: bool = True, include_edge_properties: bool = True, include_update_history: bool = True, include_property_history: bool = True) -> nx.MultiDiGraph: """ Returns a graph with NetworkX. @@ -636,19 +579,7 @@ class GraphView(object): nx.MultiDiGraph: A Networkx MultiDiGraph. """ - def to_pyvis( - self, - explode_edges: bool = False, - edge_color: str = "#000000", - shape: str = "dot", - node_image: Optional[str] = None, - edge_weight: Optional[str] = None, - edge_label: Optional[str] = None, - colour_nodes_by_type: bool = False, - directed: bool = True, - notebook: bool = False, - **kwargs: Any, - ) -> pyvis.network.Network: + def to_pyvis(self, explode_edges: bool = False, edge_color: str = '#000000', shape: str = 'dot', node_image: Optional[str] = None, edge_weight: Optional[str] = None, edge_label: Optional[str] = None, colour_nodes_by_type: bool = False, directed: bool = True, notebook: bool = False, **kwargs: Any) -> pyvis.network.Network: """ Draw a graph with PyVis. Pyvis is a required dependency. If you intend to use this function make sure that you install Pyvis @@ -709,14 +640,7 @@ class GraphView(object): GraphView: The layered view """ - def vectorise( - self, - embedding: Callable[[list], list], - nodes: bool | str = True, - edges: bool | str = True, - cache: Optional[str] = None, - verbose: bool = False, - ) -> VectorisedGraph: + def vectorise(self, embedding: Callable[[list], list], nodes: bool | str = True, edges: bool | str = True, cache: Optional[str] = None, verbose: bool = False) -> VectorisedGraph: """ Create a VectorisedGraph from the current graph. @@ -752,7 +676,7 @@ class GraphView(object): Optional[int]: """ -class Graph(GraphView): +class Graph(GraphView): """ A temporal graph with event semantics. @@ -763,16 +687,10 @@ class Graph(GraphView): def __new__(cls, num_shards: Optional[int] = None) -> Graph: """Create and return a new object. See help(type) for accurate signature.""" - def __reduce__(self): ... - def add_edge( - self, - timestamp: TimeInput, - src: str | int, - dst: str | int, - properties: Optional[PropInput] = None, - layer: Optional[str] = None, - event_id: Optional[int] = None, - ) -> MutableEdge: + def __reduce__(self): + ... 
+ + def add_edge(self, timestamp: TimeInput, src: str|int, dst: str|int, properties: Optional[PropInput] = None, layer: Optional[str] = None, event_id: Optional[int] = None) -> MutableEdge: """ Adds a new edge with the given source and destination nodes and properties to the graph. @@ -805,14 +723,7 @@ class Graph(GraphView): GraphError: If the operation fails. """ - def add_node( - self, - timestamp: TimeInput, - id: str | int, - properties: Optional[PropInput] = None, - node_type: Optional[str] = None, - event_id: Optional[int] = None, - ) -> MutableNode: + def add_node(self, timestamp: TimeInput, id: str|int, properties: Optional[PropInput] = None, node_type: Optional[str] = None, event_id: Optional[int] = None) -> MutableNode: """ Adds a new node with the given id and properties to the graph. @@ -830,12 +741,7 @@ class Graph(GraphView): GraphError: If the operation fails. """ - def add_properties( - self, - timestamp: TimeInput, - properties: PropInput, - event_id: Optional[int] = None, - ) -> None: + def add_properties(self, timestamp: TimeInput, properties: PropInput, event_id: Optional[int] = None) -> None: """ Adds properties to the graph. @@ -912,14 +818,7 @@ class Graph(GraphView): None: """ - def create_node( - self, - timestamp: TimeInput, - id: str | int, - properties: Optional[PropInput] = None, - node_type: Optional[str] = None, - event_id: Optional[int] = None, - ) -> MutableNode: + def create_node(self, timestamp: TimeInput, id: str|int, properties: Optional[PropInput] = None, node_type: Optional[str] = None, event_id: Optional[int] = None) -> MutableNode: """ Creates a new node with the given id and properties to the graph. It fails if the node already exists. @@ -949,7 +848,7 @@ class Graph(GraphView): Graph: """ - def edge(self, src: str | int, dst: str | int) -> MutableEdge: + def edge(self, src: str|int, dst: str|int) -> MutableEdge: """ Gets the edge with the specified source and destination nodes @@ -1042,9 +941,7 @@ class Graph(GraphView): GraphError: If the operation fails. """ - def import_edges_as( - self, edges: List[Edge], new_ids: List[Tuple[int, int]], merge: bool = False - ) -> None: + def import_edges_as(self, edges: List[Edge], new_ids: List[Tuple[int, int]], merge: bool = False) -> None: """ Import multiple edges into the graph with new ids. @@ -1079,9 +976,7 @@ class Graph(GraphView): GraphError: If the operation fails. """ - def import_node_as( - self, node: Node, new_id: str | int, merge: bool = False - ) -> MutableNode: + def import_node_as(self, node: Node, new_id: str|int, merge: bool = False) -> MutableNode: """ Import a single node into the graph with new id. @@ -1116,9 +1011,7 @@ class Graph(GraphView): GraphError: If the operation fails. """ - def import_nodes_as( - self, nodes: List[Node], new_ids: List[str | int], merge: bool = False - ) -> None: + def import_nodes_as(self, nodes: List[Node], new_ids: List[str|int], merge: bool = False) -> None: """ Import multiple nodes into the graph with new ids. 
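The stubs above flatten the `add_node`/`add_edge`/`create_node` signatures onto single lines; a minimal sketch of the mutation API they describe may help sanity-check them (ids, timestamps, and property values here are illustrative, not taken from this PR):

```python
from raphtory import Graph

g = Graph()
g.add_node(timestamp=1, id="alice", properties={"age": 30})
g.add_node(timestamp=1, id="bob")
# add_edge returns a MutableEdge; layer and event_id are optional
g.add_edge(timestamp=2, src="alice", dst="bob", properties={"weight": 1.0}, layer="friends")
```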
@@ -1163,20 +1056,12 @@ class Graph(GraphView): Graph: the loaded graph with initialised cache """ - def load_edge_metadata_from_df( - self, - data: Any, - src: str, - dst: str, - metadata: Optional[List[str]] = None, - shared_metadata: Optional[PropInput] = None, - layer: Optional[str] = None, - layer_col: Optional[str] = None, - ) -> None: + def load_edge_metadata(self, data: Any, src: str, dst: str, metadata: Optional[List[str]] = None, shared_metadata: Optional[PropInput] = None, layer: Optional[str] = None, layer_col: Optional[str] = None, schema: Optional[list[tuple[str, DataType | PropType | str]] | dict[str, DataType | PropType | str]] = None, csv_options: Optional[dict[str, str | bool]] = None) -> None: """ - Load edge metadata into the graph from any data source that supports the ArrowStreamExportable protocol (by providing an __arrow_c_stream__() method). - This includes, but is not limited to: Pandas dataframes, FireDucks(.pandas) dataframes, - Polars dataframes, Arrow tables, DuckDB (eg. DuckDBPyRelation obtained from running an SQL query) + Load edge metadata into the graph from any data source that supports the ArrowStreamExportable protocol (by providing an __arrow_c_stream__() method), + a path to a CSV or Parquet file, or a directory containing multiple CSV or Parquet files. + The following are known to support the ArrowStreamExportable protocol: Pandas dataframes, FireDucks(.pandas) dataframes, + Polars dataframes, Arrow tables, DuckDB (e.g. DuckDBPyRelation obtained from running an SQL query). Arguments: data (Any): The data source containing edge information. @@ -1186,6 +1071,8 @@ class Graph(GraphView): shared_metadata (PropInput, optional): A dictionary of metadata properties that will be added to every edge. Defaults to None. layer (str, optional): The edge layer name. Defaults to None. layer_col (str, optional): The edge layer column name in a dataframe. Defaults to None. + schema (list[tuple[str, DataType | PropType | str]] | dict[str, DataType | PropType | str], optional): A list of (column_name, column_type) tuples or dict of {"column_name": column_type} to cast columns to. Defaults to None. + csv_options (dict[str, str | bool], optional): A dictionary of CSV reading options such as delimiter, comment, escape, quote, and terminator characters, as well as allow_truncated_rows and has_header flags. Defaults to None. Returns: None: This function does not return a value if the operation is successful. @@ -1194,91 +1081,25 @@ class Graph(GraphView): GraphError: If the operation fails. """ - def load_edge_props_from_pandas( - self, - df: DataFrame, - src: str, - dst: str, - metadata: Optional[List[str]] = None, - shared_metadata: Optional[PropInput] = None, - layer: Optional[str] = None, - layer_col: Optional[str] = None, - ) -> None: - """ - Load edge properties from a Pandas DataFrame. - - Arguments: - df (DataFrame): The Pandas DataFrame containing edge information. - src (str): The column name for the source node. - dst (str): The column name for the destination node. - metadata (List[str], optional): List of edge metadata column names. Defaults to None. - shared_metadata (PropInput, optional): A dictionary of metadata properties that will be added to every edge. Defaults to None. - layer (str, optional): The edge layer name. Defaults to None. - layer_col (str, optional): The edge layer col name in dataframe. Defaults to None. - - Returns: - None: This function does not return a value, if the operation is successful. 
- - Raises: - GraphError: If the operation fails. - """ - - def load_edge_props_from_parquet( - self, - parquet_path: str, - src: str, - dst: str, - metadata: Optional[List[str]] = None, - shared_metadata: Optional[PropInput] = None, - layer: Optional[str] = None, - layer_col: Optional[str] = None, - ) -> None: - """ - Load edge properties from parquet file - - Arguments: - parquet_path (str): Parquet file or directory of Parquet files path containing edge information. - src (str): The column name for the source node. - dst (str): The column name for the destination node. - metadata (List[str], optional): List of edge metadata column names. Defaults to None. - shared_metadata (PropInput, optional): A dictionary of metadata properties that will be added to every edge. Defaults to None. - layer (str, optional): The edge layer name. Defaults to None. - layer_col (str, optional): The edge layer col name in dataframe. Defaults to None. - - Returns: - None: This function does not return a value, if the operation is successful. - - Raises: - GraphError: If the operation fails. - """ - - def load_edges_from_df( - self, - data: Any, - time: str, - src: str, - dst: str, - properties: Optional[List[str]] = None, - metadata: Optional[List[str]] = None, - shared_metadata: Optional[PropInput] = None, - layer: Optional[str] = None, - layer_col: Optional[str] = None, - ) -> None: + def load_edges(self, data: Any, time: str, src: str, dst: str, properties: Optional[List[str]] = None, metadata: Optional[List[str]] = None, shared_metadata: Optional[PropInput] = None, layer: Optional[str] = None, layer_col: Optional[str] = None, schema: Optional[list[tuple[str, DataType | PropType | str]] | dict[str, DataType | PropType | str]] = None, csv_options: Optional[dict[str, str | bool]] = None) -> None: """ - Load edges into the graph from any data source that supports the ArrowStreamExportable protocol (by providing an __arrow_c_stream__() method). - This includes, but is not limited to: Pandas dataframes, FireDucks(.pandas) dataframes, - Polars dataframes, Arrow tables, DuckDB (eg. DuckDBPyRelation obtained from running an SQL query) + Load edges into the graph from any data source that supports the ArrowStreamExportable protocol (by providing an __arrow_c_stream__() method), + a path to a CSV or Parquet file, or a directory containing multiple CSV or Parquet files. + The following are known to support the ArrowStreamExportable protocol: Pandas dataframes, FireDucks(.pandas) dataframes, + Polars dataframes, Arrow tables, DuckDB (e.g. DuckDBPyRelation obtained from running an SQL query). Arguments: data (Any): The data source containing the edges. time (str): The column name for the update timestamps. - src (str): The column name for the source node ids. - dst (str): The column name for the destination node ids. + src (str): The column name for the source node IDs. + dst (str): The column name for the destination node IDs. properties (List[str], optional): List of edge property column names. Defaults to None. metadata (List[str], optional): List of edge metadata column names. Defaults to None. shared_metadata (PropInput, optional): A dictionary of metadata properties that will be added to every edge. Defaults to None. layer (str, optional): A value to use as the layer for all edges. Cannot be used in combination with layer_col. Defaults to None. layer_col (str, optional): The edge layer column name in a dataframe. Cannot be used in combination with layer. Defaults to None. 
+ schema (list[tuple[str, DataType | PropType | str]] | dict[str, DataType | PropType | str], optional): A list of (column_name, column_type) tuples or dict of {"column_name": column_type} to cast columns to. Defaults to None. + csv_options (dict[str, str | bool], optional): A dictionary of CSV reading options such as delimiter, comment, escape, quote, and terminator characters, as well as allow_truncated_rows and has_header flags. Defaults to None. Returns: None: This function does not return a value if the operation is successful. @@ -1287,72 +1108,6 @@ class Graph(GraphView): GraphError: If the operation fails. """ - def load_edges_from_pandas( - self, - df: DataFrame, - time: str, - src: str, - dst: str, - properties: Optional[List[str]] = None, - metadata: Optional[List[str]] = None, - shared_metadata: Optional[PropInput] = None, - layer: Optional[str] = None, - layer_col: Optional[str] = None, - ) -> None: - """ - Load edges from a Pandas DataFrame into the graph. - - Arguments: - df (DataFrame): The Pandas DataFrame containing the edges. - time (str): The column name for the update timestamps. - src (str): The column name for the source node ids. - dst (str): The column name for the destination node ids. - properties (List[str], optional): List of edge property column names. Defaults to None. - metadata (List[str], optional): List of edge metadata column names. Defaults to None. - shared_metadata (PropInput, optional): A dictionary of metadata properties that will be added to every edge. Defaults to None. - layer (str, optional): A value to use as the layer for all edges. Cannot be used in combination with layer_col. Defaults to None. - layer_col (str, optional): The edge layer col name in dataframe. Cannot be used in combination with layer. Defaults to None. - - Returns: - None: This function does not return a value, if the operation is successful. - - Raises: - GraphError: If the operation fails. - """ - - def load_edges_from_parquet( - self, - parquet_path: str, - time: str, - src: str, - dst: str, - properties: Optional[List[str]] = None, - metadata: Optional[List[str]] = None, - shared_metadata: Optional[PropInput] = None, - layer: Optional[str] = None, - layer_col: Optional[str] = None, - ) -> None: - """ - Load edges from a Parquet file into the graph. - - Arguments: - parquet_path (str): Parquet file or directory of Parquet files path containing edges - time (str): The column name for the update timestamps. - src (str): The column name for the source node ids. - dst (str): The column name for the destination node ids. - properties (List[str], optional): List of edge property column names. Defaults to None. - metadata (List[str], optional): List of edge metadata column names. Defaults to None. - shared_metadata (PropInput, optional): A dictionary of metadata properties that will be added to every edge. Defaults to None. - layer (str, optional): A value to use as the layer for all edges. Cannot be used in combination with layer_col. Defaults to None. - layer_col (str, optional): The edge layer col name in dataframe. Cannot be used in combination with layer. Defaults to None. - - Returns: - None: This function does not return a value, if the operation is successful. - - Raises: - GraphError: If the operation fails. 
- """ - @staticmethod def load_from_file(path: str) -> Graph: """ @@ -1365,19 +1120,12 @@ class Graph(GraphView): Graph: """ - def load_node_metadata_from_df( - self, - data: Any, - id: str, - node_type: Optional[str] = None, - node_type_col: Optional[str] = None, - metadata: Optional[List[str]] = None, - shared_metadata: Optional[PropInput] = None, - ) -> None: + def load_node_metadata(self, data: Any, id: str, node_type: Optional[str] = None, node_type_col: Optional[str] = None, metadata: Optional[List[str]] = None, shared_metadata: Optional[PropInput] = None, schema: Optional[list[tuple[str, DataType | PropType | str]] | dict[str, DataType | PropType | str]] = None, csv_options: Optional[dict[str, str | bool]] = None) -> None: """ - Load node metadata into the graph from any data source that supports the ArrowStreamExportable protocol (by providing an __arrow_c_stream__() method). - This includes, but is not limited to: Pandas dataframes, FireDucks(.pandas) dataframes, - Polars dataframes, Arrow tables, DuckDB (eg. DuckDBPyRelation obtained from running an SQL query) + Load node metadata into the graph from any data source that supports the ArrowStreamExportable protocol (by providing an __arrow_c_stream__() method), + a path to a CSV or Parquet file, or a directory containing multiple CSV or Parquet files. + The following are known to support the ArrowStreamExportable protocol: Pandas dataframes, FireDucks(.pandas) dataframes, + Polars dataframes, Arrow tables, DuckDB (e.g. DuckDBPyRelation obtained from running an SQL query). Arguments: data (Any): The data source containing node information. @@ -1386,6 +1134,8 @@ class Graph(GraphView): node_type_col (str, optional): The node type column name in a dataframe. Cannot be used in combination with node_type. Defaults to None. metadata (List[str], optional): List of node metadata column names. Defaults to None. shared_metadata (PropInput, optional): A dictionary of metadata properties that will be added to every node. Defaults to None. + schema (list[tuple[str, DataType | PropType | str]] | dict[str, DataType | PropType | str], optional): A list of (column_name, column_type) tuples or dict of {"column_name": column_type} to cast columns to. Defaults to None. + csv_options (dict[str, str | bool], optional): A dictionary of CSV reading options such as delimiter, comment, escape, quote, and terminator characters, as well as allow_truncated_rows and has_header flags. Defaults to None. Returns: None: This function does not return a value if the operation is successful. @@ -1394,75 +1144,12 @@ class Graph(GraphView): GraphError: If the operation fails. """ - def load_node_props_from_pandas( - self, - df: DataFrame, - id: str, - node_type: Optional[str] = None, - node_type_col: Optional[str] = None, - metadata: Optional[List[str]] = None, - shared_metadata: Optional[PropInput] = None, - ) -> None: - """ - Load node properties from a Pandas DataFrame. - - Arguments: - df (DataFrame): The Pandas DataFrame containing node information. - id(str): The column name for the node IDs. - node_type (str, optional): A value to use as the node type for all nodes. Cannot be used in combination with node_type_col. Defaults to None. - node_type_col (str, optional): The node type col name in dataframe. Cannot be used in combination with node_type. Defaults to None. - metadata (List[str], optional): List of node metadata column names. Defaults to None. - shared_metadata (PropInput, optional): A dictionary of metadata properties that will be added to every node. 
Defaults to None. - - Returns: - None: This function does not return a value, if the operation is successful. - - Raises: - GraphError: If the operation fails. - """ - - def load_node_props_from_parquet( - self, - parquet_path: str, - id: str, - node_type: Optional[str] = None, - node_type_col: Optional[str] = None, - metadata: Optional[List[str]] = None, - shared_metadata: Optional[PropInput] = None, - ) -> None: - """ - Load node properties from a parquet file. - - Arguments: - parquet_path (str): Parquet file or directory of Parquet files path containing node information. - id(str): The column name for the node IDs. - node_type (str, optional): A value to use as the node type for all nodes. Cannot be used in combination with node_type_col. Defaults to None. - node_type_col (str, optional): The node type col name in dataframe. Cannot be used in combination with node_type. Defaults to None. - metadata (List[str], optional): List of node metadata column names. Defaults to None. - shared_metadata (PropInput, optional): A dictionary of metadata properties that will be added to every node. Defaults to None. - - Returns: - None: This function does not return a value, if the operation is successful. - - Raises: - GraphError: If the operation fails. - """ - - def load_nodes_from_df( - self, - data: Any, - time: str, - id: str, - node_type: Optional[str] = None, - node_type_col: Optional[str] = None, - properties: Optional[List[str]] = None, - metadata: Optional[List[str]] = None, - shared_metadata: Optional[PropInput] = None, - ) -> None: + def load_nodes(self, data: Any, time: str, id: str, node_type: Optional[str] = None, node_type_col: Optional[str] = None, properties: Optional[List[str]] = None, metadata: Optional[List[str]] = None, shared_metadata: Optional[PropInput] = None, schema: Optional[list[tuple[str, DataType | PropType | str]] | dict[str, DataType | PropType | str]] = None, csv_options: Optional[dict[str, str | bool]] = None) -> None: """ - Load nodes into the graph from any data source that supports the ArrowStreamExportable protocol (by providing an __arrow_c_stream__() method). - This includes, but is not limited to: Pandas dataframes, FireDucks(.pandas) dataframes, - Polars dataframes, Arrow tables, DuckDB (eg. DuckDBPyRelation obtained from running an SQL query) + Load nodes into the graph from any data source that supports the ArrowStreamExportable protocol (by providing an __arrow_c_stream__() method), + a path to a CSV or Parquet file, or a directory containing multiple CSV or Parquet files. + The following are known to support the ArrowStreamExportable protocol: Pandas dataframes, FireDucks(.pandas) dataframes, + Polars dataframes, Arrow tables, DuckDB (e.g. DuckDBPyRelation obtained from running an SQL query). Arguments: data (Any): The data source containing the nodes. @@ -1473,6 +1160,8 @@ class Graph(GraphView): properties (List[str], optional): List of node property column names. Defaults to None. metadata (List[str], optional): List of node metadata column names. Defaults to None. shared_metadata (PropInput, optional): A dictionary of metadata properties that will be added to every node. Defaults to None. + schema (list[tuple[str, DataType | PropType | str]] | dict[str, DataType | PropType | str], optional): A list of (column_name, column_type) tuples or dict of {"column_name": column_type} to cast columns to. Defaults to None. 
+ csv_options (dict[str, str | bool], optional): A dictionary of CSV reading options such as delimiter, comment, escape, quote, and terminator characters, as well as allow_truncated_rows and has_header flags. Defaults to None. Returns: None: This function does not return a value if the operation is successful. @@ -1481,69 +1170,7 @@ class Graph(GraphView): GraphError: If the operation fails. """ - def load_nodes_from_pandas( - self, - df: DataFrame, - time: str, - id: str, - node_type: Optional[str] = None, - node_type_col: Optional[str] = None, - properties: Optional[List[str]] = None, - metadata: Optional[List[str]] = None, - shared_metadata: Optional[PropInput] = None, - ) -> None: - """ - Load nodes from a Pandas DataFrame into the graph. - - Arguments: - df (DataFrame): The Pandas DataFrame containing the nodes. - time (str): The column name for the timestamps. - id (str): The column name for the node IDs. - node_type (str, optional): A value to use as the node type for all nodes. Cannot be used in combination with node_type_col. Defaults to None. - node_type_col (str, optional): The node type col name in dataframe. Cannot be used in combination with node_type. Defaults to None. - properties (List[str], optional): List of node property column names. Defaults to None. - metadata (List[str], optional): List of node metadata column names. Defaults to None. - shared_metadata (PropInput, optional): A dictionary of metadata properties that will be added to every node. Defaults to None. - - Returns: - None: This function does not return a value, if the operation is successful. - - Raises: - GraphError: If the operation fails. - """ - - def load_nodes_from_parquet( - self, - parquet_path: str, - time: str, - id: str, - node_type: Optional[str] = None, - node_type_col: Optional[str] = None, - properties: Optional[List[str]] = None, - metadata: Optional[List[str]] = None, - shared_metadata: Optional[PropInput] = None, - ) -> None: - """ - Load nodes from a Parquet file into the graph. - - Arguments: - parquet_path (str): Parquet file or directory of Parquet files containing the nodes - time (str): The column name for the timestamps. - id (str): The column name for the node IDs. - node_type (str, optional): A value to use as the node type for all nodes. Cannot be used in combination with node_type_col. Defaults to None. - node_type_col (str, optional): The node type col name in dataframe. Cannot be used in combination with node_type. Defaults to None. - properties (List[str], optional): List of node property column names. Defaults to None. - metadata (List[str], optional): List of node metadata column names. Defaults to None. - shared_metadata (PropInput, optional): A dictionary of metadata properties that will be added to every node. Defaults to None. - - Returns: - None: This function does not return a value, if the operation is successful. - - Raises: - GraphError: If the operation fails. - """ - - def node(self, id: str | int) -> MutableNode: + def node(self, id: str|int) -> MutableNode: """ Gets the node with the specified id @@ -1624,22 +1251,16 @@ class Graph(GraphView): None: """ -class PersistentGraph(GraphView): +class PersistentGraph(GraphView): """A temporal graph that allows edges and nodes to be deleted.""" def __new__(cls) -> PersistentGraph: """Create and return a new object. See help(type) for accurate signature.""" - def __reduce__(self): ... 
- def add_edge( - self, - timestamp: int, - src: str | int, - dst: str | int, - properties: Optional[PropInput] = None, - layer: Optional[str] = None, - event_id: Optional[int] = None, - ) -> None: + def __reduce__(self): + ... + + def add_edge(self, timestamp: int, src: str | int, dst: str | int, properties: Optional[PropInput] = None, layer: Optional[str] = None, event_id: Optional[int] = None) -> None: """ Adds a new edge with the given source and destination nodes and properties to the graph. @@ -1672,14 +1293,7 @@ class PersistentGraph(GraphView): GraphError: If the operation fails. """ - def add_node( - self, - timestamp: TimeInput, - id: str | int, - properties: Optional[PropInput] = None, - node_type: Optional[str] = None, - event_id: Optional[int] = None, - ) -> None: + def add_node(self, timestamp: TimeInput, id: str | int, properties: Optional[PropInput] = None, node_type: Optional[str] = None, event_id: Optional[int] = None) -> None: """ Adds a new node with the given id and properties to the graph. @@ -1697,9 +1311,7 @@ class PersistentGraph(GraphView): GraphError: If the operation fails. """ - def add_properties( - self, timestamp: TimeInput, properties: dict, event_id: Optional[int] = None - ) -> None: + def add_properties(self, timestamp: TimeInput, properties: dict, event_id: Optional[int] = None) -> None: """ Adds properties to the graph. @@ -1775,14 +1387,7 @@ class PersistentGraph(GraphView): None: """ - def create_node( - self, - timestamp: TimeInput, - id: str | int, - properties: Optional[PropInput] = None, - node_type: Optional[str] = None, - event_id: Optional[int] = None, - ) -> MutableNode: + def create_node(self, timestamp: TimeInput, id: str | int, properties: Optional[PropInput] = None, node_type: Optional[str] = None, event_id: Optional[int] = None) -> MutableNode: """ Creates a new node with the given id and properties to the graph. It fails if the node already exists. @@ -1800,14 +1405,7 @@ class PersistentGraph(GraphView): GraphError: If the operation fails. """ - def delete_edge( - self, - timestamp: int, - src: str | int, - dst: str | int, - layer: Optional[str] = None, - event_id: Optional[int] = None, - ) -> MutableEdge: + def delete_edge(self, timestamp: int, src: str | int, dst: str | int, layer: Optional[str] = None, event_id: Optional[int] = None) -> MutableEdge: """ Deletes an edge given the timestamp, src and dst nodes and layer (optional). @@ -1920,9 +1518,7 @@ class PersistentGraph(GraphView): GraphError: If the operation fails. """ - def import_edges_as( - self, edges: List[Edge], new_ids: list[Tuple[GID, GID]], merge: bool = False - ) -> None: + def import_edges_as(self, edges: List[Edge], new_ids: list[Tuple[GID, GID]], merge: bool = False) -> None: """ Import multiple edges into the graph with new ids. @@ -1959,9 +1555,7 @@ class PersistentGraph(GraphView): GraphError: If the operation fails. """ - def import_node_as( - self, node: Node, new_id: str | int, merge: bool = False - ) -> Node: + def import_node_as(self, node: Node, new_id: str|int, merge: bool = False) -> Node: """ Import a single node into the graph with new id. @@ -1998,9 +1592,7 @@ class PersistentGraph(GraphView): GraphError: If the operation fails. """ - def import_nodes_as( - self, nodes: List[Node], new_ids: List[str | int], merge: bool = False - ) -> None: + def import_nodes_as(self, nodes: List[Node], new_ids: List[str|int], merge: bool = False) -> None: """ Import multiple nodes into the graph with new ids. 
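The consolidated `Graph` loaders documented above (`load_edges`, `load_nodes`, `load_edge_metadata`, `load_node_metadata`) accept either an Arrow-exportable object or a CSV/Parquet path, which is what the new `arrow-csv` dependency enables. A hedged sketch of the new `schema` and `csv_options` parameters; the file name, column names, and option values are invented for illustration:

```python
import pyarrow as pa
from raphtory import Graph

g = Graph()
# data may be a CSV or Parquet path (file or directory) or any object
# exposing __arrow_c_stream__(), e.g. a pandas or Polars DataFrame
g.load_edges(
    data="edges.csv",
    time="time",
    src="src",
    dst="dst",
    properties=["weight"],
    schema={"weight": pa.float64()},  # cast this column before loading
    csv_options={"delimiter": ";", "has_header": True},
)
```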
@@ -2034,19 +1626,12 @@ class PersistentGraph(GraphView): PersistentGraph: the loaded graph with initialised cache """ - def load_edge_deletions_from_df( - self, - data: Any, - time: str, - src: str, - dst: str, - layer: Optional[str] = None, - layer_col: Optional[str] = None, - ) -> None: + def load_edge_deletions(self, data: Any, time: str, src: str, dst: str, layer: Optional[str] = None, layer_col: Optional[str] = None, schema: Optional[list[tuple[str, DataType | PropType | str]] | dict[str, DataType | PropType | str]] = None, csv_options: Optional[dict[str, str | bool]] = None) -> None: """ - Load edge deletions into the graph from any data source that supports the ArrowStreamExportable protocol (by providing an __arrow_c_stream__() method). - This includes, but is not limited to: Pandas dataframes, FireDucks(.pandas) dataframes, - Polars dataframes, Arrow tables, DuckDB (eg. DuckDBPyRelation obtained from running an SQL query) + Load edge deletions into the graph from any data source that supports the ArrowStreamExportable protocol (by providing an __arrow_c_stream__() method), + a path to a CSV or Parquet file, or a directory containing multiple CSV or Parquet files. + The following are known to support the ArrowStreamExportable protocol: Pandas dataframes, FireDucks(.pandas) dataframes, + Polars dataframes, Arrow tables, DuckDB (e.g. DuckDBPyRelation obtained from running an SQL query). Arguments: data (Any): The data source containing the edges. @@ -2055,6 +1640,8 @@ class PersistentGraph(GraphView): dst (str): The column name for the destination node ids. layer (str, optional): A value to use as the layer for all edges. Cannot be used in combination with layer_col. Defaults to None. layer_col (str, optional): The edge layer col name in the data source. Cannot be used in combination with layer. Defaults to None. + schema (list[tuple[str, DataType | PropType | str]] | dict[str, DataType | PropType | str], optional): A list of (column_name, column_type) tuples or dict of {"column_name": column_type} to cast columns to. Defaults to None. + csv_options (dict[str, str | bool], optional): A dictionary of CSV reading options such as delimiter, comment, escape, quote, and terminator characters, as well as allow_truncated_rows and has_header flags. Defaults to None. Returns: None: This function does not return a value, if the operation is successful. @@ -2063,74 +1650,12 @@ class PersistentGraph(GraphView): GraphError: If the operation fails. """ - def load_edge_deletions_from_pandas( - self, - df: DataFrame, - time: str, - src: str, - dst: str, - layer: Optional[str] = None, - layer_col: Optional[str] = None, - ) -> None: + def load_edge_metadata(self, data: Any, src: str, dst: str, metadata: Optional[List[str]] = None, shared_metadata: Optional[PropInput] = None, layer: Optional[str] = None, layer_col: Optional[str] = None, schema: Optional[list[tuple[str, DataType | PropType | str]] | dict[str, DataType | PropType | str]] = None, csv_options: Optional[dict[str, str | bool]] = None) -> None: """ - Load edges deletions from a Pandas DataFrame into the graph. - - Arguments: - df (DataFrame): The Pandas DataFrame containing the edges. - time (str): The column name for the update timestamps. - src (str): The column name for the source node ids. - dst (str): The column name for the destination node ids. - layer (str, optional): A value to use as the layer for all edges. Cannot be used in combination with layer_col. Defaults to None. 
- layer_col (str, optional): The edge layer col name in dataframe. Cannot be used in combination with layer. Defaults to None. - - Returns: - None: This function does not return a value, if the operation is successful. - - Raises: - GraphError: If the operation fails. - """ - - def load_edge_deletions_from_parquet( - self, - parquet_path: str, - time: str, - src: str, - dst: str, - layer: Optional[str] = None, - layer_col: Optional[str] = None, - ) -> None: - """ - Load edges deletions from a Parquet file into the graph. - - Arguments: - parquet_path (str): Parquet file or directory of Parquet files path containing node information. - src (str): The column name for the source node ids. - dst (str): The column name for the destination node ids. - time (str): The column name for the update timestamps. - layer (str, optional): A value to use as the layer for all edges. Cannot be used in combination with layer_col. Defaults to None. - layer_col (str, optional): The edge layer col name in dataframe. Cannot be used in combination with layer. Defaults to None. - - Returns: - None: This function does not return a value, if the operation is successful. - - Raises: - GraphError: If the operation fails. - """ - - def load_edge_metadata_from_df( - self, - data: Any, - src: str, - dst: str, - metadata: Optional[List[str]] = None, - shared_metadata: Optional[PropInput] = None, - layer: Optional[str] = None, - layer_col: Optional[str] = None, - ) -> None: - """ - Load edge metadata into the graph from any data source that supports the ArrowStreamExportable protocol (by providing an __arrow_c_stream__() method). - This includes, but is not limited to: Pandas dataframes, FireDucks(.pandas) dataframes, - Polars dataframes, Arrow tables, DuckDB (eg. DuckDBPyRelation obtained from running an SQL query) + Load edge metadata into the graph from any data source that supports the ArrowStreamExportable protocol (by providing an __arrow_c_stream__() method), + a path to a CSV or Parquet file, or a directory containing multiple CSV or Parquet files. + The following are known to support the ArrowStreamExportable protocol: Pandas dataframes, FireDucks(.pandas) dataframes, + Polars dataframes, Arrow tables, DuckDB (e.g. DuckDBPyRelation obtained from running an SQL query). Arguments: data (Any): The data source containing edge information. @@ -2140,6 +1665,8 @@ class PersistentGraph(GraphView): shared_metadata (PropInput, optional): A dictionary of metadata properties that will be added to every edge. Defaults to None. layer (str, optional): The edge layer name. Defaults to None. layer_col (str, optional): The edge layer column name in a dataframe. Defaults to None. + schema (list[tuple[str, DataType | PropType | str]] | dict[str, DataType | PropType | str], optional): A list of (column_name, column_type) tuples or dict of {"column_name": column_type} to cast columns to. Defaults to None. + csv_options (dict[str, str | bool], optional): A dictionary of CSV reading options such as delimiter, comment, escape, quote, and terminator characters, as well as allow_truncated_rows and has_header flags. Defaults to None. Returns: None: This function does not return a value if the operation is successful. @@ -2148,91 +1675,25 @@ class PersistentGraph(GraphView): GraphError: If the operation fails. 
""" - def load_edge_props_from_pandas( - self, - df: DataFrame, - src: str, - dst: str, - metadata: Optional[List[str]] = None, - shared_metadata: Optional[PropInput] = None, - layer: Optional[str] = None, - layer_col: Optional[str] = None, - ) -> None: - """ - Load edge properties from a Pandas DataFrame. - - Arguments: - df (DataFrame): The Pandas DataFrame containing edge information. - src (str): The column name for the source node. - dst (str): The column name for the destination node. - metadata (List[str], optional): List of edge metadata column names. Defaults to None. - shared_metadata (PropInput, optional): A dictionary of metadata properties that will be added to every edge. Defaults to None. - layer (str, optional): The edge layer name. Defaults to None. - layer_col (str, optional): The edge layer col name in dataframe. Defaults to None. - - Returns: - None: This function does not return a value, if the operation is successful. - - Raises: - GraphError: If the operation fails. - """ - - def load_edge_props_from_parquet( - self, - parquet_path: str, - src: str, - dst: str, - metadata: Optional[List[str]] = None, - shared_metadata: Optional[PropInput] = None, - layer: Optional[str] = None, - layer_col: Optional[str] = None, - ) -> None: - """ - Load edge properties from parquet file - - Arguments: - parquet_path (str): Parquet file or directory of Parquet files path containing edge information. - src (str): The column name for the source node. - dst (str): The column name for the destination node. - metadata (List[str], optional): List of edge metadata column names. Defaults to None. - shared_metadata (PropInput, optional): A dictionary of metadata properties that will be added to every edge. Defaults to None. - layer (str, optional): The edge layer name. Defaults to None. - layer_col (str, optional): The edge layer col name in dataframe. Defaults to None. - - Returns: - None: This function does not return a value, if the operation is successful. - - Raises: - GraphError: If the operation fails. - """ - - def load_edges_from_df( - self, - data: Any, - time: str, - src: str, - dst: str, - properties: Optional[List[str]] = None, - metadata: Optional[List[str]] = None, - shared_metadata: Optional[PropInput] = None, - layer: Optional[str] = None, - layer_col: Optional[str] = None, - ) -> None: + def load_edges(self, data: Any, time: str, src: str, dst: str, properties: Optional[List[str]] = None, metadata: Optional[List[str]] = None, shared_metadata: Optional[PropInput] = None, layer: Optional[str] = None, layer_col: Optional[str] = None, schema: Optional[list[tuple[str, DataType | PropType | str]] | dict[str, DataType | PropType | str]] = None, csv_options: Optional[dict[str, str | bool]] = None) -> None: """ - Load edges into the graph from any data source that supports the ArrowStreamExportable protocol (by providing an __arrow_c_stream__() method). - This includes, but is not limited to: Pandas dataframes, FireDucks(.pandas) dataframes, - Polars dataframes, Arrow tables, DuckDB (eg. DuckDBPyRelation obtained from running an SQL query) + Load edges into the graph from any data source that supports the ArrowStreamExportable protocol (by providing an __arrow_c_stream__() method), + a path to a CSV or Parquet file, or a directory containing multiple CSV or Parquet files. + The following are known to support the ArrowStreamExportable protocol: Pandas dataframes, FireDucks(.pandas) dataframes, + Polars dataframes, Arrow tables, DuckDB (e.g. 
DuckDBPyRelation obtained from running an SQL query). Arguments: data (Any): The data source containing the edges. time (str): The column name for the update timestamps. - src (str): The column name for the source node ids. - dst (str): The column name for the destination node ids. + src (str): The column name for the source node IDs. + dst (str): The column name for the destination node IDs. properties (List[str], optional): List of edge property column names. Defaults to None. metadata (List[str], optional): List of edge metadata column names. Defaults to None. shared_metadata (PropInput, optional): A dictionary of metadata properties that will be added to every edge. Defaults to None. layer (str, optional): A value to use as the layer for all edges. Cannot be used in combination with layer_col. Defaults to None. layer_col (str, optional): The edge layer column name in a dataframe. Cannot be used in combination with layer. Defaults to None. + schema (list[tuple[str, DataType | PropType | str]] | dict[str, DataType | PropType | str], optional): A list of (column_name, column_type) tuples or dict of {"column_name": column_type} to cast columns to. Defaults to None. + csv_options (dict[str, str | bool], optional): A dictionary of CSV reading options such as delimiter, comment, escape, quote, and terminator characters, as well as allow_truncated_rows and has_header flags. Defaults to None. Returns: None: This function does not return a value if the operation is successful. @@ -2241,72 +1702,6 @@ class PersistentGraph(GraphView): GraphError: If the operation fails. """ - def load_edges_from_pandas( - self, - df: DataFrame, - time: str, - src: str, - dst: str, - properties: Optional[List[str]] = None, - metadata: Optional[List[str]] = None, - shared_metadata: Optional[PropInput] = None, - layer: Optional[str] = None, - layer_col: Optional[str] = None, - ) -> None: - """ - Load edges from a Pandas DataFrame into the graph. - - Arguments: - df (DataFrame): The Pandas DataFrame containing the edges. - time (str): The column name for the update timestamps. - src (str): The column name for the source node ids. - dst (str): The column name for the destination node ids. - properties (List[str], optional): List of edge property column names. Defaults to None. - metadata (List[str], optional): List of edge metadata column names. Defaults to None. - shared_metadata (PropInput, optional): A dictionary of metadata properties that will be added to every edge. Defaults to None. - layer (str, optional): A value to use as the layer for all edges. Cannot be used in combination with layer_col. Defaults to None. - layer_col (str, optional): The edge layer col name in dataframe. Cannot be used in combination with layer. Defaults to None. - - Returns: - None: This function does not return a value, if the operation is successful. - - Raises: - GraphError: If the operation fails. - """ - - def load_edges_from_parquet( - self, - parquet_path: str, - time: str, - src: str, - dst: str, - properties: Optional[List[str]] = None, - metadata: Optional[List[str]] = None, - shared_metadata: Optional[PropInput] = None, - layer: Optional[str] = None, - layer_col: Optional[str] = None, - ) -> None: - """ - Load edges from a Parquet file into the graph. - - Arguments: - parquet_path (str): Parquet file or directory of Parquet files path containing edges - time (str): The column name for the update timestamps. - src (str): The column name for the source node ids. - dst (str): The column name for the destination node ids. 
- properties (List[str], optional): List of edge property column names. Defaults to None. - metadata (List[str], optional): List of edge metadata column names. Defaults to None. - shared_metadata (PropInput, optional): A dictionary of metadata properties that will be added to every edge. Defaults to None. - layer (str, optional): A value to use as the layer for all edges. Cannot be used in combination with layer_col. Defaults to None. - layer_col (str, optional): The edge layer col name in dataframe. Cannot be used in combination with layer. Defaults to None. - - Returns: - None: This function does not return a value, if the operation is successful. - - Raises: - GraphError: If the operation fails. - """ - @staticmethod def load_from_file(path: str) -> PersistentGraph: """ @@ -2319,19 +1714,12 @@ class PersistentGraph(GraphView): PersistentGraph: """ - def load_node_metadata_from_df( - self, - data: Any, - id: str, - node_type: Optional[str] = None, - node_type_col: Optional[str] = None, - metadata: Optional[List[str]] = None, - shared_metadata: Optional[PropInput] = None, - ) -> None: + def load_node_metadata(self, data: Any, id: str, node_type: Optional[str] = None, node_type_col: Optional[str] = None, metadata: Optional[List[str]] = None, shared_metadata: Optional[PropInput] = None, schema: Optional[list[tuple[str, DataType | PropType | str]] | dict[str, DataType | PropType | str]] = None, csv_options: Optional[dict[str, str | bool]] = None) -> None: """ - Load node metadata into the graph from any data source that supports the ArrowStreamExportable protocol (by providing an __arrow_c_stream__() method). - This includes, but is not limited to: Pandas dataframes, FireDucks(.pandas) dataframes, - Polars dataframes, Arrow tables, DuckDB (eg. DuckDBPyRelation obtained from running an SQL query) + Load node metadata into the graph from any data source that supports the ArrowStreamExportable protocol (by providing an __arrow_c_stream__() method), + a path to a CSV or Parquet file, or a directory containing multiple CSV or Parquet files. + The following are known to support the ArrowStreamExportable protocol: Pandas dataframes, FireDucks(.pandas) dataframes, + Polars dataframes, Arrow tables, DuckDB (e.g. DuckDBPyRelation obtained from running an SQL query). Arguments: data (Any): The data source containing node information. @@ -2340,6 +1728,8 @@ class PersistentGraph(GraphView): node_type_col (str, optional): The node type column name in a dataframe. Cannot be used in combination with node_type. Defaults to None. metadata (List[str], optional): List of node metadata column names. Defaults to None. shared_metadata (PropInput, optional): A dictionary of metadata properties that will be added to every node. Defaults to None. + schema (list[tuple[str, DataType | PropType | str]] | dict[str, DataType | PropType | str], optional): A list of (column_name, column_type) tuples or dict of {"column_name": column_type} to cast columns to. Defaults to None. + csv_options (dict[str, str | bool], optional): A dictionary of CSV reading options such as delimiter, comment, escape, quote, and terminator characters, as well as allow_truncated_rows and has_header flags. Defaults to None. Returns: None: This function does not return a value if the operation is successful. @@ -2348,75 +1738,12 @@ class PersistentGraph(GraphView): GraphError: If the operation fails. 
""" - def load_node_props_from_pandas( - self, - df: DataFrame, - id: str, - node_type: Optional[str] = None, - node_type_col: Optional[str] = None, - metadata: Optional[List[str]] = None, - shared_metadata: Optional[PropInput] = None, - ) -> None: + def load_nodes(self, data: Any, time: str, id: str, node_type: Optional[str] = None, node_type_col: Optional[str] = None, properties: Optional[List[str]] = None, metadata: Optional[List[str]] = None, shared_metadata: Optional[PropInput] = None, schema: Optional[list[tuple[str, DataType | PropType | str]] | dict[str, DataType | PropType | str]] = None, csv_options: Optional[dict[str, str | bool]] = None) -> None: """ - Load node properties from a Pandas DataFrame. - - Arguments: - df (DataFrame): The Pandas DataFrame containing node information. - id(str): The column name for the node IDs. - node_type (str, optional): A value to use as the node type for all nodes. Cannot be used in combination with node_type_col. Defaults to None. - node_type_col (str, optional): The node type col name in dataframe. Cannot be used in combination with node_type. Defaults to None. - metadata (List[str], optional): List of node metadata column names. Defaults to None. - shared_metadata (PropInput, optional): A dictionary of metadata properties that will be added to every node. Defaults to None. - - Returns: - None: This function does not return a value, if the operation is successful. - - Raises: - GraphError: If the operation fails. - """ - - def load_node_props_from_parquet( - self, - parquet_path: str, - id: str, - node_type: Optional[str] = None, - node_type_col: Optional[str] = None, - metadata: Optional[List[str]] = None, - shared_metadata: Optional[PropInput] = None, - ) -> None: - """ - Load node properties from a parquet file. - - Arguments: - parquet_path (str): Parquet file or directory of Parquet files path containing node information. - id(str): The column name for the node IDs. - node_type (str, optional): A value to use as the node type for all nodes. Cannot be used in combination with node_type_col. Defaults to None. - node_type_col (str, optional): The node type col name in dataframe. Cannot be used in combination with node_type. Defaults to None. - metadata (List[str], optional): List of node metadata column names. Defaults to None. - shared_metadata (PropInput, optional): A dictionary of metadata properties that will be added to every node. Defaults to None. - - Returns: - None: This function does not return a value, if the operation is successful. - - Raises: - GraphError: If the operation fails. - """ - - def load_nodes_from_df( - self, - data: Any, - time: str, - id: str, - node_type: Optional[str] = None, - node_type_col: Optional[str] = None, - properties: Optional[List[str]] = None, - metadata: Optional[List[str]] = None, - shared_metadata: Optional[PropInput] = None, - ) -> None: - """ - Load nodes into the graph from any data source that supports the ArrowStreamExportable protocol (by providing an __arrow_c_stream__() method). - This includes, but is not limited to: Pandas dataframes, FireDucks(.pandas) dataframes, - Polars dataframes, Arrow tables, DuckDB (eg. DuckDBPyRelation obtained from running an SQL query) + Load nodes into the graph from any data source that supports the ArrowStreamExportable protocol (by providing an __arrow_c_stream__() method), + a path to a CSV or Parquet file, or a directory containing multiple CSV or Parquet files. 
+ The following are known to support the ArrowStreamExportable protocol: Pandas dataframes, FireDucks(.pandas) dataframes, + Polars dataframes, Arrow tables, DuckDB (e.g. DuckDBPyRelation obtained from running an SQL query). Arguments: data (Any): The data source containing the nodes. @@ -2427,6 +1754,8 @@ class PersistentGraph(GraphView): properties (List[str], optional): List of node property column names. Defaults to None. metadata (List[str], optional): List of node metadata column names. Defaults to None. shared_metadata (PropInput, optional): A dictionary of metadata properties that will be added to every node. Defaults to None. + schema (list[tuple[str, DataType | PropType | str]] | dict[str, DataType | PropType | str], optional): A list of (column_name, column_type) tuples or dict of {"column_name": column_type} to cast columns to. Defaults to None. + csv_options (dict[str, str | bool], optional): A dictionary of CSV reading options such as delimiter, comment, escape, quote, and terminator characters, as well as allow_truncated_rows and has_header flags. Defaults to None. Returns: None: This function does not return a value if the operation is successful. @@ -2435,68 +1764,6 @@ class PersistentGraph(GraphView): GraphError: If the operation fails. """ - def load_nodes_from_pandas( - self, - df: DataFrame, - time: str, - id: str, - node_type: Optional[str] = None, - node_type_col: Optional[str] = None, - properties: Optional[List[str]] = None, - metadata: Optional[List[str]] = None, - shared_metadata: Optional[PropInput] = None, - ) -> None: - """ - Load nodes from a Pandas DataFrame into the graph. - - Arguments: - df (DataFrame): The Pandas DataFrame containing the nodes. - time (str): The column name for the timestamps. - id (str): The column name for the node IDs. - node_type (str, optional): A value to use as the node type for all nodes. Cannot be used in combination with node_type_col. Defaults to None. - node_type_col (str, optional): The node type col name in dataframe. Cannot be used in combination with node_type. Defaults to None. - properties (List[str], optional): List of node property column names. Defaults to None. - metadata (List[str], optional): List of node metadata column names. Defaults to None. - shared_metadata (PropInput, optional): A dictionary of metadata properties that will be added to every node. Defaults to None. - - Returns: - None: This function does not return a value, if the operation is successful. - - Raises: - GraphError: If the operation fails. - """ - - def load_nodes_from_parquet( - self, - parquet_path: str, - time: str, - id: str, - node_type: Optional[str] = None, - node_type_col: Optional[str] = None, - properties: Optional[List[str]] = None, - metadata: Optional[List[str]] = None, - shared_metadata: Optional[PropInput] = None, - ) -> None: - """ - Load nodes from a Parquet file into the graph. - - Arguments: - parquet_path (str): Parquet file or directory of Parquet files containing the nodes - time (str): The column name for the timestamps. - id (str): The column name for the node IDs. - node_type (str, optional): A value to use as the node type for all nodes. Cannot be used in combination with node_type_col. Defaults to None. - node_type_col (str, optional): The node type col name in dataframe. Cannot be used in combination with node_type. Defaults to None. - properties (List[str], optional): List of node property column names. Defaults to None. - metadata (List[str], optional): List of node metadata column names. Defaults to None. 
- shared_metadata (PropInput, optional): A dictionary of metadata properties that will be added to every node. Defaults to None. - - Returns: - None: This function does not return a value, if the operation is successful. - - Raises: - GraphError: If the operation fails. - """ - def node(self, id: str | int) -> Optional[MutableNode]: """ Gets the node with the specified id @@ -2567,7 +1834,7 @@ class PersistentGraph(GraphView): None: """ -class Node(object): +class Node(object): """A node in the graph.""" def __eq__(self, value): @@ -2724,9 +1991,7 @@ class Node(object): Node: The layered view """ - def expanding( - self, step: int | str, alignment_unit: str | None = None - ) -> WindowSet: + def expanding(self, step: int | str, alignment_unit: str | None = None) -> WindowSet: """ Creates a `WindowSet` with the given `step` size using an expanding window. @@ -2799,7 +2064,7 @@ class Node(object): """ @property - def id(self) -> str | int: + def id(self) -> (str|int): """ Returns the id of the node. This is a unique identifier for the node. @@ -2954,12 +2219,7 @@ class Node(object): Properties: A list of properties. """ - def rolling( - self, - window: int | str, - step: int | str | None = None, - alignment_unit: str | None = None, - ) -> WindowSet: + def rolling(self, window: int | str, step: int | str | None = None, alignment_unit: str | None = None) -> WindowSet: """ Creates a `WindowSet` with the given `window` size and optional `step` using a rolling window. If `alignment_unit` is not "unaligned" and a `step` larger than `window` is provided, some time entries @@ -3081,7 +2341,7 @@ class Node(object): Optional[int]: """ -class NodeFilterBuilder(object): +class NodeFilterBuilder(object): """ A builder for constructing node filters @@ -3117,9 +2377,7 @@ class NodeFilterBuilder(object): filter.FilterExpr: """ - def fuzzy_search( - self, value, levenshtein_distance: int, prefix_match: bool - ) -> filter.FilterExpr: + def fuzzy_search(self, value, levenshtein_distance: int, prefix_match: bool) -> filter.FilterExpr: """ Returns a filter expression that checks if the specified properties approximately match the specified string. @@ -3160,7 +2418,7 @@ class NodeFilterBuilder(object): """ Returns a filter expression that checks if the specified iterable of strings does not contain a given value. - + Arguments: value (str): @@ -3168,7 +2426,7 @@ class NodeFilterBuilder(object): filter.FilterExpr: """ -class Nodes(object): +class Nodes(object): """A list of nodes that can be iterated over.""" def __bool__(self): @@ -3339,9 +2597,7 @@ class Nodes(object): Nodes: The layered view """ - def expanding( - self, step: int | str, alignment_unit: str | None = None - ) -> WindowSet: + def expanding(self, step: int | str, alignment_unit: str | None = None) -> WindowSet: """ Creates a `WindowSet` with the given `step` size using an expanding window. @@ -3560,12 +2816,7 @@ class Nodes(object): PropertiesView: A view of the node properties. """ - def rolling( - self, - window: int | str, - step: int | str | None = None, - alignment_unit: str | None = None, - ) -> WindowSet: + def rolling(self, window: int | str, step: int | str | None = None, alignment_unit: str | None = None) -> WindowSet: """ Creates a `WindowSet` with the given `window` size and optional `step` using a rolling window.
If `alignment_unit` is not "unaligned" and a `step` larger than `window` is provided, some time entries @@ -3654,9 +2905,7 @@ class Nodes(object): OptionalEventTime: The earliest time that this Nodes is valid or None if the Nodes is valid for all times. """ - def to_df( - self, include_property_history: bool = False, convert_datetime: bool = False - ) -> DataFrame: + def to_df(self, include_property_history: bool = False, convert_datetime: bool = False) -> DataFrame: """ Converts the graph's nodes into a Pandas DataFrame. @@ -3717,7 +2966,8 @@ class Nodes(object): Optional[int]: """ -class PathFromNode(object): +class PathFromNode(object): + def __bool__(self): """True if self else False""" @@ -3873,9 +3123,7 @@ class PathFromNode(object): PathFromNode: The layered view """ - def expanding( - self, step: int | str, alignment_unit: str | None = None - ) -> WindowSet: + def expanding(self, step: int | str, alignment_unit: str | None = None) -> WindowSet: """ Creates a `WindowSet` with the given `step` size using an expanding window. @@ -4085,12 +3333,7 @@ class PathFromNode(object): PropertiesView: """ - def rolling( - self, - window: int | str, - step: int | str | None = None, - alignment_unit: str | None = None, - ) -> WindowSet: + def rolling(self, window: int | str, step: int | str | None = None, alignment_unit: str | None = None) -> WindowSet: """ Creates a `WindowSet` with the given `window` size and optional `step` using a rolling window. If `alignment_unit` is not "unaligned" and a `step` larger than `window` is provided, some time entries @@ -4223,7 +3466,8 @@ class PathFromNode(object): Optional[int]: """ -class PathFromGraph(object): +class PathFromGraph(object): + def __bool__(self): """True if self else False""" @@ -4379,9 +3623,7 @@ class PathFromGraph(object): PathFromGraph: The layered view """ - def expanding( - self, step: int | str, alignment_unit: str | None = None - ) -> WindowSet: + def expanding(self, step: int | str, alignment_unit: str | None = None) -> WindowSet: """ Creates a `WindowSet` with the given `step` size using an expanding window. @@ -4600,12 +3842,7 @@ class PathFromGraph(object): NestedPropsIterable: """ - def rolling( - self, - window: int | str, - step: int | str | None = None, - alignment_unit: str | None = None, - ) -> WindowSet: + def rolling(self, window: int | str, step: int | str | None = None, alignment_unit: str | None = None) -> WindowSet: """ Creates a `WindowSet` with the given `window` size and optional `step` using a rolling window. If `alignment_unit` is not "unaligned" and a `step` larger than `window` is provided, some time entries @@ -4738,7 +3975,8 @@ class PathFromGraph(object): Optional[int]: """ -class MutableNode(Node): +class MutableNode(Node): + def __repr__(self): """Return repr(self).""" @@ -4755,12 +3993,7 @@ class MutableNode(Node): None: """ - def add_updates( - self, - t: TimeInput, - properties: Optional[PropInput] = None, - event_id: Optional[int] = None, - ) -> None: + def add_updates(self, t: TimeInput, properties: Optional[PropInput] = None, event_id: Optional[int] = None) -> None: """ Add updates to a node in the graph at a specified time. This function allows for the addition of property updates to a node within the graph. The updates are time-stamped, meaning they are applied at the specified time. @@ -4805,7 +4038,7 @@ class MutableNode(Node): None: """ -class Edge(object): +class Edge(object): """ PyEdge is a Python class that represents an edge in the graph. An edge is a directed connection between two nodes. 
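As an aside on the `MutableNode.add_updates` signature collapsed above, a small sketch of time-stamped property updates; `Graph.add_node` returning a `MutableNode` is assumed from the wider API rather than shown in this hunk.

```python
from raphtory import Graph

g = Graph()
node = g.add_node(1, "alice")  # assumed: returns a MutableNode

# property updates are applied at the given time; event_id breaks ties
node.add_updates(2, properties={"status": "active"})
node.add_updates(2, properties={"status": "idle"}, event_id=1)
```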
@@ -4958,9 +4191,7 @@ class Edge(object): Edge: The layered view """ - def expanding( - self, step: int | str, alignment_unit: str | None = None - ) -> WindowSet: + def expanding(self, step: int | str, alignment_unit: str | None = None) -> WindowSet: """ Creates a `WindowSet` with the given `step` size using an expanding window. @@ -5138,12 +4369,7 @@ class Edge(object): Properties: Properties on the Edge. """ - def rolling( - self, - window: int | str, - step: int | str | None = None, - alignment_unit: str | None = None, - ) -> WindowSet: + def rolling(self, window: int | str, step: int | str | None = None, alignment_unit: str | None = None) -> WindowSet: """ Creates a `WindowSet` with the given `window` size and optional `step` using a rolling window. If `alignment_unit` is not "unaligned" and a `step` larger than `window` is provided, some time entries @@ -5283,7 +4509,7 @@ class Edge(object): Optional[int]: """ -class Edges(object): +class Edges(object): """A list of edges that can be iterated over.""" def __bool__(self): @@ -5434,9 +4660,7 @@ class Edges(object): Edges: The layered view """ - def expanding( - self, step: int | str, alignment_unit: str | None = None - ) -> WindowSet: + def expanding(self, step: int | str, alignment_unit: str | None = None) -> WindowSet: """ Creates a `WindowSet` with the given `step` size using an expanding window. @@ -5618,12 +4842,7 @@ class Edges(object): PropertiesView: """ - def rolling( - self, - window: int | str, - step: int | str | None = None, - alignment_unit: str | None = None, - ) -> WindowSet: + def rolling(self, window: int | str, step: int | str | None = None, alignment_unit: str | None = None) -> WindowSet: """ Creates a `WindowSet` with the given `window` size and optional `step` using a rolling window. If `alignment_unit` is not "unaligned" and a `step` larger than `window` is provided, some time entries @@ -5730,12 +4949,7 @@ class Edges(object): EventTimeIterable: Iterable of `EventTime`s. """ - def to_df( - self, - include_property_history: bool = True, - convert_datetime: bool = False, - explode: bool = False, - ) -> DataFrame: + def to_df(self, include_property_history: bool = True, convert_datetime: bool = False, explode: bool = False) -> DataFrame: """ Converts the graph's edges into a Pandas DataFrame. @@ -5788,7 +5002,8 @@ class Edges(object): Optional[int]: """ -class NestedEdges(object): +class NestedEdges(object): + def __bool__(self): """True if self else False""" @@ -5929,9 +5144,7 @@ class NestedEdges(object): NestedEdges: The layered view """ - def expanding( - self, step: int | str, alignment_unit: str | None = None - ) -> WindowSet: + def expanding(self, step: int | str, alignment_unit: str | None = None) -> WindowSet: """ Creates a `WindowSet` with the given `step` size using an expanding window. @@ -6113,12 +5326,7 @@ class NestedEdges(object): PyNestedPropsIterable: """ - def rolling( - self, - window: int | str, - step: int | str | None = None, - alignment_unit: str | None = None, - ) -> WindowSet: + def rolling(self, window: int | str, step: int | str | None = None, alignment_unit: str | None = None) -> WindowSet: """ Creates a `WindowSet` with the given `window` size and optional `step` using a rolling window. 
If `alignment_unit` is not "unaligned" and a `step` larger than `window` is provided, some time entries @@ -6261,7 +5469,8 @@ class NestedEdges(object): Optional[int]: """ -class MutableEdge(Edge): +class MutableEdge(Edge): + def __repr__(self): """Return repr(self).""" @@ -6279,13 +5488,7 @@ class MutableEdge(Edge): None: """ - def add_updates( - self, - t: TimeInput, - properties: Optional[PropInput] = None, - layer: Optional[str] = None, - event_id: Optional[int] = None, - ) -> None: + def add_updates(self, t: TimeInput, properties: Optional[PropInput] = None, layer: Optional[str] = None, event_id: Optional[int] = None) -> None: """ Add updates to an edge in the graph at a specified time. This function allows for the addition of property updates to an edge within the graph. The updates are time-stamped, meaning they are applied at the specified time. @@ -6303,9 +5506,7 @@ class MutableEdge(Edge): GraphError: If the operation fails. """ - def delete( - self, t: TimeInput, layer: Optional[str] = None, event_id: Optional[int] = None - ) -> None: + def delete(self, t: TimeInput, layer: Optional[str] = None, event_id: Optional[int] = None) -> None: """ Mark the edge as deleted at the specified time. @@ -6335,7 +5536,7 @@ class MutableEdge(Edge): None: """ -class Properties(object): +class Properties(object): """A view of the properties of an entity""" def __contains__(self, key): @@ -6393,6 +5594,18 @@ class Properties(object): PropValue: """ + def get_dtype_of(self, key: str) -> PropType: + """ + Get the PropType of a property. Specifically, returns the PropType of the latest value for this property if it exists. + If not, it returns the PropType for the static property matching this name. + + Arguments: + key (str): the name of the property. + + Returns: + PropType: + """ + def items(self) -> list[Tuple[str, PropValue]]: """ Get a list of key-value pairs @@ -6426,7 +5639,8 @@ class Properties(object): list[PropValue]: """ -class PyPropValueList(object): +class PyPropValueList(object): + def __eq__(self, value): """Return self==value.""" @@ -6462,8 +5676,12 @@ class PyPropValueList(object): PropValue: The average of the property values, or None if count is zero. """ - def collect(self): ... - def count(self): ... + def collect(self): + ... + + def count(self): + ... + def drop_none(self) -> list[PropValue]: """ Drop none. @@ -6512,7 +5730,90 @@ class PyPropValueList(object): PropValue: """ -class Metadata(object): +class PropType(object): + + def __eq__(self, value): + """Return self==value.""" + + def __ge__(self, value): + """Return self>=value.""" + + def __gt__(self, value): + """Return self>value.""" + + def __le__(self, value): + """Return self<=value.""" + + def __lt__(self, value): + """Return self<value.""" - def as_tuple(self) -> tuple[int, int]: + def as_tuple(self) -> tuple[int,int]: """ Return this entry as a tuple of (timestamp, event_id), where the timestamp is in milliseconds. @@ -7027,7 +6339,7 @@ class EventTime(object): int: Milliseconds since the Unix epoch. """ -class OptionalEventTime(object): +class OptionalEventTime(object): """ Raphtory’s optional EventTime type. Instances of OptionalEventTime may contain an EventTime, or be empty. This is used for functions that may not return data (such as earliest_time and latest_time) because the data is unavailable. @@ -7121,7 +6433,7 @@ class OptionalEventTime(object): int | None: Milliseconds since the Unix epoch. """ -class History(object): +class History(object): """History of updates for an object.
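Since `Properties.get_dtype_of` and the `PropType` class are new in this diff, a brief sketch of the intended call pattern; the graph, node id, and property name are hypothetical, and `add_node` is assumed from the wider API.

```python
from raphtory import Graph

g = Graph()
g.add_node(1, "alice", properties={"score": 1.5})  # add_node assumed

# PropType of the latest "score" value; falls back to the metadata
# property of the same name when no temporal value exists
dtype = g.node("alice").properties.get_dtype_of("score")
```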
Provides access to time entries and derived views such as timestamps, datetimes, event ids, and intervals.""" def __contains__(self, key): @@ -7272,7 +6584,7 @@ class History(object): HistoryTimestamp: Timestamp (as int) view of this history. """ -class HistoryTimestamp(object): +class HistoryTimestamp(object): """History view that exposes timestamps in milliseconds since the Unix epoch.""" def __contains__(self, key): @@ -7345,7 +6657,7 @@ class HistoryTimestamp(object): list[int]: List of timestamps. """ -class HistoryDateTime(object): +class HistoryDateTime(object): """History view that exposes UTC datetimes.""" def __contains__(self, key): @@ -7411,7 +6723,7 @@ class HistoryDateTime(object): TimeError: If a timestamp cannot be converted to a datetime. """ -class HistoryEventId(object): +class HistoryEventId(object): """History view that exposes event ids of time entries. They are used for ordering within the same timestamp.""" def __contains__(self, key): @@ -7484,7 +6796,7 @@ class HistoryEventId(object): list[int]: List of event ids. """ -class Intervals(object): +class Intervals(object): """View over the intervals between consecutive timestamps, expressed in milliseconds.""" def __contains__(self, key): @@ -7589,7 +6901,8 @@ class Intervals(object): list[int]: List of intervals in milliseconds. """ -class WindowSet(object): +class WindowSet(object): + def __iter__(self): """Implement iter(self).""" @@ -7607,7 +6920,8 @@ class WindowSet(object): Iterable: The time index. """ -class IndexSpecBuilder(object): +class IndexSpecBuilder(object): + def __new__(cls, graph) -> IndexSpecBuilder: """Create and return a new object. See help(type) for accurate signature.""" @@ -7711,7 +7025,8 @@ class IndexSpecBuilder(object): dict[str, Any]: """ -class IndexSpec(object): +class IndexSpec(object): + def __repr__(self): """Return repr(self).""" diff --git a/python/python/raphtory/algorithms/__init__.pyi b/python/python/raphtory/algorithms/__init__.pyi index 3beca213a3..e7f5fee0f5 100644 --- a/python/python/raphtory/algorithms/__init__.pyi +++ b/python/python/raphtory/algorithms/__init__.pyi @@ -1,7 +1,6 @@ """ Algorithmic functions that can be run on Raphtory graphs """ - from __future__ import annotations ############################################################################### @@ -24,64 +23,14 @@ import numpy as np from numpy.typing import NDArray from datetime import datetime from pandas import DataFrame +from pyarrow import DataType from os import PathLike import networkx as nx # type: ignore import pyvis # type: ignore from raphtory.iterables import * -__all__ = [ - "dijkstra_single_source_shortest_paths", - "global_reciprocity", - "betweenness_centrality", - "all_local_reciprocity", - "triplet_count", - "local_triangle_count", - "average_degree", - "directed_graph_density", - "degree_centrality", - "max_degree", - "min_degree", - "max_out_degree", - "max_in_degree", - "min_out_degree", - "min_in_degree", - "pagerank", - "single_source_shortest_path", - "global_clustering_coefficient", - "temporally_reachable_nodes", - "temporal_bipartite_graph_projection", - "local_clustering_coefficient", - "local_clustering_coefficient_batch", - "weakly_connected_components", - "strongly_connected_components", - "in_components", - "in_component", - "out_components", - "out_component", - "fast_rp", - "global_temporal_three_node_motif", - "global_temporal_three_node_motif_multi", - "local_temporal_three_node_motifs", - "hits", - "balance", - "label_propagation", - "k_core", - "temporal_SEIR", - 
"louvain", - "fruchterman_reingold", - "cohesive_fruchterman_reingold", - "max_weight_matching", - "Matching", - "Infected", -] - -def dijkstra_single_source_shortest_paths( - graph: GraphView, - source: NodeInput, - targets: list[NodeInput], - direction: Direction = "both", - weight: str = "weight", -) -> NodeStateWeightedSP: +__all__ = ['dijkstra_single_source_shortest_paths', 'global_reciprocity', 'betweenness_centrality', 'all_local_reciprocity', 'triplet_count', 'local_triangle_count', 'average_degree', 'directed_graph_density', 'degree_centrality', 'max_degree', 'min_degree', 'max_out_degree', 'max_in_degree', 'min_out_degree', 'min_in_degree', 'pagerank', 'single_source_shortest_path', 'global_clustering_coefficient', 'temporally_reachable_nodes', 'temporal_bipartite_graph_projection', 'local_clustering_coefficient', 'local_clustering_coefficient_batch', 'weakly_connected_components', 'strongly_connected_components', 'in_components', 'in_component', 'out_components', 'out_component', 'fast_rp', 'global_temporal_three_node_motif', 'global_temporal_three_node_motif_multi', 'local_temporal_three_node_motifs', 'hits', 'balance', 'label_propagation', 'k_core', 'temporal_SEIR', 'louvain', 'fruchterman_reingold', 'cohesive_fruchterman_reingold', 'max_weight_matching', 'Matching', 'Infected'] +def dijkstra_single_source_shortest_paths(graph: GraphView, source: NodeInput, targets: list[NodeInput], direction: Direction = "both", weight: str = 'weight') -> NodeStateWeightedSP: """ Finds the shortest paths from a single source to multiple targets in a graph. @@ -111,9 +60,7 @@ def global_reciprocity(graph: GraphView) -> float: float: reciprocity of the graph between 0 and 1. """ -def betweenness_centrality( - graph: GraphView, k: Optional[int] = None, normalized: bool = True -) -> NodeStateF64: +def betweenness_centrality(graph: GraphView, k: Optional[int] = None, normalized: bool = True) -> NodeStateF64: """ Computes the betweenness centrality for nodes in a given graph. @@ -281,13 +228,7 @@ def min_in_degree(graph: GraphView) -> int: int: value of the smallest indegree """ -def pagerank( - graph: GraphView, - iter_count: int = 20, - max_diff: Optional[float] = None, - use_l2_norm: bool = True, - damping_factor: float = 0.85, -) -> NodeStateF64: +def pagerank(graph: GraphView, iter_count: int = 20, max_diff: Optional[float] = None, use_l2_norm: bool = True, damping_factor: float = 0.85) -> NodeStateF64: """ Pagerank -- pagerank centrality value of the nodes in a graph @@ -308,9 +249,7 @@ def pagerank( NodeStateF64: Mapping of nodes to their pagerank value. """ -def single_source_shortest_path( - graph: GraphView, source: NodeInput, cutoff: Optional[int] = None -) -> NodeStateNodes: +def single_source_shortest_path(graph: GraphView, source: NodeInput, cutoff: Optional[int] = None) -> NodeStateNodes: """ Calculates the single source shortest paths from a given source node. 
@@ -341,13 +280,7 @@ def global_clustering_coefficient(graph: GraphView) -> float: [`Triplet Count`](triplet_count) """ -def temporally_reachable_nodes( - graph: GraphView, - max_hops: int, - start_time: int, - seed_nodes: list[NodeInput], - stop_nodes: Optional[list[NodeInput]] = None, -) -> NodeStateReachability: +def temporally_reachable_nodes(graph: GraphView, max_hops: int, start_time: int, seed_nodes: list[NodeInput], stop_nodes: Optional[list[NodeInput]] = None) -> NodeStateReachability: """ Temporally reachable nodes -- the nodes that are reachable by a time respecting path followed out from a set of seed nodes at a starting time. @@ -366,9 +299,7 @@ def temporally_reachable_nodes( NodeStateReachability: Mapping of nodes to their reachability history. """ -def temporal_bipartite_graph_projection( - graph: GraphView, delta: int, pivot_type: str -) -> Graph: +def temporal_bipartite_graph_projection(graph: GraphView, delta: int, pivot_type: str) -> Graph: """ Projects a temporal bipartite graph into an undirected temporal graph over the pivot node type. Let `G` be a bipartite graph with node types `A` and `B`. Given `delta > 0`, the projection graph `G'` pivoting over type `B` nodes, will make a connection between nodes `n1` and `n2` (of type `A`) at time `(t1 + t2)/2` if they respectively have an edge at time `t1`, `t2` with the same node of type `B` in `G`, and `|t2-t1| < delta`. @@ -481,14 +412,7 @@ def out_component(node: Node) -> NodeStateUsize: NodeStateUsize: A NodeState mapping the nodes in the out-component to their distance from the starting node. """ -def fast_rp( - graph: GraphView, - embedding_dim: int, - normalization_strength: float, - iter_weights: list[float], - seed: Optional[int] = None, - threads: Optional[int] = None, -) -> NodeStateListF64: +def fast_rp(graph: GraphView, embedding_dim: int, normalization_strength: float, iter_weights: list[float], seed: Optional[int] = None, threads: Optional[int] = None) -> NodeStateListF64: """ Computes embedding vectors for each vertex of an undirected/bidirectional graph according to the Fast RP algorithm. Original Paper: https://doi.org/10.48550/arXiv.1908.11512 @@ -504,9 +428,7 @@ def fast_rp( NodeStateListF64: Mapping from nodes to embedding vectors. """ -def global_temporal_three_node_motif( - graph: GraphView, delta: int, threads: Optional[int] = None -) -> list[int]: +def global_temporal_three_node_motif(graph: GraphView, delta: int, threads: Optional[int] = None) -> list[int]: """ Computes the number of three edge, up-to-three node delta-temporal motifs in the graph, using the algorithm of Paranjape et al, Motifs in Temporal Networks (2017). We point the reader to this reference for more information on the algorithm and background, but provide a short summary below. @@ -555,9 +477,7 @@ def global_temporal_three_node_motif( """ -def global_temporal_three_node_motif_multi( - graph: GraphView, deltas: list[int], threads: Optional[int] = None -) -> list[list[int]]: +def global_temporal_three_node_motif_multi(graph: GraphView, deltas: list[int], threads: Optional[int] = None) -> list[list[int]]: """ Computes the global counts of three-edge up-to-three node temporal motifs for a range of timescales. See `global_temporal_three_node_motif` for an interpretation of each row returned. @@ -570,9 +490,7 @@ def global_temporal_three_node_motif_multi( list[list[int]]: A list of 40d arrays, each array is the motif count for a particular value of delta, returned in the order that the deltas were given as input. 
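For the motif counters above, a minimal sketch; `delta` is in the same units as the graph's timestamps, and the toy edges plus `add_edge` are assumptions, not part of this hunk.

```python
from raphtory import Graph, algorithms

g = Graph()
for t, src, dst in [(1, "a", "b"), (2, "b", "c"), (3, "c", "a")]:
    g.add_edge(t, src, dst)  # add_edge assumed from the wider API

# one 40-element motif count vector for a single delta
counts = algorithms.global_temporal_three_node_motif(g, delta=10)

# one 40-element row per delta, returned in input order
rows = algorithms.global_temporal_three_node_motif_multi(g, deltas=[5, 10])
```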
""" -def local_temporal_three_node_motifs( - graph: GraphView, delta: int, threads=None -) -> NodeStateMotifs: +def local_temporal_three_node_motifs(graph: GraphView, delta: int, threads=None) -> NodeStateMotifs: """ Computes the number of each type of motif that each node participates in. See global_temporal_three_node_motifs for a summary of the motifs involved. @@ -588,9 +506,7 @@ def local_temporal_three_node_motifs( the motif. For two node motifs, both constituent nodes count the motif. For triangles, all three constituent nodes count the motif. """ -def hits( - graph: GraphView, iter_count: int = 20, threads: Optional[int] = None -) -> NodeStateHits: +def hits(graph: GraphView, iter_count: int = 20, threads: Optional[int] = None) -> NodeStateHits: """ HITS (Hubs and Authority) Algorithm: @@ -609,9 +525,7 @@ def hits( NodeStateHits: A mapping from nodes their hub and authority scores """ -def balance( - graph: GraphView, name: str = "weight", direction: Direction = "both" -) -> NodeStateF64: +def balance(graph: GraphView, name: str = "weight", direction: Direction = "both") -> NodeStateF64: """ Sums the weights of edges in the graph based on the specified direction. @@ -630,9 +544,7 @@ def balance( """ -def label_propagation( - graph: GraphView, seed: Optional[bytes] = None -) -> list[set[Node]]: +def label_propagation(graph: GraphView, seed: Optional[bytes] = None) -> list[set[Node]]: """ Computes components using a label propagation algorithm @@ -645,9 +557,7 @@ def label_propagation( """ -def k_core( - graph: GraphView, k: int, iter_count: int, threads: Optional[int] = None -) -> list[Node]: +def k_core(graph: GraphView, k: int, iter_count: int, threads: Optional[int] = None) -> list[Node]: """ Determines which nodes are in the k-core for a given value of k @@ -662,15 +572,7 @@ def k_core( """ -def temporal_SEIR( - graph: GraphView, - seeds: int | float | list[NodeInput], - infection_prob: float, - initial_infection: int | str | datetime, - recovery_rate: float | None = None, - incubation_rate: float | None = None, - rng_seed: int | None = None, -) -> NodeStateSEIR: +def temporal_SEIR(graph: GraphView, seeds: int | float | list[NodeInput], infection_prob: float, initial_infection: int | str | datetime, recovery_rate: float | None = None, incubation_rate: float | None = None, rng_seed: int | None = None) -> NodeStateSEIR: """ Simulate an SEIR dynamic on the network @@ -700,12 +602,7 @@ def temporal_SEIR( """ -def louvain( - graph: GraphView, - resolution: float = 1.0, - weight_prop: str | None = None, - tol: None | float = None, -) -> NodeStateUsize: +def louvain(graph: GraphView, resolution: float = 1.0, weight_prop: str | None = None, tol: None | float = None) -> NodeStateUsize: """ Louvain algorithm for community detection @@ -719,14 +616,7 @@ def louvain( NodeStateUsize: Mapping of nodes to their community assignment """ -def fruchterman_reingold( - graph: GraphView, - iterations: int | None = 100, - scale: float | None = 1.0, - node_start_size: float | None = 1.0, - cooloff_factor: float | None = 0.95, - dt: float | None = 0.1, -) -> NodeLayout: +def fruchterman_reingold(graph: GraphView, iterations: int | None = 100, scale: float | None = 1.0, node_start_size: float | None = 1.0, cooloff_factor: float | None = 0.95, dt: float | None = 0.1) -> NodeLayout: """ Fruchterman Reingold layout algorithm @@ -742,14 +632,7 @@ def fruchterman_reingold( NodeLayout: A mapping from nodes to their [x, y] positions """ -def cohesive_fruchterman_reingold( - graph: GraphView, - iter_count: int 
= 100, - scale: float = 1.0, - node_start_size: float = 1.0, - cooloff_factor: float = 0.95, - dt: float = 0.1, -) -> NodeLayout: +def cohesive_fruchterman_reingold(graph: GraphView, iter_count: int = 100, scale: float = 1.0, node_start_size: float = 1.0, cooloff_factor: float = 0.95, dt: float = 0.1) -> NodeLayout: """ Cohesive version of `fruchterman_reingold` that adds virtual edges between isolated nodes Arguments: @@ -765,12 +648,7 @@ def cohesive_fruchterman_reingold( """ -def max_weight_matching( - graph: GraphView, - weight_prop: Optional[str] = None, - max_cardinality: bool = True, - verify_optimum_flag: bool = False, -) -> Matching: +def max_weight_matching(graph: GraphView, weight_prop: Optional[str] = None, max_cardinality: bool = True, verify_optimum_flag: bool = False) -> Matching: """ Compute a maximum-weighted matching in the general undirected weighted graph given by "edges". If `max_cardinality` is true, only @@ -807,7 +685,7 @@ def max_weight_matching( Matching: The matching """ -class Matching(object): +class Matching(object): """A Matching (i.e., a set of edges that do not share any nodes)""" def __bool__(self): @@ -879,7 +757,8 @@ class Matching(object): """ -class Infected(object): +class Infected(object): + def __repr__(self): """Return repr(self).""" diff --git a/python/python/raphtory/filter/__init__.pyi b/python/python/raphtory/filter/__init__.pyi index a87573a7db..0629934002 100644 --- a/python/python/raphtory/filter/__init__.pyi +++ b/python/python/raphtory/filter/__init__.pyi @@ -20,25 +20,15 @@ import numpy as np from numpy.typing import NDArray from datetime import datetime from pandas import DataFrame +from pyarrow import DataType from os import PathLike import networkx as nx # type: ignore import pyvis # type: ignore from raphtory.iterables import * -__all__ = [ - "FilterExpr", - "PropertyFilterOps", - "NodeFilterBuilder", - "Node", - "EdgeFilterOp", - "EdgeEndpoint", - "Edge", - "Property", - "Metadata", - "TemporalPropertyFilterBuilder", -] - -class FilterExpr(object): +__all__ = ['FilterExpr', 'PropertyFilterOps', 'NodeFilterBuilder', 'Node', 'EdgeFilterOp', 'EdgeEndpoint', 'Edge', 'Property', 'Metadata', 'TemporalPropertyFilterBuilder'] +class FilterExpr(object): + def __and__(self, value): """Return self&value.""" @@ -54,7 +44,8 @@ class FilterExpr(object): def __ror__(self, value): """Return value|self.""" -class PropertyFilterOps(object): +class PropertyFilterOps(object): + def __eq__(self, value): """Return self==value.""" @@ -76,7 +67,7 @@ class PropertyFilterOps(object): def contains(self, value) -> filter.FilterExpr: """ Returns a filter expression that checks if this object contains a specified property. - + Arguments: PropValue: @@ -84,9 +75,7 @@ class PropertyFilterOps(object): filter.FilterExpr: """ - def fuzzy_search( - self, prop_value: str, levenshtein_distance: int, prefix_match: bool - ) -> filter.FilterExpr: + def fuzzy_search(self, prop_value: str, levenshtein_distance: int, prefix_match: bool) -> filter.FilterExpr: """ Returns a filter expression that checks if the specified properties approximately match the specified string. @@ -96,7 +85,7 @@ class PropertyFilterOps(object): prop_value (str): Property to match against. levenshtein_distance (int): Maximum levenshtein distance between the specified prop_value and the result. prefix_match (bool): Enable prefix matching. 
- + Returns: filter.FilterExpr: """ @@ -104,7 +93,7 @@ class PropertyFilterOps(object): def is_in(self, values: list[PropValue]) -> filter.FilterExpr: """ Returns a filter expression that checks if a given value is in a specified iterable of properties. - + Arguments: values (list[PropValue]): @@ -115,7 +104,7 @@ class PropertyFilterOps(object): def is_none(self) -> filter.FilterExpr: """ Returns a filter expression that checks if a given value is none. - + Returns: filter.FilterExpr: """ @@ -123,7 +112,7 @@ class PropertyFilterOps(object): def is_not_in(self, values: list[PropValue]) -> filter.FilterExpr: """ Returns a filter expression that checks if a given value is not in a specified iterable of properties. - + Arguments: values (list[PropValue]): @@ -134,7 +123,7 @@ class PropertyFilterOps(object): def is_some(self) -> filter.FilterExpr: """ Returns a filter expression that checks if a given value is some. - + Returns: filter.FilterExpr: """ @@ -142,7 +131,7 @@ class PropertyFilterOps(object): def not_contains(self, value) -> filter.FilterExpr: """ Returns a filter expression that checks if this object does not contain a specified property. - + Arguments: PropValue: @@ -150,7 +139,7 @@ class PropertyFilterOps(object): filter.FilterExpr: """ -class NodeFilterBuilder(object): +class NodeFilterBuilder(object): """ A builder for constructing node filters @@ -186,9 +175,7 @@ class NodeFilterBuilder(object): filter.FilterExpr: """ - def fuzzy_search( - self, value, levenshtein_distance: int, prefix_match: bool - ) -> filter.FilterExpr: + def fuzzy_search(self, value, levenshtein_distance: int, prefix_match: bool) -> filter.FilterExpr: """ Returns a filter expression that checks if the specified properties approximately match the specified string. @@ -229,7 +216,7 @@ class NodeFilterBuilder(object): """ Returns a filter expression that checks if the specified iterable of strings does not contain a given value. - + Arguments: value (str): @@ -237,7 +224,8 @@ class NodeFilterBuilder(object): filter.FilterExpr: """ -class Node(object): +class Node(object): + @staticmethod def name() -> NodeFilterBuilder: """ @@ -256,7 +244,8 @@ class Node(object): NodeFilterBuilder: A filter builder for filtering by node type """ -class EdgeFilterOp(object): +class EdgeFilterOp(object): + def __eq__(self, value): """Return self==value.""" @@ -278,7 +267,7 @@ class EdgeFilterOp(object): def contains(self, value: str) -> filter.FilterExpr: """ Returns a filter expression that checks if a given value contains the specified string. - + Arguments: value (str): @@ -286,9 +275,7 @@ class EdgeFilterOp(object): filter.FilterExpr: """ - def fuzzy_search( - self, value, levenshtein_distance: int, prefix_match: bool - ) -> filter.FilterExpr: + def fuzzy_search(self, value, levenshtein_distance: int, prefix_match: bool) -> filter.FilterExpr: """ Returns a filter expression that checks if the specified properties approximately match the specified string. @@ -298,7 +285,7 @@ class EdgeFilterOp(object): prop_value (str): Property to match against. levenshtein_distance (int): Maximum levenshtein distance between the specified prop_value and the result. prefix_match (bool): Enable prefix matching. - + Returns: filter.FilterExpr: """ @@ -306,7 +293,7 @@ class EdgeFilterOp(object): def is_in(self, values: list[str]) -> filter.FilterExpr: """ Returns a filter expression that checks if a given value is contained within the specified iterable of strings. 
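A sketch of composing the filter builders in this file; how the resulting `FilterExpr` is applied to a graph view sits outside this hunk, and the property names are hypothetical.

```python
from raphtory import filter

# property filters compose with & / | via FilterExpr.__and__ / __or__
expr = filter.Property("score").is_in([1.0, 2.0]) & filter.Metadata("kind").is_some()

# node-name filters come from the NodeFilterBuilder
name_like = filter.Node.name().fuzzy_search(
    "alice", levenshtein_distance=1, prefix_match=False
)
```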
- + Arguments: values (list[str]): @@ -317,7 +304,7 @@ class EdgeFilterOp(object): def is_not_in(self, values: list[str]) -> filter.FilterExpr: """ Returns a filter expression that checks if a given value is not contained within the provided iterable of strings. - + Arguments: values (list[str]): @@ -328,7 +315,7 @@ class EdgeFilterOp(object): def not_contains(self, value: str) -> filter.FilterExpr: """ Returns a filter expression that checks if a given value does not contain the specified string. - + Arguments: value (str): @@ -336,16 +323,22 @@ class EdgeFilterOp(object): filter.FilterExpr: """ -class EdgeEndpoint(object): - def name(self): ... +class EdgeEndpoint(object): + + def name(self): + ... + +class Edge(object): -class Edge(object): @staticmethod - def dst(): ... + def dst(): + ... + @staticmethod - def src(): ... + def src(): + ... -class Property(PropertyFilterOps): +class Property(PropertyFilterOps): """ Construct a property filter @@ -356,9 +349,10 @@ class Property(PropertyFilterOps): def __new__(cls, name: str) -> Property: """Create and return a new object. See help(type) for accurate signature.""" - def temporal(self): ... + def temporal(self): + ... -class Metadata(PropertyFilterOps): +class Metadata(PropertyFilterOps): """ Construct a metadata filter @@ -369,6 +363,10 @@ class Metadata(PropertyFilterOps): def __new__(cls, name: str) -> Metadata: """Create and return a new object. See help(type) for accurate signature.""" -class TemporalPropertyFilterBuilder(object): - def any(self): ... - def latest(self): ... +class TemporalPropertyFilterBuilder(object): + + def any(self): + ... + + def latest(self): + ... diff --git a/python/python/raphtory/graph_gen/__init__.pyi b/python/python/raphtory/graph_gen/__init__.pyi index 9a77fd5dfe..23f524982d 100644 --- a/python/python/raphtory/graph_gen/__init__.pyi +++ b/python/python/raphtory/graph_gen/__init__.pyi @@ -1,7 +1,6 @@ """ Generate Raphtory graphs from attachment models """ - from __future__ import annotations ############################################################################### @@ -25,13 +24,13 @@ import numpy as np from numpy.typing import NDArray from datetime import datetime from pandas import DataFrame +from pyarrow import DataType from os import PathLike import networkx as nx # type: ignore import pyvis # type: ignore from raphtory.iterables import * -__all__ = ["random_attachment", "ba_preferential_attachment"] - +__all__ = ['random_attachment', 'ba_preferential_attachment'] def random_attachment(g: Any, nodes_to_add: Any, edges_per_step: Any, seed: Any = None): """ Generates a graph using the random attachment model @@ -50,9 +49,7 @@ def random_attachment(g: Any, nodes_to_add: Any, edges_per_step: Any, seed: Any None """ -def ba_preferential_attachment( - g: Any, nodes_to_add: Any, edges_per_step: Any, seed: Any = None -): +def ba_preferential_attachment(g: Any, nodes_to_add: Any, edges_per_step: Any, seed: Any = None): """ Generates a graph using the preferential attachment model. 
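A minimal sketch of the two generators above, run against a fresh graph (whether `ba_preferential_attachment` requires a pre-seeded graph is not stated in this hunk):

```python
from raphtory import Graph, graph_gen

g = Graph()
graph_gen.random_attachment(g, nodes_to_add=100, edges_per_step=2, seed=None)
graph_gen.ba_preferential_attachment(g, nodes_to_add=50, edges_per_step=3)
```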
diff --git a/python/python/raphtory/graph_loader/__init__.pyi b/python/python/raphtory/graph_loader/__init__.pyi index 192b78ebdd..c1704f4bc4 100644 --- a/python/python/raphtory/graph_loader/__init__.pyi +++ b/python/python/raphtory/graph_loader/__init__.pyi @@ -1,7 +1,6 @@ """ Load and save Raphtory graphs from/to file(s) """ - from __future__ import annotations ############################################################################### @@ -25,21 +24,13 @@ import numpy as np from numpy.typing import NDArray from datetime import datetime from pandas import DataFrame +from pyarrow import DataType from os import PathLike import networkx as nx # type: ignore import pyvis # type: ignore from raphtory.iterables import * -__all__ = [ - "lotr_graph", - "lotr_graph_with_props", - "neo4j_movie_graph", - "stable_coin_graph", - "reddit_hyperlink_graph", - "reddit_hyperlink_graph_local", - "karate_club_graph", -] - +__all__ = ['lotr_graph', 'lotr_graph_with_props', 'neo4j_movie_graph', 'stable_coin_graph', 'reddit_hyperlink_graph', 'reddit_hyperlink_graph_local', 'karate_club_graph'] def lotr_graph() -> Graph: """ Load the Lord of the Rings dataset into a graph. @@ -68,9 +59,7 @@ def lotr_graph_with_props() -> Graph: Graph: """ -def neo4j_movie_graph( - uri: str, username: str, password: str, database: str = ... -) -> Graph: +def neo4j_movie_graph(uri: str, username: str, password: str, database: str = ...) -> Graph: """ Returns the neo4j movie graph example. diff --git a/python/python/raphtory/graphql/__init__.pyi b/python/python/raphtory/graphql/__init__.pyi index a95c9f6125..b38bec1fc3 100644 --- a/python/python/raphtory/graphql/__init__.pyi +++ b/python/python/raphtory/graphql/__init__.pyi @@ -20,31 +20,14 @@ import numpy as np from numpy.typing import NDArray from datetime import datetime from pandas import DataFrame +from pyarrow import DataType from os import PathLike import networkx as nx # type: ignore import pyvis # type: ignore from raphtory.iterables import * -__all__ = [ - "GraphServer", - "RunningGraphServer", - "RaphtoryClient", - "RemoteGraph", - "RemoteEdge", - "RemoteNode", - "RemoteNodeAddition", - "RemoteUpdate", - "RemoteEdgeAddition", - "RemoteIndexSpec", - "PropsInput", - "SomePropertySpec", - "AllPropertySpec", - "encode_graph", - "decode_graph", - "schema", -] - -class GraphServer(object): +__all__ = ['GraphServer', 'RunningGraphServer', 'RaphtoryClient', 'RemoteGraph', 'RemoteEdge', 'RemoteNode', 'RemoteNodeAddition', 'RemoteUpdate', 'RemoteEdgeAddition', 'RemoteIndexSpec', 'PropsInput', 'SomePropertySpec', 'AllPropertySpec', 'encode_graph', 'decode_graph', 'schema'] +class GraphServer(object): """ A class for defining and running a Raphtory GraphQL server @@ -63,22 +46,7 @@ class GraphServer(object): create_index: """ - def __new__( - cls, - work_dir: str | PathLike, - cache_capacity: Optional[int] = None, - cache_tti_seconds: Optional[int] = None, - log_level: Optional[str] = None, - tracing: Optional[bool] = None, - tracing_level=None, - otlp_agent_host: Optional[str] = None, - otlp_agent_port: Optional[str] = None, - otlp_tracing_service_name: Optional[str] = None, - auth_public_key: Any = None, - auth_enabled_for_reads: Any = None, - config_path: Optional[str | PathLike] = None, - create_index: Any = None, - ) -> GraphServer: + def __new__(cls, work_dir: str | PathLike, cache_capacity: Optional[int] = None, cache_tti_seconds: Optional[int] = None, log_level: Optional[str] = None, tracing: Optional[bool] = None, tracing_level=None, otlp_agent_host: Optional[str] = 
None, otlp_agent_port: Optional[str] = None, otlp_tracing_service_name: Optional[str] = None, auth_public_key: Any = None, auth_enabled_for_reads: Any = None, config_path: Optional[str | PathLike] = None, create_index: Any = None) -> GraphServer: """Create and return a new object. See help(type) for accurate signature.""" def run(self, port: int = 1736, timeout_ms: int = 180000) -> None: @@ -93,13 +61,7 @@ class GraphServer(object): None: """ - def set_embeddings( - self, - cache: str, - embedding: Optional[Callable] = None, - nodes: bool | str = True, - edges: bool | str = True, - ) -> GraphServer: + def set_embeddings(self, cache: str, embedding: Optional[Callable] = None, nodes: bool | str = True, edges: bool | str = True) -> GraphServer: """ Setup the server to vectorise graphs with a default template. @@ -135,9 +97,7 @@ class GraphServer(object): GraphServer: The server with indexing disabled """ - def with_vectorised_graphs( - self, graph_names: list[str], nodes: bool | str = True, edges: bool | str = True - ) -> GraphServer: + def with_vectorised_graphs(self, graph_names: list[str], nodes: bool | str = True, edges: bool | str = True) -> GraphServer: """ Vectorise a subset of the graphs of the server. @@ -150,11 +110,15 @@ class GraphServer(object): GraphServer: A new server object containing the vectorised graphs. """ -class RunningGraphServer(object): +class RunningGraphServer(object): """A Raphtory server handler that also enables querying the server""" - def __enter__(self): ... - def __exit__(self, _exc_type, _exc_val, _exc_tb): ... + def __enter__(self): + ... + + def __exit__(self, _exc_type, _exc_val, _exc_tb): + ... + def get_client(self): """ Get the client for the server @@ -171,7 +135,7 @@ class RunningGraphServer(object): None: """ -class RaphtoryClient(object): +class RaphtoryClient(object): """ A client for handling GraphQL operations in the context of Raphtory. @@ -253,9 +217,7 @@ class RaphtoryClient(object): """ - def query( - self, query: str, variables: Optional[dict[str, Any]] = None - ) -> dict[str, Any]: + def query(self, query: str, variables: Optional[dict[str, Any]] = None) -> dict[str, Any]: """ Make a GraphQL query against the server. @@ -293,9 +255,7 @@ class RaphtoryClient(object): """ - def send_graph( - self, path: str, graph: Graph | PersistentGraph, overwrite: bool = False - ) -> dict[str, Any]: + def send_graph(self, path: str, graph: Graph | PersistentGraph, overwrite: bool = False) -> dict[str, Any]: """ Send a graph to the server @@ -308,9 +268,7 @@ class RaphtoryClient(object): dict[str, Any]: The data field from the graphQL response after executing the mutation. """ - def upload_graph( - self, path: str, file_path: str, overwrite: bool = False - ) -> dict[str, Any]: + def upload_graph(self, path: str, file_path: str, overwrite: bool = False) -> dict[str, Any]: """ Upload graph file from a path file_path on the client @@ -323,15 +281,9 @@ class RaphtoryClient(object): dict[str, Any]: The data field from the graphQL response after executing the mutation. 
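A sketch of the client calls documented above; the `RaphtoryClient` constructor argument and the GraphQL query shape are assumptions, since neither appears in this hunk.

```python
from raphtory import Graph
from raphtory.graphql import RaphtoryClient

client = RaphtoryClient("http://localhost:1736")  # assumed constructor
client.send_graph("examples/g", Graph(), overwrite=True)
data = client.query("{ graphs { name } }")  # hypothetical query shape
```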
""" -class RemoteGraph(object): - def add_edge( - self, - timestamp: int | str | datetime, - src: str | int, - dst: str | int, - properties: Optional[dict] = None, - layer: Optional[str] = None, - ) -> RemoteEdge: +class RemoteGraph(object): + + def add_edge(self, timestamp: int | str | datetime, src: str | int, dst: str | int, properties: Optional[dict] = None, layer: Optional[str] = None) -> RemoteEdge: """ Adds a new edge with the given source and destination nodes and properties to the remote graph. @@ -368,13 +320,7 @@ class RemoteGraph(object): None: """ - def add_node( - self, - timestamp: int | str | datetime, - id: str | int, - properties: Optional[dict] = None, - node_type: Optional[str] = None, - ) -> RemoteNode: + def add_node(self, timestamp: int | str | datetime, id: str | int, properties: Optional[dict] = None, node_type: Optional[str] = None) -> RemoteNode: """ Adds a new node with the given id and properties to the remote graph. @@ -411,13 +357,7 @@ class RemoteGraph(object): None: """ - def create_node( - self, - timestamp: int | str | datetime, - id: str | int, - properties: Optional[dict] = None, - node_type: Optional[str] = None, - ) -> RemoteNode: + def create_node(self, timestamp: int | str | datetime, id: str | int, properties: Optional[dict] = None, node_type: Optional[str] = None) -> RemoteNode: """ Create a new node with the given id and properties to the remote graph and fail if the node already exists. @@ -431,13 +371,7 @@ class RemoteGraph(object): RemoteNode: the new remote node """ - def delete_edge( - self, - timestamp: int, - src: str | int, - dst: str | int, - layer: Optional[str] = None, - ) -> RemoteEdge: + def delete_edge(self, timestamp: int, src: str | int, dst: str | int, layer: Optional[str] = None) -> RemoteEdge: """ Deletes an edge in the remote graph, given the timestamp, src and dst nodes and layer (optional) @@ -485,7 +419,7 @@ class RemoteGraph(object): None: """ -class RemoteEdge(object): +class RemoteEdge(object): """ A remote edge reference @@ -494,9 +428,7 @@ class RemoteEdge(object): and [RemoteGraph.delete_edge][raphtory.graphql.RemoteGraph.delete_edge]. """ - def add_metadata( - self, properties: dict[str, PropValue], layer: Optional[str] = None - ) -> None: + def add_metadata(self, properties: dict[str, PropValue], layer: Optional[str] = None) -> None: """ Add metadata to the edge within the remote graph. This function is used to add metadata to an edge that does not @@ -510,12 +442,7 @@ class RemoteEdge(object): None: """ - def add_updates( - self, - t: int | str | datetime, - properties: Optional[dict[str, PropValue]] = None, - layer: Optional[str] = None, - ) -> None: + def add_updates(self, t: int | str | datetime, properties: Optional[dict[str, PropValue]] = None, layer: Optional[str] = None) -> None: """ Add updates to an edge in the remote graph at a specified time. @@ -546,9 +473,7 @@ class RemoteEdge(object): GraphError: If the operation fails. """ - def update_metadata( - self, properties: dict[str, PropValue], layer: Optional[str] = None - ) -> None: + def update_metadata(self, properties: dict[str, PropValue], layer: Optional[str] = None) -> None: """ Update metadata of an edge in the remote graph overwriting existing values. This function is used to add properties to an edge that does not @@ -562,7 +487,8 @@ class RemoteEdge(object): None: """ -class RemoteNode(object): +class RemoteNode(object): + def add_metadata(self, properties: dict[str, PropValue]) -> None: """ Add metadata to a node in the remote graph. 
@@ -576,9 +502,7 @@ class RemoteNode(object): None: """ - def add_updates( - self, t: int | str | datetime, properties: Optional[dict[str, PropValue]] = None - ) -> None: + def add_updates(self, t: int | str | datetime, properties: Optional[dict[str, PropValue]] = None) -> None: """ Add updates to a node in the remote graph at a specified time. This function allows for the addition of property updates to a node within the graph. The updates are time-stamped, meaning they are applied at the specified time. @@ -616,7 +540,7 @@ class RemoteNode(object): None: """ -class RemoteNodeAddition(object): +class RemoteNodeAddition(object): """ Node addition update @@ -627,16 +551,10 @@ class RemoteNodeAddition(object): updates (list[RemoteUpdate], optional): the temporal updates """ - def __new__( - cls, - name: GID, - node_type: Optional[str] = None, - metadata: Optional[PropInput] = None, - updates: Optional[list[RemoteUpdate]] = None, - ) -> RemoteNodeAddition: + def __new__(cls, name: GID, node_type: Optional[str] = None, metadata: Optional[PropInput] = None, updates: Optional[list[RemoteUpdate]] = None) -> RemoteNodeAddition: """Create and return a new object. See help(type) for accurate signature.""" -class RemoteUpdate(object): +class RemoteUpdate(object): """ A temporal update @@ -645,12 +563,10 @@ class RemoteUpdate(object): properties (PropInput, optional): the properties for the update """ - def __new__( - cls, time: TimeInput, properties: Optional[PropInput] = None - ) -> RemoteUpdate: + def __new__(cls, time: TimeInput, properties: Optional[PropInput] = None) -> RemoteUpdate: """Create and return a new object. See help(type) for accurate signature.""" -class RemoteEdgeAddition(object): +class RemoteEdgeAddition(object): """ An edge update @@ -662,17 +578,10 @@ class RemoteEdgeAddition(object): updates (list[RemoteUpdate], optional): the temporal updates for the edge """ - def __new__( - cls, - src: GID, - dst: GID, - layer: Optional[str] = None, - metadata: Optional[PropInput] = None, - updates: Optional[list[RemoteUpdate]] = None, - ) -> RemoteEdgeAddition: + def __new__(cls, src: GID, dst: GID, layer: Optional[str] = None, metadata: Optional[PropInput] = None, updates: Optional[list[RemoteUpdate]] = None) -> RemoteEdgeAddition: """Create and return a new object. See help(type) for accurate signature.""" -class RemoteIndexSpec(object): +class RemoteIndexSpec(object): """ Create a RemoteIndexSpec specifying which node and edge properties to index. @@ -684,7 +593,7 @@ class RemoteIndexSpec(object): def __new__(cls, node_props: PropsInput, edge_props: PropsInput) -> RemoteIndexSpec: """Create and return a new object. See help(type) for accurate signature.""" -class PropsInput(object): +class PropsInput(object): """ Create a PropsInput by choosing to include all/some properties explicitly. @@ -696,14 +605,10 @@ class PropsInput(object): ValueError: If neither all and some are specified. """ - def __new__( - cls, - all: Optional[AllPropertySpec] = None, - some: Optional[SomePropertySpec] = None, - ) -> PropsInput: + def __new__(cls, all: Optional[AllPropertySpec] = None, some: Optional[SomePropertySpec] = None) -> PropsInput: """Create and return a new object. See help(type) for accurate signature.""" -class SomePropertySpec(object): +class SomePropertySpec(object): """ Create a SomePropertySpec by explicitly listing metadata and/or temporal property names. @@ -712,12 +617,10 @@ class SomePropertySpec(object): properties (list[str]): Temporal property names. Defaults to []. 
""" - def __new__( - cls, metadata: list[str] = [], properties: list[str] = [] - ) -> SomePropertySpec: + def __new__(cls, metadata: list[str] = [], properties: list[str] = []) -> SomePropertySpec: """Create and return a new object. See help(type) for accurate signature.""" -class AllPropertySpec(object): +class AllPropertySpec(object): """ Specifies that **all** properties should be included when creating an index. Use one of the predefined variants: ALL , ALL_METADATA , or ALL_TEMPORAL . diff --git a/python/python/raphtory/iterables/__init__.pyi b/python/python/raphtory/iterables/__init__.pyi index 271dfa8518..12233100d5 100644 --- a/python/python/raphtory/iterables/__init__.pyi +++ b/python/python/raphtory/iterables/__init__.pyi @@ -21,60 +21,14 @@ import numpy as np from numpy.typing import NDArray from datetime import datetime from pandas import DataFrame +from pyarrow import DataType from os import PathLike import networkx as nx # type: ignore import pyvis # type: ignore -__all__ = [ - "NestedUtcDateTimeIterable", - "NestedGIDIterable", - "GIDIterable", - "StringIterable", - "OptionArcStringIterable", - "UsizeIterable", - "OptionI64Iterable", - "NestedOptionArcStringIterable", - "NestedStringIterable", - "NestedOptionI64Iterable", - "NestedI64VecIterable", - "NestedUsizeIterable", - "BoolIterable", - "ArcStringIterable", - "NestedVecUtcDateTimeIterable", - "OptionVecUtcDateTimeIterable", - "GIDGIDIterable", - "NestedGIDGIDIterable", - "NestedBoolIterable", - "U64Iterable", - "OptionUtcDateTimeIterable", - "ArcStringVecIterable", - "NestedArcStringVecIterable", - "NestedEventTimeIterable", - "NestedArcStringIterable", - "NestedOptionEventTimeIterable", - "NestedHistoryIterable", - "EventTimeIterable", - "OptionEventTimeIterable", - "HistoryIterable", - "HistoryTimestampIterable", - "IntervalsIterable", - "HistoryEventIdIterable", - "HistoryDateTimeIterable", - "OptionUsizeIterable", - "ResultOptionUtcDateTimeIterable", - "I64Iterable", - "ResultUtcDateTimeIterable", - "NestedHistoryTimestampIterable", - "NestedIntervalsIterable", - "NestedHistoryEventIdIterable", - "NestedHistoryDateTimeIterable", - "NestedOptionUsizeIterable", - "NestedResultOptionUtcDateTimeIterable", - "NestedI64Iterable", - "NestedResultUtcDateTimeIterable", -] - -class NestedUtcDateTimeIterable(object): +__all__ = ['NestedUtcDateTimeIterable', 'NestedGIDIterable', 'GIDIterable', 'StringIterable', 'OptionArcStringIterable', 'UsizeIterable', 'OptionI64Iterable', 'NestedOptionArcStringIterable', 'NestedStringIterable', 'NestedOptionI64Iterable', 'NestedI64VecIterable', 'NestedUsizeIterable', 'BoolIterable', 'ArcStringIterable', 'NestedVecUtcDateTimeIterable', 'OptionVecUtcDateTimeIterable', 'GIDGIDIterable', 'NestedGIDGIDIterable', 'NestedBoolIterable', 'U64Iterable', 'OptionUtcDateTimeIterable', 'ArcStringVecIterable', 'NestedArcStringVecIterable', 'NestedEventTimeIterable', 'NestedArcStringIterable', 'NestedOptionEventTimeIterable', 'NestedHistoryIterable', 'EventTimeIterable', 'OptionEventTimeIterable', 'HistoryIterable', 'HistoryTimestampIterable', 'IntervalsIterable', 'HistoryEventIdIterable', 'HistoryDateTimeIterable', 'OptionUsizeIterable', 'ResultOptionUtcDateTimeIterable', 'I64Iterable', 'ResultUtcDateTimeIterable', 'NestedHistoryTimestampIterable', 'NestedIntervalsIterable', 'NestedHistoryEventIdIterable', 'NestedHistoryDateTimeIterable', 'NestedOptionUsizeIterable', 'NestedResultOptionUtcDateTimeIterable', 'NestedI64Iterable', 'NestedResultUtcDateTimeIterable'] +class NestedUtcDateTimeIterable(object): + 
def __eq__(self, value): """Return self==value.""" @@ -102,9 +56,11 @@ class NestedUtcDateTimeIterable(object): def __repr__(self): """Return repr(self).""" - def collect(self): ... + def collect(self): + ... + +class NestedGIDIterable(object): -class NestedGIDIterable(object): def __eq__(self, value): """Return self==value.""" @@ -132,11 +88,17 @@ class NestedGIDIterable(object): def __repr__(self): """Return repr(self).""" - def collect(self): ... - def max(self): ... - def min(self): ... + def collect(self): + ... + + def max(self): + ... + + def min(self): + ... + +class GIDIterable(object): -class GIDIterable(object): def __eq__(self, value): """Return self==value.""" @@ -164,11 +126,17 @@ class GIDIterable(object): def __repr__(self): """Return repr(self).""" - def collect(self): ... - def max(self): ... - def min(self): ... + def collect(self): + ... + + def max(self): + ... + + def min(self): + ... + +class StringIterable(object): -class StringIterable(object): def __eq__(self, value): """Return self==value.""" @@ -196,9 +164,11 @@ class StringIterable(object): def __repr__(self): """Return repr(self).""" - def collect(self): ... + def collect(self): + ... + +class OptionArcStringIterable(object): -class OptionArcStringIterable(object): def __eq__(self, value): """Return self==value.""" @@ -226,9 +196,11 @@ class OptionArcStringIterable(object): def __repr__(self): """Return repr(self).""" - def collect(self): ... + def collect(self): + ... + +class UsizeIterable(object): -class UsizeIterable(object): def __eq__(self, value): """Return self==value.""" @@ -256,13 +228,23 @@ class UsizeIterable(object): def __repr__(self): """Return repr(self).""" - def collect(self): ... - def max(self): ... - def mean(self): ... - def min(self): ... - def sum(self): ... + def collect(self): + ... + + def max(self): + ... + + def mean(self): + ... + + def min(self): + ... + + def sum(self): + ... + +class OptionI64Iterable(object): -class OptionI64Iterable(object): def __eq__(self, value): """Return self==value.""" @@ -290,11 +272,17 @@ class OptionI64Iterable(object): def __repr__(self): """Return repr(self).""" - def collect(self): ... - def max(self): ... - def min(self): ... + def collect(self): + ... + + def max(self): + ... + + def min(self): + ... + +class NestedOptionArcStringIterable(object): -class NestedOptionArcStringIterable(object): def __eq__(self, value): """Return self==value.""" @@ -322,9 +310,11 @@ class NestedOptionArcStringIterable(object): def __repr__(self): """Return repr(self).""" - def collect(self): ... + def collect(self): + ... + +class NestedStringIterable(object): -class NestedStringIterable(object): def __eq__(self, value): """Return self==value.""" @@ -352,9 +342,11 @@ class NestedStringIterable(object): def __repr__(self): """Return repr(self).""" - def collect(self): ... + def collect(self): + ... + +class NestedOptionI64Iterable(object): -class NestedOptionI64Iterable(object): def __eq__(self, value): """Return self==value.""" @@ -382,11 +374,17 @@ class NestedOptionI64Iterable(object): def __repr__(self): """Return repr(self).""" - def collect(self): ... - def max(self): ... - def min(self): ... + def collect(self): + ... + + def max(self): + ... + + def min(self): + ... + +class NestedI64VecIterable(object): -class NestedI64VecIterable(object): def __eq__(self, value): """Return self==value.""" @@ -414,9 +412,11 @@ class NestedI64VecIterable(object): def __repr__(self): """Return repr(self).""" - def collect(self): ... + def collect(self): + ... 
+ +class NestedUsizeIterable(object): -class NestedUsizeIterable(object): def __eq__(self, value): """Return self==value.""" @@ -444,13 +444,23 @@ class NestedUsizeIterable(object): def __repr__(self): """Return repr(self).""" - def collect(self): ... - def max(self): ... - def mean(self): ... - def min(self): ... - def sum(self): ... + def collect(self): + ... + + def max(self): + ... + + def mean(self): + ... + + def min(self): + ... + + def sum(self): + ... + +class BoolIterable(object): -class BoolIterable(object): def __eq__(self, value): """Return self==value.""" @@ -478,9 +488,11 @@ class BoolIterable(object): def __repr__(self): """Return repr(self).""" - def collect(self): ... + def collect(self): + ... + +class ArcStringIterable(object): -class ArcStringIterable(object): def __iter__(self): """Implement iter(self).""" @@ -490,9 +502,11 @@ class ArcStringIterable(object): def __repr__(self): """Return repr(self).""" - def collect(self): ... + def collect(self): + ... + +class NestedVecUtcDateTimeIterable(object): -class NestedVecUtcDateTimeIterable(object): def __eq__(self, value): """Return self==value.""" @@ -520,9 +534,11 @@ class NestedVecUtcDateTimeIterable(object): def __repr__(self): """Return repr(self).""" - def collect(self): ... + def collect(self): + ... + +class OptionVecUtcDateTimeIterable(object): -class OptionVecUtcDateTimeIterable(object): def __eq__(self, value): """Return self==value.""" @@ -550,9 +566,11 @@ class OptionVecUtcDateTimeIterable(object): def __repr__(self): """Return repr(self).""" - def collect(self): ... + def collect(self): + ... + +class GIDGIDIterable(object): -class GIDGIDIterable(object): def __eq__(self, value): """Return self==value.""" @@ -580,11 +598,17 @@ class GIDGIDIterable(object): def __repr__(self): """Return repr(self).""" - def collect(self): ... - def max(self): ... - def min(self): ... + def collect(self): + ... + + def max(self): + ... + + def min(self): + ... + +class NestedGIDGIDIterable(object): -class NestedGIDGIDIterable(object): def __eq__(self, value): """Return self==value.""" @@ -612,11 +636,17 @@ class NestedGIDGIDIterable(object): def __repr__(self): """Return repr(self).""" - def collect(self): ... - def max(self): ... - def min(self): ... + def collect(self): + ... + + def max(self): + ... + + def min(self): + ... + +class NestedBoolIterable(object): -class NestedBoolIterable(object): def __eq__(self, value): """Return self==value.""" @@ -644,9 +674,11 @@ class NestedBoolIterable(object): def __repr__(self): """Return repr(self).""" - def collect(self): ... + def collect(self): + ... + +class U64Iterable(object): -class U64Iterable(object): def __eq__(self, value): """Return self==value.""" @@ -674,13 +706,23 @@ class U64Iterable(object): def __repr__(self): """Return repr(self).""" - def collect(self): ... - def max(self): ... - def mean(self): ... - def min(self): ... - def sum(self): ... + def collect(self): + ... + + def max(self): + ... + + def mean(self): + ... + + def min(self): + ... + + def sum(self): + ... + +class OptionUtcDateTimeIterable(object): -class OptionUtcDateTimeIterable(object): def __eq__(self, value): """Return self==value.""" @@ -708,9 +750,11 @@ class OptionUtcDateTimeIterable(object): def __repr__(self): """Return repr(self).""" - def collect(self): ... + def collect(self): + ... 
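The `Nested*` variants mirror the flat iterables but yield one inner sequence per outer entity, for example one list per node when querying over each node's neighbours. A sketch, assuming `nested` is a `NestedUsizeIterable` from such a query (hypothetical):

    per_node = nested.collect()                   # e.g. [[1, 2], [3], ...]
    busiest = max(per_node, key=sum, default=[])  # heaviest inner sequence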
+ +class ArcStringVecIterable(object): -class ArcStringVecIterable(object): def __eq__(self, value): """Return self==value.""" @@ -738,9 +782,11 @@ class ArcStringVecIterable(object): def __repr__(self): """Return repr(self).""" - def collect(self): ... + def collect(self): + ... + +class NestedArcStringVecIterable(object): -class NestedArcStringVecIterable(object): def __eq__(self, value): """Return self==value.""" @@ -768,9 +814,11 @@ class NestedArcStringVecIterable(object): def __repr__(self): """Return repr(self).""" - def collect(self): ... + def collect(self): + ... + +class NestedEventTimeIterable(object): -class NestedEventTimeIterable(object): def __eq__(self, value): """Return self==value.""" @@ -798,7 +846,9 @@ class NestedEventTimeIterable(object): def __repr__(self): """Return repr(self).""" - def collect(self): ... + def collect(self): + ... + @property def dt(self) -> NestedResultUtcDateTimeIterable: """ @@ -820,8 +870,12 @@ class NestedEventTimeIterable(object): NestedUsizeIterable: Nested iterable of event ids associated to each EventTime. """ - def max(self): ... - def min(self): ... + def max(self): + ... + + def min(self): + ... + @property def t(self) -> NestedI64Iterable: """ @@ -831,7 +885,8 @@ class NestedEventTimeIterable(object): NestedI64Iterable: Nested iterable of millisecond timestamps since the Unix epoch for each EventTime. """ -class NestedArcStringIterable(object): +class NestedArcStringIterable(object): + def __iter__(self): """Implement iter(self).""" @@ -841,9 +896,11 @@ class NestedArcStringIterable(object): def __repr__(self): """Return repr(self).""" - def collect(self): ... + def collect(self): + ... + +class NestedOptionEventTimeIterable(object): -class NestedOptionEventTimeIterable(object): def __eq__(self, value): """Return self==value.""" @@ -871,7 +928,9 @@ class NestedOptionEventTimeIterable(object): def __repr__(self): """Return repr(self).""" - def collect(self): ... + def collect(self): + ... + @property def dt(self) -> NestedResultOptionUtcDateTimeIterable: """ @@ -893,8 +952,12 @@ class NestedOptionEventTimeIterable(object): NestedOptionUsizeIterable: Nested iterable of event ids associated to each EventTime, if available. """ - def max(self): ... - def min(self): ... + def max(self): + ... + + def min(self): + ... + @property def t(self) -> NestedOptionI64Iterable: """ @@ -904,7 +967,8 @@ class NestedOptionEventTimeIterable(object): NestedOptionI64Iterable: Nested iterable of millisecond timestamps since the Unix epoch for each EventTime, if available. """ -class NestedHistoryIterable(object): +class NestedHistoryIterable(object): + def __iter__(self): """Implement iter(self).""" @@ -966,7 +1030,8 @@ class NestedHistoryIterable(object): NestedHistoryTimestampIterable: Iterable of iterables of HistoryTimestamp objects. """ -class EventTimeIterable(object): +class EventTimeIterable(object): + def __eq__(self, value): """Return self==value.""" @@ -994,7 +1059,9 @@ class EventTimeIterable(object): def __repr__(self): """Return repr(self).""" - def collect(self): ... + def collect(self): + ... + @property def dt(self) -> ResultUtcDateTimeIterable: """ @@ -1016,8 +1083,12 @@ class EventTimeIterable(object): UsizeIterable: Iterable of event ids associated to each EventTime. """ - def max(self): ... - def min(self): ... + def max(self): + ... + + def min(self): + ... 
+ @property def t(self) -> I64Iterable: """ @@ -1027,7 +1098,8 @@ class EventTimeIterable(object): I64Iterable: Iterable of millisecond timestamps since the Unix epoch for each EventTime. """ -class OptionEventTimeIterable(object): +class OptionEventTimeIterable(object): + def __eq__(self, value): """Return self==value.""" @@ -1055,7 +1127,9 @@ class OptionEventTimeIterable(object): def __repr__(self): """Return repr(self).""" - def collect(self): ... + def collect(self): + ... + @property def dt(self) -> ResultOptionUtcDateTimeIterable: """ @@ -1077,8 +1151,12 @@ class OptionEventTimeIterable(object): OptionUsizeIterable: Iterable of event ids associated to each EventTime, if available. """ - def max(self): ... - def min(self): ... + def max(self): + ... + + def min(self): + ... + @property def t(self) -> OptionI64Iterable: """ @@ -1088,7 +1166,8 @@ class OptionEventTimeIterable(object): OptionI64Iterable: Iterable of millisecond timestamps since the Unix epoch for each EventTime, if available. """ -class HistoryIterable(object): +class HistoryIterable(object): + def __iter__(self): """Implement iter(self).""" @@ -1150,7 +1229,8 @@ class HistoryIterable(object): HistoryTimestampIterable: Iterable of HistoryTimestamp objects, one for each item. """ -class HistoryTimestampIterable(object): +class HistoryTimestampIterable(object): + def __iter__(self): """Implement iter(self).""" @@ -1176,7 +1256,8 @@ class HistoryTimestampIterable(object): list[list[int]]: List of timestamps in milliseconds per history. """ -class IntervalsIterable(object): +class IntervalsIterable(object): + def __iter__(self): """Implement iter(self).""" @@ -1202,7 +1283,8 @@ class IntervalsIterable(object): list[list[int]]: List of intervals per history. """ -class HistoryEventIdIterable(object): +class HistoryEventIdIterable(object): + def __iter__(self): """Implement iter(self).""" @@ -1228,7 +1310,8 @@ class HistoryEventIdIterable(object): list[list[int]]: List of event ids per history. """ -class HistoryDateTimeIterable(object): +class HistoryDateTimeIterable(object): + def __iter__(self): """Implement iter(self).""" @@ -1249,7 +1332,8 @@ class HistoryDateTimeIterable(object): TimeError: If a timestamp cannot be converted to a datetime. """ -class OptionUsizeIterable(object): +class OptionUsizeIterable(object): + def __eq__(self, value): """Return self==value.""" @@ -1277,11 +1361,17 @@ class OptionUsizeIterable(object): def __repr__(self): """Return repr(self).""" - def collect(self): ... - def max(self): ... - def min(self): ... + def collect(self): + ... + + def max(self): + ... + + def min(self): + ... + +class ResultOptionUtcDateTimeIterable(object): -class ResultOptionUtcDateTimeIterable(object): def __iter__(self): """Implement iter(self).""" @@ -1291,9 +1381,11 @@ class ResultOptionUtcDateTimeIterable(object): def __repr__(self): """Return repr(self).""" - def collect(self): ... + def collect(self): + ... + +class I64Iterable(object): -class I64Iterable(object): def __eq__(self, value): """Return self==value.""" @@ -1321,13 +1413,23 @@ class I64Iterable(object): def __repr__(self): """Return repr(self).""" - def collect(self): ... - def max(self): ... - def mean(self): ... - def min(self): ... - def sum(self): ... + def collect(self): + ... + + def max(self): + ... + + def mean(self): + ... + + def min(self): + ... + + def sum(self): + ... 
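Per the stubs above, an `EventTimeIterable` projects the same events onto raw and datetime views: `.t` yields an `I64Iterable` of millisecond timestamps and `.dt` a `ResultUtcDateTimeIterable` whose conversion to `datetime` can fail. A sketch, assuming `times` is an `EventTimeIterable` (hypothetical):

    millis = times.t.collect()    # milliseconds since the Unix epoch
    stamps = times.dt.collect()   # UTC datetimes; the Result* naming signals fallible conversion
    span = (times.min(), times.max())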
+ +class ResultUtcDateTimeIterable(object): -class ResultUtcDateTimeIterable(object): def __iter__(self): """Implement iter(self).""" @@ -1337,9 +1439,11 @@ class ResultUtcDateTimeIterable(object): def __repr__(self): """Return repr(self).""" - def collect(self): ... + def collect(self): + ... + +class NestedHistoryTimestampIterable(object): -class NestedHistoryTimestampIterable(object): def __iter__(self): """Implement iter(self).""" @@ -1381,7 +1485,8 @@ class NestedHistoryTimestampIterable(object): list[list[list[int]]]: List of timestamps in milliseconds per nested history. """ -class NestedIntervalsIterable(object): +class NestedIntervalsIterable(object): + def __iter__(self): """Implement iter(self).""" @@ -1423,7 +1528,8 @@ class NestedIntervalsIterable(object): list[list[list[int]]]: List of intervals per nested history. """ -class NestedHistoryEventIdIterable(object): +class NestedHistoryEventIdIterable(object): + def __iter__(self): """Implement iter(self).""" @@ -1465,7 +1571,8 @@ class NestedHistoryEventIdIterable(object): list[list[list[int]]]: List of event ids per nested history. """ -class NestedHistoryDateTimeIterable(object): +class NestedHistoryDateTimeIterable(object): + def __iter__(self): """Implement iter(self).""" @@ -1497,7 +1604,8 @@ class NestedHistoryDateTimeIterable(object): TimeError: If a timestamp cannot be converted to a datetime. """ -class NestedOptionUsizeIterable(object): +class NestedOptionUsizeIterable(object): + def __eq__(self, value): """Return self==value.""" @@ -1525,11 +1633,17 @@ class NestedOptionUsizeIterable(object): def __repr__(self): """Return repr(self).""" - def collect(self): ... - def max(self): ... - def min(self): ... + def collect(self): + ... + + def max(self): + ... + + def min(self): + ... + +class NestedResultOptionUtcDateTimeIterable(object): -class NestedResultOptionUtcDateTimeIterable(object): def __iter__(self): """Implement iter(self).""" @@ -1539,9 +1653,11 @@ class NestedResultOptionUtcDateTimeIterable(object): def __repr__(self): """Return repr(self).""" - def collect(self): ... + def collect(self): + ... + +class NestedI64Iterable(object): -class NestedI64Iterable(object): def __eq__(self, value): """Return self==value.""" @@ -1569,13 +1685,23 @@ class NestedI64Iterable(object): def __repr__(self): """Return repr(self).""" - def collect(self): ... - def max(self): ... - def mean(self): ... - def min(self): ... - def sum(self): ... + def collect(self): + ... + + def max(self): + ... + + def mean(self): + ... + + def min(self): + ... + + def sum(self): + ... + +class NestedResultUtcDateTimeIterable(object): -class NestedResultUtcDateTimeIterable(object): def __iter__(self): """Implement iter(self).""" @@ -1585,4 +1711,5 @@ class NestedResultUtcDateTimeIterable(object): def __repr__(self): """Return repr(self).""" - def collect(self): ... + def collect(self): + ... 
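The node_state stubs in the next file reflow `expanding` and `rolling` into single-line signatures returning a `WindowSet`. A minimal usage sketch under those signatures, assuming `g` is a Raphtory `Graph` with timestamped edges (the data, and the iteration behaviour of the resulting `WindowSet`, are assumptions; the stubs only pin down the constructor signatures):

    deg = g.nodes.degree()                  # DegreeView, per the stubs below
    daily = deg.rolling(window="1 day")     # WindowSet of windowed degree views
    growing = deg.expanding(step="6 hours") # expanding WindowSet over the same view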
diff --git a/python/python/raphtory/node_state/__init__.pyi b/python/python/raphtory/node_state/__init__.pyi index 3fc06a5864..358a56e8a2 100644 --- a/python/python/raphtory/node_state/__init__.pyi +++ b/python/python/raphtory/node_state/__init__.pyi @@ -20,63 +20,15 @@ import numpy as np from numpy.typing import NDArray from datetime import datetime from pandas import DataFrame +from pyarrow import DataType from os import PathLike import networkx as nx # type: ignore import pyvis # type: ignore from raphtory.iterables import * -__all__ = [ - "NodeGroups", - "DegreeView", - "NodeStateUsize", - "NodeStateOptionUsize", - "NodeStateU64", - "NodeStateOptionI64", - "NodeStateOptionEventTime", - "NodeStateOptionDateTime", - "IdView", - "NodeStateGID", - "EarliestTimeView", - "EarliestTimestampView", - "EarliestEventIdView", - "EarliestDateTimeView", - "LatestTimeView", - "LatestTimestampView", - "LatestEventIdView", - "LatestDateTimeView", - "NameView", - "NodeStateString", - "HistoryView", - "HistoryTimestampView", - "HistoryDateTimeView", - "HistoryEventIdView", - "IntervalsView", - "IntervalsFloatView", - "IntervalsIntegerView", - "EdgeHistoryCountView", - "UsizeIterable", - "NodeTypeView", - "NodeStateOptionStr", - "NodeStateListDateTime", - "NodeStateWeightedSP", - "NodeStateF64", - "NodeStateOptionF64", - "NodeStateNodes", - "NodeStateReachability", - "NodeStateListF64", - "NodeStateMotifs", - "NodeStateHits", - "NodeStateHistory", - "NodeStateHistoryTimestamp", - "NodeStateHistoryDateTime", - "NodeStateHistoryEventId", - "NodeStateIntervals", - "NodeStateSEIR", - "NodeLayout", - "NodeStateF64String", -] - -class NodeGroups(object): +__all__ = ['NodeGroups', 'DegreeView', 'NodeStateUsize', 'NodeStateOptionUsize', 'NodeStateU64', 'NodeStateOptionI64', 'NodeStateOptionEventTime', 'NodeStateOptionDateTime', 'IdView', 'NodeStateGID', 'EarliestTimeView', 'EarliestTimestampView', 'EarliestEventIdView', 'EarliestDateTimeView', 'LatestTimeView', 'LatestTimestampView', 'LatestEventIdView', 'LatestDateTimeView', 'NameView', 'NodeStateString', 'HistoryView', 'HistoryTimestampView', 'HistoryDateTimeView', 'HistoryEventIdView', 'IntervalsView', 'IntervalsFloatView', 'IntervalsIntegerView', 'EdgeHistoryCountView', 'UsizeIterable', 'NodeTypeView', 'NodeStateOptionStr', 'NodeStateListDateTime', 'NodeStateWeightedSP', 'NodeStateF64', 'NodeStateOptionF64', 'NodeStateNodes', 'NodeStateReachability', 'NodeStateListF64', 'NodeStateMotifs', 'NodeStateHits', 'NodeStateHistory', 'NodeStateHistoryTimestamp', 'NodeStateHistoryDateTime', 'NodeStateHistoryEventId', 'NodeStateIntervals', 'NodeStateSEIR', 'NodeLayout', 'NodeStateF64String'] +class NodeGroups(object): + def __bool__(self): """True if self else False""" @@ -119,7 +71,7 @@ class NodeGroups(object): Iterator[Tuple[Any, GraphView]]: Iterator over subgraphs with corresponding value """ -class DegreeView(object): +class DegreeView(object): """A lazy view over node values""" def __eq__(self, value): @@ -272,9 +224,7 @@ class DegreeView(object): DegreeView: The layered view """ - def expanding( - self, step: int | str, alignment_unit: str | None = None - ) -> WindowSet: + def expanding(self, step: int | str, alignment_unit: str | None = None) -> WindowSet: """ Creates a `WindowSet` with the given `step` size using an expanding window. 
@@ -429,12 +379,7 @@ class DegreeView(object): Nodes: The nodes """ - def rolling( - self, - window: int | str, - step: int | str | None = None, - alignment_unit: str | None = None, - ) -> WindowSet: + def rolling(self, window: int | str, step: int | str | None = None, alignment_unit: str | None = None) -> WindowSet: """ Creates a `WindowSet` with the given `window` size and optional `step` using a rolling window. If `alignment_unit` is not "unaligned" and a `step` larger than `window` is provided, some time entries @@ -614,7 +559,8 @@ class DegreeView(object): Optional[int]: """ -class NodeStateUsize(object): +class NodeStateUsize(object): + def __eq__(self, value): """Return self==value.""" @@ -807,7 +753,8 @@ class NodeStateUsize(object): Iterator[int]: Iterator over values """ -class NodeStateOptionUsize(object): +class NodeStateOptionUsize(object): + def __eq__(self, value): """Return self==value.""" @@ -849,9 +796,7 @@ class NodeStateOptionUsize(object): NodeStateOptionUsize: The k smallest values as a node state """ - def get( - self, node: NodeInput, default: Optional[Optional[int]] = None - ) -> Optional[Optional[int]]: + def get(self, node: NodeInput, default: Optional[Optional[int]] = None) -> Optional[Optional[int]]: """ Get value for node @@ -985,7 +930,8 @@ class NodeStateOptionUsize(object): Iterator[Optional[int]]: Iterator over values """ -class NodeStateU64(object): +class NodeStateU64(object): + def __eq__(self, value): """Return self==value.""" @@ -1170,7 +1116,8 @@ class NodeStateU64(object): Iterator[int]: Iterator over values """ -class NodeStateOptionI64(object): +class NodeStateOptionI64(object): + def __eq__(self, value): """Return self==value.""" @@ -1212,9 +1159,7 @@ class NodeStateOptionI64(object): NodeStateOptionI64: The k smallest values as a node state """ - def get( - self, node: NodeInput, default: Optional[Optional[int]] = None - ) -> Optional[Optional[int]]: + def get(self, node: NodeInput, default: Optional[Optional[int]] = None) -> Optional[Optional[int]]: """ Get value for node @@ -1348,7 +1293,8 @@ class NodeStateOptionI64(object): Iterator[Optional[int]]: Iterator over values """ -class NodeStateOptionEventTime(object): +class NodeStateOptionEventTime(object): + def __eq__(self, value): """Return self==value.""" @@ -1390,9 +1336,7 @@ class NodeStateOptionEventTime(object): NodeStateOptionEventTime: The k smallest values as a node state """ - def get( - self, node: NodeInput, default: Optional[Optional[EventTime]] = None - ) -> Optional[Optional[EventTime]]: + def get(self, node: NodeInput, default: Optional[Optional[EventTime]] = None) -> Optional[Optional[EventTime]]: """ Get value for node @@ -1526,7 +1470,8 @@ class NodeStateOptionEventTime(object): Iterator[Optional[EventTime]]: Iterator over values """ -class NodeStateOptionDateTime(object): +class NodeStateOptionDateTime(object): + def __eq__(self, value): """Return self==value.""" @@ -1568,9 +1513,7 @@ class NodeStateOptionDateTime(object): NodeStateOptionDateTime: The k smallest values as a node state """ - def get( - self, node: NodeInput, default: Optional[Optional[datetime]] = None - ) -> Optional[Optional[datetime]]: + def get(self, node: NodeInput, default: Optional[Optional[datetime]] = None) -> Optional[Optional[datetime]]: """ Get value for node @@ -1704,7 +1647,7 @@ class NodeStateOptionDateTime(object): Iterator[Optional[datetime]]: Iterator over values """ -class IdView(object): +class IdView(object): """A lazy view over node values""" def __eq__(self, value): @@ -1890,7 +1833,8 
@@ class IdView(object): Iterator[GID]: Iterator over values """ -class NodeStateGID(object): +class NodeStateGID(object): + def __eq__(self, value): """Return self==value.""" @@ -2058,7 +2002,7 @@ class NodeStateGID(object): Iterator[GID]: Iterator over values """ -class EarliestTimeView(object): +class EarliestTimeView(object): """A lazy view over node values""" def __eq__(self, value): @@ -2229,9 +2173,7 @@ class EarliestTimeView(object): EarliestTimeView: The layered view """ - def expanding( - self, step: int | str, alignment_unit: str | None = None - ) -> WindowSet: + def expanding(self, step: int | str, alignment_unit: str | None = None) -> WindowSet: """ Creates a `WindowSet` with the given `step` size using an expanding window. @@ -2250,9 +2192,7 @@ class EarliestTimeView(object): WindowSet: A `WindowSet` object. """ - def get( - self, node: NodeInput, default: Optional[Optional[EventTime]] = None - ) -> Optional[Optional[EventTime]]: + def get(self, node: NodeInput, default: Optional[Optional[EventTime]] = None) -> Optional[Optional[EventTime]]: """ Get value for node @@ -2380,12 +2320,7 @@ class EarliestTimeView(object): Nodes: The nodes """ - def rolling( - self, - window: int | str, - step: int | str | None = None, - alignment_unit: str | None = None, - ) -> WindowSet: + def rolling(self, window: int | str, step: int | str | None = None, alignment_unit: str | None = None) -> WindowSet: """ Creates a `WindowSet` with the given `window` size and optional `step` using a rolling window. If `alignment_unit` is not "unaligned" and a `step` larger than `window` is provided, some time entries @@ -2565,7 +2500,7 @@ class EarliestTimeView(object): Optional[int]: """ -class EarliestTimestampView(object): +class EarliestTimestampView(object): """A lazy view over node values""" def __eq__(self, value): @@ -2718,9 +2653,7 @@ class EarliestTimestampView(object): EarliestTimestampView: The layered view """ - def expanding( - self, step: int | str, alignment_unit: str | None = None - ) -> WindowSet: + def expanding(self, step: int | str, alignment_unit: str | None = None) -> WindowSet: """ Creates a `WindowSet` with the given `step` size using an expanding window. @@ -2739,9 +2672,7 @@ class EarliestTimestampView(object): WindowSet: A `WindowSet` object. """ - def get( - self, node: NodeInput, default: Optional[Optional[int]] = None - ) -> Optional[Optional[int]]: + def get(self, node: NodeInput, default: Optional[Optional[int]] = None) -> Optional[Optional[int]]: """ Get value for node @@ -2869,12 +2800,7 @@ class EarliestTimestampView(object): Nodes: The nodes """ - def rolling( - self, - window: int | str, - step: int | str | None = None, - alignment_unit: str | None = None, - ) -> WindowSet: + def rolling(self, window: int | str, step: int | str | None = None, alignment_unit: str | None = None) -> WindowSet: """ Creates a `WindowSet` with the given `window` size and optional `step` using a rolling window. 
If `alignment_unit` is not "unaligned" and a `step` larger than `window` is provided, some time entries @@ -3045,7 +2971,7 @@ class EarliestTimestampView(object): Optional[int]: """ -class EarliestEventIdView(object): +class EarliestEventIdView(object): """A lazy view over node values""" def __eq__(self, value): @@ -3198,9 +3124,7 @@ class EarliestEventIdView(object): EarliestEventIdView: The layered view """ - def expanding( - self, step: int | str, alignment_unit: str | None = None - ) -> WindowSet: + def expanding(self, step: int | str, alignment_unit: str | None = None) -> WindowSet: """ Creates a `WindowSet` with the given `step` size using an expanding window. @@ -3219,9 +3143,7 @@ class EarliestEventIdView(object): WindowSet: A `WindowSet` object. """ - def get( - self, node: NodeInput, default: Optional[Optional[int]] = None - ) -> Optional[Optional[int]]: + def get(self, node: NodeInput, default: Optional[Optional[int]] = None) -> Optional[Optional[int]]: """ Get value for node @@ -3349,12 +3271,7 @@ class EarliestEventIdView(object): Nodes: The nodes """ - def rolling( - self, - window: int | str, - step: int | str | None = None, - alignment_unit: str | None = None, - ) -> WindowSet: + def rolling(self, window: int | str, step: int | str | None = None, alignment_unit: str | None = None) -> WindowSet: """ Creates a `WindowSet` with the given `window` size and optional `step` using a rolling window. If `alignment_unit` is not "unaligned" and a `step` larger than `window` is provided, some time entries @@ -3525,7 +3442,7 @@ class EarliestEventIdView(object): Optional[int]: """ -class EarliestDateTimeView(object): +class EarliestDateTimeView(object): """A lazy view over EarliestDateTime values for each node.""" def __eq__(self, value): @@ -3694,9 +3611,7 @@ class EarliestDateTimeView(object): EarliestDateTimeView: The layered view """ - def expanding( - self, step: int | str, alignment_unit: str | None = None - ) -> WindowSet: + def expanding(self, step: int | str, alignment_unit: str | None = None) -> WindowSet: """ Creates a `WindowSet` with the given `step` size using an expanding window. @@ -3858,12 +3773,7 @@ class EarliestDateTimeView(object): Nodes: The nodes """ - def rolling( - self, - window: int | str, - step: int | str | None = None, - alignment_unit: str | None = None, - ) -> WindowSet: + def rolling(self, window: int | str, step: int | str | None = None, alignment_unit: str | None = None) -> WindowSet: """ Creates a `WindowSet` with the given `window` size and optional `step` using a rolling window. If `alignment_unit` is not "unaligned" and a `step` larger than `window` is provided, some time entries @@ -4050,7 +3960,7 @@ class EarliestDateTimeView(object): Optional[int]: """ -class LatestTimeView(object): +class LatestTimeView(object): """A lazy view over node values""" def __eq__(self, value): @@ -4221,9 +4131,7 @@ class LatestTimeView(object): LatestTimeView: The layered view """ - def expanding( - self, step: int | str, alignment_unit: str | None = None - ) -> WindowSet: + def expanding(self, step: int | str, alignment_unit: str | None = None) -> WindowSet: """ Creates a `WindowSet` with the given `step` size using an expanding window. @@ -4242,9 +4150,7 @@ class LatestTimeView(object): WindowSet: A `WindowSet` object. 
""" - def get( - self, node: NodeInput, default: Optional[Optional[int]] = None - ) -> Optional[Optional[int]]: + def get(self, node: NodeInput, default: Optional[Optional[int]] = None) -> Optional[Optional[int]]: """ Get value for node @@ -4372,12 +4278,7 @@ class LatestTimeView(object): Nodes: The nodes """ - def rolling( - self, - window: int | str, - step: int | str | None = None, - alignment_unit: str | None = None, - ) -> WindowSet: + def rolling(self, window: int | str, step: int | str | None = None, alignment_unit: str | None = None) -> WindowSet: """ Creates a `WindowSet` with the given `window` size and optional `step` using a rolling window. If `alignment_unit` is not "unaligned" and a `step` larger than `window` is provided, some time entries @@ -4557,7 +4458,7 @@ class LatestTimeView(object): Optional[int]: """ -class LatestTimestampView(object): +class LatestTimestampView(object): """A lazy view over node values""" def __eq__(self, value): @@ -4710,9 +4611,7 @@ class LatestTimestampView(object): LatestTimestampView: The layered view """ - def expanding( - self, step: int | str, alignment_unit: str | None = None - ) -> WindowSet: + def expanding(self, step: int | str, alignment_unit: str | None = None) -> WindowSet: """ Creates a `WindowSet` with the given `step` size using an expanding window. @@ -4731,9 +4630,7 @@ class LatestTimestampView(object): WindowSet: A `WindowSet` object. """ - def get( - self, node: NodeInput, default: Optional[Optional[int]] = None - ) -> Optional[Optional[int]]: + def get(self, node: NodeInput, default: Optional[Optional[int]] = None) -> Optional[Optional[int]]: """ Get value for node @@ -4861,12 +4758,7 @@ class LatestTimestampView(object): Nodes: The nodes """ - def rolling( - self, - window: int | str, - step: int | str | None = None, - alignment_unit: str | None = None, - ) -> WindowSet: + def rolling(self, window: int | str, step: int | str | None = None, alignment_unit: str | None = None) -> WindowSet: """ Creates a `WindowSet` with the given `window` size and optional `step` using a rolling window. If `alignment_unit` is not "unaligned" and a `step` larger than `window` is provided, some time entries @@ -5037,7 +4929,7 @@ class LatestTimestampView(object): Optional[int]: """ -class LatestEventIdView(object): +class LatestEventIdView(object): """A lazy view over node values""" def __eq__(self, value): @@ -5190,9 +5082,7 @@ class LatestEventIdView(object): LatestEventIdView: The layered view """ - def expanding( - self, step: int | str, alignment_unit: str | None = None - ) -> WindowSet: + def expanding(self, step: int | str, alignment_unit: str | None = None) -> WindowSet: """ Creates a `WindowSet` with the given `step` size using an expanding window. @@ -5211,9 +5101,7 @@ class LatestEventIdView(object): WindowSet: A `WindowSet` object. """ - def get( - self, node: NodeInput, default: Optional[Optional[int]] = None - ) -> Optional[Optional[int]]: + def get(self, node: NodeInput, default: Optional[Optional[int]] = None) -> Optional[Optional[int]]: """ Get value for node @@ -5341,12 +5229,7 @@ class LatestEventIdView(object): Nodes: The nodes """ - def rolling( - self, - window: int | str, - step: int | str | None = None, - alignment_unit: str | None = None, - ) -> WindowSet: + def rolling(self, window: int | str, step: int | str | None = None, alignment_unit: str | None = None) -> WindowSet: """ Creates a `WindowSet` with the given `window` size and optional `step` using a rolling window. 
If `alignment_unit` is not "unaligned" and a `step` larger than `window` is provided, some time entries @@ -5517,7 +5400,7 @@ class LatestEventIdView(object): Optional[int]: """ -class LatestDateTimeView(object): +class LatestDateTimeView(object): """A lazy view over LatestDateTime values for each node.""" def __eq__(self, value): @@ -5686,9 +5569,7 @@ class LatestDateTimeView(object): LatestDateTimeView: The layered view """ - def expanding( - self, step: int | str, alignment_unit: str | None = None - ) -> WindowSet: + def expanding(self, step: int | str, alignment_unit: str | None = None) -> WindowSet: """ Creates a `WindowSet` with the given `step` size using an expanding window. @@ -5707,9 +5588,7 @@ class LatestDateTimeView(object): WindowSet: A `WindowSet` object. """ - def get( - self, node: NodeInput, default: Optional[datetime] = None - ) -> Optional[datetime]: + def get(self, node: NodeInput, default: Optional[datetime] = None) -> Optional[datetime]: """ Get value for node @@ -5852,12 +5731,7 @@ class LatestDateTimeView(object): Nodes: The nodes """ - def rolling( - self, - window: int | str, - step: int | str | None = None, - alignment_unit: str | None = None, - ) -> WindowSet: + def rolling(self, window: int | str, step: int | str | None = None, alignment_unit: str | None = None) -> WindowSet: """ Creates a `WindowSet` with the given `window` size and optional `step` using a rolling window. If `alignment_unit` is not "unaligned" and a `step` larger than `window` is provided, some time entries @@ -6044,7 +5918,7 @@ class LatestDateTimeView(object): Optional[int]: """ -class NameView(object): +class NameView(object): """A lazy view over node values""" def __eq__(self, value): @@ -6238,7 +6112,8 @@ class NameView(object): Iterator[str]: Iterator over values """ -class NodeStateString(object): +class NodeStateString(object): + def __eq__(self, value): """Return self==value.""" @@ -6414,7 +6289,7 @@ class NodeStateString(object): Iterator[str]: Iterator over values """ -class HistoryView(object): +class HistoryView(object): """A lazy view over History objects for each node.""" def __eq__(self, value): @@ -6590,9 +6465,7 @@ class HistoryView(object): HistoryView: The layered view """ - def expanding( - self, step: int | str, alignment_unit: str | None = None - ) -> WindowSet: + def expanding(self, step: int | str, alignment_unit: str | None = None) -> WindowSet: """ Creates a `WindowSet` with the given `step` size using an expanding window. @@ -6619,9 +6492,7 @@ class HistoryView(object): History: a history object containing all time entries """ - def get( - self, node: NodeInput, default: Optional[History] = None - ) -> Optional[History]: + def get(self, node: NodeInput, default: Optional[History] = None) -> Optional[History]: """ Get value for node @@ -6708,12 +6579,7 @@ class HistoryView(object): Nodes: The nodes """ - def rolling( - self, - window: int | str, - step: int | str | None = None, - alignment_unit: str | None = None, - ) -> WindowSet: + def rolling(self, window: int | str, step: int | str | None = None, alignment_unit: str | None = None) -> WindowSet: """ Creates a `WindowSet` with the given `window` size and optional `step` using a rolling window. 
If `alignment_unit` is not "unaligned" and a `step` larger than `window` is provided, some time entries @@ -6871,7 +6737,7 @@ class HistoryView(object): Optional[int]: """ -class HistoryTimestampView(object): +class HistoryTimestampView(object): """A lazy view over node values""" def __eq__(self, value): @@ -6920,9 +6786,7 @@ class HistoryTimestampView(object): NodeStateHistoryTimestamp: the computed `NodeState` """ - def get( - self, node: NodeInput, default: Optional[HistoryTimestamp] = None - ) -> Optional[HistoryTimestamp]: + def get(self, node: NodeInput, default: Optional[HistoryTimestamp] = None) -> Optional[HistoryTimestamp]: """ Get value for node @@ -6977,7 +6841,7 @@ class HistoryTimestampView(object): Iterator[HistoryTimestamp]: Iterator over values """ -class HistoryDateTimeView(object): +class HistoryDateTimeView(object): """A lazy view over node values""" def __eq__(self, value): @@ -7026,9 +6890,7 @@ class HistoryDateTimeView(object): NodeStateHistoryDateTime: the computed `NodeState` """ - def get( - self, node: NodeInput, default: Optional[HistoryDateTime] = None - ) -> Optional[HistoryDateTime]: + def get(self, node: NodeInput, default: Optional[HistoryDateTime] = None) -> Optional[HistoryDateTime]: """ Get value for node @@ -7083,7 +6945,7 @@ class HistoryDateTimeView(object): Iterator[HistoryDateTime]: Iterator over values """ -class HistoryEventIdView(object): +class HistoryEventIdView(object): """A lazy view over node values""" def __eq__(self, value): @@ -7132,9 +6994,7 @@ class HistoryEventIdView(object): NodeStateHistoryEventId: the computed `NodeState` """ - def get( - self, node: NodeInput, default: Optional[HistoryEventId] = None - ) -> Optional[HistoryEventId]: + def get(self, node: NodeInput, default: Optional[HistoryEventId] = None) -> Optional[HistoryEventId]: """ Get value for node @@ -7189,7 +7049,7 @@ class HistoryEventIdView(object): Iterator[HistoryEventId]: Iterator over values """ -class IntervalsView(object): +class IntervalsView(object): """A lazy view over node values""" def __eq__(self, value): @@ -7238,9 +7098,7 @@ class IntervalsView(object): NodeStateIntervals: the computed `NodeState` """ - def get( - self, node: NodeInput, default: Optional[Intervals] = None - ) -> Optional[Intervals]: + def get(self, node: NodeInput, default: Optional[Intervals] = None) -> Optional[Intervals]: """ Get value for node @@ -7327,7 +7185,7 @@ class IntervalsView(object): Iterator[Intervals]: Iterator over values """ -class IntervalsFloatView(object): +class IntervalsFloatView(object): """A lazy view over node values""" def __eq__(self, value): @@ -7387,9 +7245,7 @@ class IntervalsFloatView(object): NodeStateOptionF64: the computed `NodeState` """ - def get( - self, node: NodeInput, default: Optional[Optional[float]] = None - ) -> Optional[Optional[float]]: + def get(self, node: NodeInput, default: Optional[Optional[float]] = None) -> Optional[Optional[float]]: """ Get value for node @@ -7515,7 +7371,7 @@ class IntervalsFloatView(object): Iterator[Optional[float]]: Iterator over values """ -class IntervalsIntegerView(object): +class IntervalsIntegerView(object): """A lazy view over node values""" def __eq__(self, value): @@ -7575,9 +7431,7 @@ class IntervalsIntegerView(object): NodeStateOptionI64: the computed `NodeState` """ - def get( - self, node: NodeInput, default: Optional[Optional[int]] = None - ) -> Optional[Optional[int]]: + def get(self, node: NodeInput, default: Optional[Optional[int]] = None) -> Optional[Optional[int]]: """ Get value for node @@ 
-7703,7 +7557,7 @@ class IntervalsIntegerView(object): Iterator[Optional[int]]: Iterator over values """ -class EdgeHistoryCountView(object): +class EdgeHistoryCountView(object): """A lazy view over node values""" def __eq__(self, value): @@ -7856,9 +7710,7 @@ class EdgeHistoryCountView(object): EdgeHistoryCountView: The layered view """ - def expanding( - self, step: int | str, alignment_unit: str | None = None - ) -> WindowSet: + def expanding(self, step: int | str, alignment_unit: str | None = None) -> WindowSet: """ Creates a `WindowSet` with the given `step` size using an expanding window. @@ -8005,12 +7857,7 @@ class EdgeHistoryCountView(object): Nodes: The nodes """ - def rolling( - self, - window: int | str, - step: int | str | None = None, - alignment_unit: str | None = None, - ) -> WindowSet: + def rolling(self, window: int | str, step: int | str | None = None, alignment_unit: str | None = None) -> WindowSet: """ Creates a `WindowSet` with the given `window` size and optional `step` using a rolling window. If `alignment_unit` is not "unaligned" and a `step` larger than `window` is provided, some time entries @@ -8190,7 +8037,8 @@ class EdgeHistoryCountView(object): Optional[int]: """ -class UsizeIterable(object): +class UsizeIterable(object): + def __eq__(self, value): """Return self==value.""" @@ -8218,13 +8066,22 @@ class UsizeIterable(object): def __repr__(self): """Return repr(self).""" - def collect(self): ... - def max(self): ... - def mean(self): ... - def min(self): ... - def sum(self): ... + def collect(self): + ... + + def max(self): + ... + + def mean(self): + ... -class NodeTypeView(object): + def min(self): + ... + + def sum(self): + ... + +class NodeTypeView(object): """A lazy view over node values""" def __eq__(self, value): @@ -8284,9 +8141,7 @@ class NodeTypeView(object): NodeStateOptionStr: the computed `NodeState` """ - def get( - self, node: NodeInput, default: Optional[Optional[str]] = None - ) -> Optional[Optional[str]]: + def get(self, node: NodeInput, default: Optional[Optional[str]] = None) -> Optional[Optional[str]]: """ Get value for node @@ -8420,7 +8275,8 @@ class NodeTypeView(object): Iterator[Optional[str]]: Iterator over values """ -class NodeStateOptionStr(object): +class NodeStateOptionStr(object): + def __eq__(self, value): """Return self==value.""" @@ -8462,9 +8318,7 @@ class NodeStateOptionStr(object): NodeStateOptionStr: The k smallest values as a node state """ - def get( - self, node: NodeInput, default: Optional[Optional[str]] = None - ) -> Optional[Optional[str]]: + def get(self, node: NodeInput, default: Optional[Optional[str]] = None) -> Optional[Optional[str]]: """ Get value for node @@ -8598,7 +8452,8 @@ class NodeStateOptionStr(object): Iterator[Optional[str]]: Iterator over values """ -class NodeStateListDateTime(object): +class NodeStateListDateTime(object): + def __eq__(self, value): """Return self==value.""" @@ -8640,9 +8495,7 @@ class NodeStateListDateTime(object): NodeStateListDateTime: The k smallest values as a node state """ - def get( - self, node: NodeInput, default: Optional[list[datetime]] = None - ) -> Optional[list[datetime]]: + def get(self, node: NodeInput, default: Optional[list[datetime]] = None) -> Optional[list[datetime]]: """ Get value for node @@ -8768,7 +8621,8 @@ class NodeStateListDateTime(object): Iterator[list[datetime]]: Iterator over values """ -class NodeStateWeightedSP(object): +class NodeStateWeightedSP(object): + def __eq__(self, value): """Return self==value.""" @@ -8799,9 +8653,7 @@ class 
NodeStateWeightedSP(object): def __repr__(self): """Return repr(self).""" - def get( - self, node: NodeInput, default: Optional[Tuple[float, Nodes]] = None - ) -> Optional[Tuple[float, Nodes]]: + def get(self, node: NodeInput, default: Optional[Tuple[float, Nodes]] = None) -> Optional[Tuple[float, Nodes]]: """ Get value for node @@ -8856,7 +8708,8 @@ class NodeStateWeightedSP(object): Iterator[Tuple[float, Nodes]]: Iterator over values """ -class NodeStateF64(object): +class NodeStateF64(object): + def __eq__(self, value): """Return self==value.""" @@ -9041,7 +8894,8 @@ class NodeStateF64(object): Iterator[float]: Iterator over values """ -class NodeStateOptionF64(object): +class NodeStateOptionF64(object): + def __eq__(self, value): """Return self==value.""" @@ -9083,9 +8937,7 @@ class NodeStateOptionF64(object): NodeStateOptionF64: The k smallest values as a node state """ - def get( - self, node: NodeInput, default: Optional[Optional[float]] = None - ) -> Optional[Optional[float]]: + def get(self, node: NodeInput, default: Optional[Optional[float]] = None) -> Optional[Optional[float]]: """ Get value for node @@ -9211,7 +9063,8 @@ class NodeStateOptionF64(object): Iterator[Optional[float]]: Iterator over values """ -class NodeStateNodes(object): +class NodeStateNodes(object): + def __eq__(self, value): """Return self==value.""" @@ -9297,7 +9150,8 @@ class NodeStateNodes(object): Iterator[Nodes]: Iterator over values """ -class NodeStateReachability(object): +class NodeStateReachability(object): + def __eq__(self, value): """Return self==value.""" @@ -9328,9 +9182,7 @@ class NodeStateReachability(object): def __repr__(self): """Return repr(self).""" - def get( - self, node: NodeInput, default: Optional[list[Tuple[int, str]]] = None - ) -> Optional[list[Tuple[int, str]]]: + def get(self, node: NodeInput, default: Optional[list[Tuple[int, str]]] = None) -> Optional[list[Tuple[int, str]]]: """ Get value for node @@ -9385,7 +9237,8 @@ class NodeStateReachability(object): Iterator[list[Tuple[int, str]]]: Iterator over values """ -class NodeStateListF64(object): +class NodeStateListF64(object): + def __eq__(self, value): """Return self==value.""" @@ -9416,9 +9269,7 @@ class NodeStateListF64(object): def __repr__(self): """Return repr(self).""" - def get( - self, node: NodeInput, default: Optional[list[float]] = None - ) -> Optional[list[float]]: + def get(self, node: NodeInput, default: Optional[list[float]] = None) -> Optional[list[float]]: """ Get value for node @@ -9473,7 +9324,8 @@ class NodeStateListF64(object): Iterator[list[float]]: Iterator over values """ -class NodeStateMotifs(object): +class NodeStateMotifs(object): + def __eq__(self, value): """Return self==value.""" @@ -9515,9 +9367,7 @@ class NodeStateMotifs(object): NodeStateMotifs: The k smallest values as a node state """ - def get( - self, node: NodeInput, default: Optional[list[int]] = None - ) -> Optional[list[int]]: + def get(self, node: NodeInput, default: Optional[list[int]] = None) -> Optional[list[int]]: """ Get value for node @@ -9643,7 +9493,8 @@ class NodeStateMotifs(object): Iterator[list[int]]: Iterator over values """ -class NodeStateHits(object): +class NodeStateHits(object): + def __eq__(self, value): """Return self==value.""" @@ -9685,9 +9536,7 @@ class NodeStateHits(object): NodeStateHits: The k smallest values as a node state """ - def get( - self, node: NodeInput, default: Optional[Tuple[float, float]] = None - ) -> Optional[Tuple[float, float]]: + def get(self, node: NodeInput, default: 
Optional[Tuple[float, float]] = None) -> Optional[Tuple[float, float]]: """ Get value for node @@ -9813,7 +9662,7 @@ class NodeStateHits(object): Iterator[Tuple[float, float]]: Iterator over values """ -class NodeStateHistory(object): +class NodeStateHistory(object): """A NodeState of History objects for each node.""" def __eq__(self, value): @@ -9888,9 +9737,7 @@ class NodeStateHistory(object): History: A history object containing all time entries. """ - def get( - self, node: NodeInput, default: Optional[History] = None - ) -> Optional[History]: + def get(self, node: NodeInput, default: Optional[History] = None) -> Optional[History]: """ Get History object for the node. @@ -9971,7 +9818,8 @@ class NodeStateHistory(object): Iterator[History]: Iterator over History objects. """ -class NodeStateHistoryTimestamp(object): +class NodeStateHistoryTimestamp(object): + def __eq__(self, value): """Return self==value.""" @@ -10002,9 +9850,7 @@ class NodeStateHistoryTimestamp(object): def __repr__(self): """Return repr(self).""" - def get( - self, node: NodeInput, default: Optional[HistoryTimestamp] = None - ) -> Optional[HistoryTimestamp]: + def get(self, node: NodeInput, default: Optional[HistoryTimestamp] = None) -> Optional[HistoryTimestamp]: """ Get value for node @@ -10059,7 +9905,8 @@ class NodeStateHistoryTimestamp(object): Iterator[HistoryTimestamp]: Iterator over values """ -class NodeStateHistoryDateTime(object): +class NodeStateHistoryDateTime(object): + def __eq__(self, value): """Return self==value.""" @@ -10090,9 +9937,7 @@ class NodeStateHistoryDateTime(object): def __repr__(self): """Return repr(self).""" - def get( - self, node: NodeInput, default: Optional[HistoryDateTime] = None - ) -> Optional[HistoryDateTime]: + def get(self, node: NodeInput, default: Optional[HistoryDateTime] = None) -> Optional[HistoryDateTime]: """ Get value for node @@ -10147,7 +9992,8 @@ class NodeStateHistoryDateTime(object): Iterator[HistoryDateTime]: Iterator over values """ -class NodeStateHistoryEventId(object): +class NodeStateHistoryEventId(object): + def __eq__(self, value): """Return self==value.""" @@ -10178,9 +10024,7 @@ class NodeStateHistoryEventId(object): def __repr__(self): """Return repr(self).""" - def get( - self, node: NodeInput, default: Optional[HistoryEventId] = None - ) -> Optional[HistoryEventId]: + def get(self, node: NodeInput, default: Optional[HistoryEventId] = None) -> Optional[HistoryEventId]: """ Get value for node @@ -10235,7 +10079,8 @@ class NodeStateHistoryEventId(object): Iterator[HistoryEventId]: Iterator over values """ -class NodeStateIntervals(object): +class NodeStateIntervals(object): + def __eq__(self, value): """Return self==value.""" @@ -10266,9 +10111,7 @@ class NodeStateIntervals(object): def __repr__(self): """Return repr(self).""" - def get( - self, node: NodeInput, default: Optional[Intervals] = None - ) -> Optional[Intervals]: + def get(self, node: NodeInput, default: Optional[Intervals] = None) -> Optional[Intervals]: """ Get value for node @@ -10363,7 +10206,8 @@ class NodeStateIntervals(object): Iterator[Intervals]: Iterator over values """ -class NodeStateSEIR(object): +class NodeStateSEIR(object): + def __eq__(self, value): """Return self==value.""" @@ -10405,9 +10249,7 @@ class NodeStateSEIR(object): NodeStateSEIR: The k smallest values as a node state """ - def get( - self, node: NodeInput, default: Optional[Infected] = None - ) -> Optional[Infected]: + def get(self, node: NodeInput, default: Optional[Infected] = None) -> Optional[Infected]: """ 
Get value for node @@ -10533,7 +10375,8 @@ class NodeStateSEIR(object): Iterator[Infected]: Iterator over values """ -class NodeLayout(object): +class NodeLayout(object): + def __eq__(self, value): """Return self==value.""" @@ -10564,9 +10407,7 @@ class NodeLayout(object): def __repr__(self): """Return repr(self).""" - def get( - self, node: NodeInput, default: Optional[list[float]] = None - ) -> Optional[list[float]]: + def get(self, node: NodeInput, default: Optional[list[float]] = None) -> Optional[list[float]]: """ Get value for node @@ -10621,7 +10462,8 @@ class NodeLayout(object): Iterator[list[float]]: Iterator over values """ -class NodeStateF64String(object): +class NodeStateF64String(object): + def __eq__(self, value): """Return self==value.""" @@ -10652,9 +10494,7 @@ class NodeStateF64String(object): def __repr__(self): """Return repr(self).""" - def get( - self, node: NodeInput, default: Optional[Tuple[float, str]] = None - ) -> Optional[Tuple[float, str]]: + def get(self, node: NodeInput, default: Optional[Tuple[float, str]] = None) -> Optional[Tuple[float, str]]: """ Get value for node diff --git a/python/python/raphtory/vectors/__init__.pyi b/python/python/raphtory/vectors/__init__.pyi index c9011c1357..4b88acf829 100644 --- a/python/python/raphtory/vectors/__init__.pyi +++ b/python/python/raphtory/vectors/__init__.pyi @@ -20,22 +20,17 @@ import numpy as np from numpy.typing import NDArray from datetime import datetime from pandas import DataFrame +from pyarrow import DataType from os import PathLike import networkx as nx # type: ignore import pyvis # type: ignore from raphtory.iterables import * -__all__ = ["VectorisedGraph", "Document", "Embedding", "VectorSelection"] - -class VectorisedGraph(object): +__all__ = ['VectorisedGraph', 'Document', 'Embedding', 'VectorSelection'] +class VectorisedGraph(object): """VectorisedGraph object that contains embedded documents that correspond to graph entities.""" - def edges_by_similarity( - self, - query: str | list, - limit: int, - window: Optional[Tuple[int | str, int | str]] = None, - ) -> VectorSelection: + def edges_by_similarity(self, query: str | list, limit: int, window: Optional[Tuple[int | str, int | str]] = None) -> VectorSelection: """ Perform a similarity search between each edge's associated document and a specified `query`. Returns a number of edges up to a specified `limit` ranked in descending order of similarity score. @@ -51,12 +46,7 @@ class VectorisedGraph(object): def empty_selection(self): """Return an empty selection of entities.""" - def entities_by_similarity( - self, - query: str | list, - limit: int, - window: Optional[Tuple[int | str, int | str]] = None, - ) -> VectorSelection: + def entities_by_similarity(self, query: str | list, limit: int, window: Optional[Tuple[int | str, int | str]] = None) -> VectorSelection: """ Perform a similarity search between each entity's associated document and a specified `query`. Returns a number of entities up to a specified `limit` ranked in descending order of similarity score. @@ -69,12 +59,7 @@ class VectorisedGraph(object): VectorSelection: The vector selection resulting from the search. 
""" - def nodes_by_similarity( - self, - query: str | list, - limit: int, - window: Optional[Tuple[int | str, int | str]] = None, - ) -> VectorSelection: + def nodes_by_similarity(self, query: str | list, limit: int, window: Optional[Tuple[int | str, int | str]] = None) -> VectorSelection: """ Perform a similarity search between each node's associated document and a specified `query`. Returns a number of nodes up to a specified `limit` ranked in descending order of similarity score. @@ -87,7 +72,7 @@ class VectorisedGraph(object): VectorSelection: The vector selection resulting from the search. """ -class Document(object): +class Document(object): """A document corresponding to a graph entity. Used to generate embeddings.""" def __repr__(self): @@ -120,11 +105,13 @@ class Document(object): Optional[Any]: """ -class Embedding(object): +class Embedding(object): + def __repr__(self): """Return repr(self).""" -class VectorSelection(object): +class VectorSelection(object): + def add_edges(self, edges: list) -> None: """ Add all the documents associated with the specified `edges` to the current selection. @@ -170,9 +157,7 @@ class VectorSelection(object): list[Edge]: List of edges in the current selection. """ - def expand( - self, hops: int, window: Optional[Tuple[int | str, int | str]] = None - ) -> None: + def expand(self, hops: int, window: Optional[Tuple[int | str, int | str]] = None) -> None: """ Add all the documents a specified number of `hops` away from the selection. @@ -189,12 +174,7 @@ class VectorSelection(object): None: """ - def expand_edges_by_similarity( - self, - query: str | list, - limit: int, - window: Optional[Tuple[int | str, int | str]] = None, - ) -> None: + def expand_edges_by_similarity(self, query: str | list, limit: int, window: Optional[Tuple[int | str, int | str]] = None) -> None: """ Add the top `limit` adjacent edges with higher score for `query` to the selection @@ -209,12 +189,7 @@ class VectorSelection(object): None: """ - def expand_entities_by_similarity( - self, - query: str | list, - limit: int, - window: Optional[Tuple[int | str, int | str]] = None, - ) -> None: + def expand_entities_by_similarity(self, query: str | list, limit: int, window: Optional[Tuple[int | str, int | str]] = None) -> None: """ Add the top `limit` adjacent entities with higher score for `query` to the selection @@ -237,12 +212,7 @@ class VectorSelection(object): None: """ - def expand_nodes_by_similarity( - self, - query: str | list, - limit: int, - window: Optional[Tuple[int | str, int | str]] = None, - ) -> None: + def expand_nodes_by_similarity(self, query: str | list, limit: int, window: Optional[Tuple[int | str, int | str]] = None) -> None: """ Add the top `limit` adjacent nodes with higher score for `query` to the selection diff --git a/python/scripts/gen-stubs.py b/python/scripts/gen-stubs.py index ee83330014..b973b596ae 100755 --- a/python/scripts/gen-stubs.py +++ b/python/scripts/gen-stubs.py @@ -15,6 +15,7 @@ "from numpy.typing import NDArray", "from datetime import datetime", "from pandas import DataFrame", + "from pyarrow import DataType", "from os import PathLike", "import networkx as nx # type: ignore", "import pyvis # type: ignore", diff --git a/python/tests/data/btc_dataset/csv_directory/part1.csv b/python/tests/data/btc_dataset/csv_directory/part1.csv new file mode 100644 index 0000000000..b36f39434a --- /dev/null +++ b/python/tests/data/btc_dataset/csv_directory/part1.csv @@ -0,0 +1,51 @@ +block_timestamp,inputs_address,outputs_address +2025-11-10 
00:28:09,bc1q40a3a5xlwv0l4n4xs3mpvauscgkj78rcuhvrhs,bc1q9xh9exh34tq566nfvpwwupum6efh3yqc0aja5d +2025-11-10 05:47:18,bc1q2zn0548tc0njq9xxt0uamxxh48s8t7c9dw0a45,3K1t5MthuwcyNn4BWHp4eFSyTY8ifDjTkk +2025-11-10 12:17:32,bc1qxsp4nqu7zq424wdz4v7c6q8jylhtlskap86kpd,36jsyuJHY2Eqs48yjUb6xyzARSPYKKSrTF +2025-11-10 00:28:09,bc1qvpah3cjguuwmjjgwp2g9t52f786jnw7km3jgau,34WkZeqyNBb5FXMxLZJfveHNm2kqngt4w4 +2025-11-10 13:42:54,bc1q6xku4eta99267etxf84yy3qc2pyw04pq8dvqye,bc1q6xku4eta99267etxf84yy3qc2pyw04pq8dvqye +2025-11-10 13:42:54,bc1q50uxvgu5paqj22fg27s352wggev70xs9f8lhnk,bc1qxgxf44zkxrz8ld33kjvf85ekmmkcaexs7xkfyg +2025-11-10 05:47:18,bc1q5v45pufhugrqmp9a7taq92y4uqu8awpqcsrjq2,33bdcPeLq1Mk8TrHZ6sPwjpNbfHf6Y5UCg +2025-11-10 15:38:35,bc1qhkar0wasylsfhwamp8d023e2tqzcjfzjlx6wrd,bc1q4mhkgdfmattuya75rescq5u3ms4u3m56x8nzcz +2025-11-10 18:01:50,bc1qw8wrek2m7nlqldll66ajnwr9mh64syvkt67zlu,bc1qxwea59rqnldu5hl2zg3pt9c0av7eee4dfjnj7u +2025-11-10 17:59:43,bc1qn5yd54kvaa60jqw0ncx93ru7ctlk0727653ynz,bc1qehhvcqru3k3qfn95r9w3nf8jxnlflnw4d9duah +2025-11-10 15:34:09,bc1qr7c9p424qed5mqy33luxpu35wmxdl8vd383xze,bc1q26h4h4v5u9m0nk8mv9mn2pfpmcyemeuntee8xs +2025-11-10 20:29:45,bc1qsaqmj46l3jl42vczyqa34fqukdwdxwndj3f5ah,bc1qsaqmj46l3jl42vczyqa34fqukdwdxwndj3f5ah +2025-11-10 20:29:45,bc1qk4jzsk3qn6dra7re4cs4tjezhz4udr86cvgs0x,1NoeLVMHeSC9gvw1FdAnmHTNrHX9iTNXoK +2025-11-10 00:28:09,bc1qlmgaecwta0h2k3wc0yx5gfeagyj2u4kq7p525x,bc1qvrwer4355chxf9zcyd8lweaesk30r797f96v7v +2025-11-10 20:57:25,bc1q2dshq3fv3nxpu0h5t06a2drcpl89u9xvncdena,bc1q6vqedrdgh9j97jp4nhamyjqpf27rtldwr5zm07 +2025-11-10 14:59:57,bc1qf4g86wre0gmzxxh2c0p5892ne4rdrlxe28pazd,bc1qvmf3qpe3lt8asy8g86vxvjnzkn366gn2q0qyly +2025-11-10 18:01:50,bc1qex279aerh885fr62m0lnv3htvja2kw9km9c2aa,bc1qex279aerh885fr62m0lnv3htvja2kw9km9c2aa +2025-11-10 00:28:09,bc1qdj7cz87pd3st0ah6yhy5ltmv423ltd0xxycn7r,bc1qyn66t4wue704rz4r37mye65hsqypg0juq89068 +2025-11-10 14:56:34,bc1qjpdqs9fjgapv305p2w7lxh7zk49rjjrqzwqguf,3QNQ49CBeC63Z3r3C2D3s5grMVKZQTLQ1J +2025-11-10 17:23:00,bc1qxw0vxcnwta0n8q4s7apjtsdc72uqc8dt5wvraw,3Hs4uUpc2ntg7ajWjoiHBEksU5182GzDft +2025-11-10 13:42:54,3NtA69Kn6Z6w8A7eQNXyz5ijYZxnynvT7N,3QCCr2s4NHn2BR2oTKNcZPFGEf5cjt4BDP +2025-11-10 17:59:43,bc1qcpl66gg9wpyj4g9umh9ye0mtsgtzmmlmrcjpr5,bc1qdkwxe7cuq0z35tfanuavvmpxrkqlhy72esnj98 +2025-11-10 17:23:00,bc1qlrex5kmzc4kdvwjl0j7sm2pvewvwaylrnrlns7,bc1qs2en2wud3q99wq7ykwdwrwzkpxfluchycw6wvs +2025-11-10 00:28:09,bc1qhazv3u50uvv5lw83sgd2dnlt80j2fn2j39h4rz,3CFP97ZCKiy6dmv2wRuJtUUYZcGMXjzfKA +2025-11-10 17:23:00,bc1qch5udvlvm9luneywcfjxjfvy6wuy6hysku8vw7,bc1q3qsupsucu6sse2uwpc2hdysf3mu5wfl9rnexvu +2025-11-10 14:56:34,bc1qenk9rwm8phgfy0w9zl2fdsejxhxc86sm67y5we,bc1qzclu4epvxu9e6a5c2mk650fzdw4626352lyf2e +2025-11-10 15:38:35,bc1qxgjzdgp0yrh69llsgepm0nfrj2h0l9cjvk9p7w,bc1qzttelzj04e87qcvfujke7wdtfldug4zmugmwh8 +2025-11-10 14:59:57,bc1qdyr4723zc8azryex3weyspdhhdptlwurfzej5v,156kgpUXXggfZeyFdgxxs6Eouok9MscqZr +2025-11-10 20:57:25,bc1qsc5q87uem04xzedure8u90452hgx8h00y96ssa,bc1qmcx2377el75jxxg8y5673srdgj7v0jgjn4y0u2 +2025-11-10 14:59:57,1GbUhB5Aub17yjLaFf1fPNFN8icPzXn3sU,bc1qsj84dmnupc25zhecdxfhqnhtg4g63w8h0q260g +2025-11-10 20:29:45,bc1qpsys7sfk5u7ue3lffwzszzvffhtku78kr0vva4,bc1qt67jql0p02yvjncgqvfjzwnc2fwhjszm3d9jhmgsnkpxkaf3j6sq6cdn7c +2025-11-10 18:01:50,1NKKRqJDwJjdnYL1Wn7QsUjxdDxqJAL6vG,3GEeyC3sDSDQYMnsfYdMUJGXpGoXnz1Q5B +2025-11-10 18:01:50,bc1q77flh60ngm58qzpuhcvuxtzp2z6qc3ww52udm3,12tPKMdmLCReFCjLiif7bng5VX8G7WTHjw +2025-11-10 15:34:09,bc1qr7whs7crqjpqshv7qk9ex4742wagf9gnznjulc,bc1qehv33h4ez6drn55yqwa0v8mwzpzcgqsu2gkejyy3e24m4w2az2jq0mcke9 
+2025-11-10 14:12:11,1N6iknsjUx9R9JiarC136TVCJzUSpjr8V5,37exsSwT2HfBqgeM68kzKXvG4aJXW6ERnE +2025-11-10 14:12:11,bc1qtz6tnlusht3la6p6gnjfgd3ad6zneuqk64k5j8,1HcmTmdkbmVbuof1vurum74PMcmC5Lf7PM +2025-11-10 05:47:18,bc1qm34lsc65zpw79lxes69zkqmk6ee3ewf0j77s3h,bc1qq4e2xm40vgmz79rrh4l3zrrf7eg96l2q3tw4kv +2025-11-10 14:59:57,bc1q9r8pgy2xquxj4ph8gjatle587dsvqwjrmyflx4,bc1q0prfw6u83g4pj9vyzpzgxpq5u4xtn3kj8ltg2a +2025-11-10 13:42:54,bc1qemdd6pq7qar7hs33jvvsv4tlt5edasxk996hda,39ohXNyW4mXo652kA5CzsH87HDwuDmmNuY +2025-11-10 15:34:09,bc1qmzhda5986ssuf45x2qzymtuxlc55qs7ddds7ae,bc1qwujtjmpk7eavveax6e2crygzsz59pl7qnudp8a +2025-11-10 14:56:34,bc1qzkunkrkz6q5ut6wwcjktn3fshgshx686d0xqtg,bc1qpzuj9jl3gxarhdt6rzxynk0cs9a7vw0acp5chf +2025-11-10 20:29:45,bc1qnlp6y2cxtnsm4u6xfn6yjvkvjp4u8x08fsp4yq,bc1qkc2quddu4nutdn7p0gcz4kpenex6ukxqjmxj62 +2025-11-10 14:56:34,bc1ph6td9w6mfvj0s00vj3240pclurgmeenw4a2c84vppnswufwmp5mspmp28a,3MP4GcjaoPcFunef59ZxtjNkwb3gEw2ZPS +2025-11-10 12:17:32,12RVBQfdCa4ctMCFrwPGKz1iLSs5wwdyoV,bc1q2nalwjxe947fe4zl4rmd6g97vfrqn3af5ema38 +2025-11-10 20:29:45,bc1qqf89nhg5pkrmzafsranjn6rku5nv484vtagfuy,bc1q8pkcam0ut9e3c2azzj5292q4tdszztqn7mrxfq +2025-11-10 17:59:43,bc1qns9f7yfx3ry9lj6yz7c9er0vwa0ye2eklpzqfw,bc1q8gelljdlq5dyg20hsacga6my0k462k0muewp64 +2025-11-10 12:17:32,bc1q2sc0phtfha7n50sd250g6gpx7w36y6hr75cdd9,bc1qvdsw7yv7e6pz0p8pxfweqhu0qhj89tvyc0wv4j +2025-11-10 20:29:45,bc1pw86pslqfvc6hhxalwjcsdp24te6hnyfxhvckkf8fjmf3wlp42vwq9yqwtt,bc1pykfp9h5x7umf3ksrgnff44k5yen7kps5gz2ee8znkltp7udth84sq37w06 +2025-11-10 05:47:18,37jdMXYbvg3dKzJ4pGSYiABiXoBy4putZq,bc1qrs2d75werh8nmvr4makktpyuu4l7uex9yr05ja +2025-11-10 15:38:35,bc1qep5xck85tsry58d36xxvdpwwh3s4vm04v7t4cw,bc1q95lyl6wtxs5zt7dna9qwjfvg3k84n5z87fuyuc diff --git a/python/tests/data/btc_dataset/csv_directory/part2.csv b/python/tests/data/btc_dataset/csv_directory/part2.csv new file mode 100644 index 0000000000..b606b24e44 --- /dev/null +++ b/python/tests/data/btc_dataset/csv_directory/part2.csv @@ -0,0 +1,51 @@ +block_timestamp,inputs_address,outputs_address +2025-11-10 17:59:43,bc1qthv7smt28rt2ctyg5ec3vkvx0hl4ckks9m75c3,bc1q2g6nxqvv2s09plap4x4j6pgnc2y42lmjy8fpxf +2025-11-10 20:29:45,bc1qt9nvg8mwuqmjlruc8z9m4kns2hd3gk2ylsdryp,3JDMdeEtsEfnABR6AYaHNU8fy9B7uDJduH +2025-11-10 12:17:32,bc1q4996ykxqxnhey7sh9fzvn3rre3mswta69myft7,bc1qlxsa2yndh2xt4rpmw5pt7eqaelkzd54psvx4xa +2025-11-10 05:47:18,bc1q449ruh9ga8pzd4uzds83zqckwp6yqxfvra7fvq645ev7sqefj9jsdkpegd,bc1p5xpw59psu6yyzmee8s8ptqy6gvpfls0zsegp5krw0x6dp2mtru2qcxq0tk +2025-11-10 12:17:32,3FXf66gb6NNMA52WSBBjJyMmJ3t75hLRDM,1EHF3NhZPLWEaEppSHqK5AVGB8zX5xTXEm +2025-11-10 17:23:00,bc1q4pmrg0q6ywlsd4vutmmv65g7n4mgp7tf2hz9gx,3Hqj4K5kofSnRa8MjJPt43yw5gd5f5sdKG +2025-11-10 14:59:57,bc1q3aus5ka7pq2wcjuesxqlaqwhxgckxxlx6t6ejy,bc1qwl8jvjvkyeun7wj09xkcpwt4g7sqajlzush8j2 +2025-11-10 15:38:35,bc1qh2cwxv899psevx5yhr9cvuvqpcwqr3ej4ns87a,bc1qkuwtp76a89y40nvp6235xh9u5rpumv0arpm8rq +2025-11-10 14:12:11,bc1qa7a3ndesd0n2srxzdhnkksu0hkc96rp5uyyq95,bc1q3e0xr3pqz0rad4uakctg7j2crnf3v53na9c567 +2025-11-10 14:12:11,bc1qvxs7r6w2z3aguqzdecql7t5vkr44q9gccgetyn,bc1q0qdfdjjp9w872r406tlff34tdy9wa7wvdgutd9 +2025-11-10 14:56:34,bc1qsw6zezpduq9985vxs48uhffv82jzxgl8rm5mpd,bc1qsw6zezpduq9985vxs48uhffv82jzxgl8rm5mpd +2025-11-10 05:47:18,bc1q2alxcv39eku3cvctdudkqf2y2fvqu2t49wlrj2,bc1q2alxcv39eku3cvctdudkqf2y2fvqu2t49wlrj2 +2025-11-10 12:17:32,bc1qm34lsc65zpw79lxes69zkqmk6ee3ewf0j77s3h,bc1qkqd2ewnd8mw358qw24vmwgepz8n8cta0y2pe97 +2025-11-10 05:47:18,1AGD42b91ycvAcMrCCQ1G5iiBZooPP9xqX,1AGD42b91ycvAcMrCCQ1G5iiBZooPP9xqX +2025-11-10 
20:57:25,bc1qmv76fxychuzcuvt3uy5uy3seu6jvxrsfz4993w,14s7DCukciJLAFJsYDzPK7ZfFP2thzU1Yt +2025-11-10 14:56:34,35C2L1pCgwzBHNcDcVL1a5RuoefeWqyjAR,bc1qhfsad8wuvke9eczj5g4e287hz93g7t8nwn9gxj +2025-11-10 15:34:09,3HYjQZaytMNixSZwU6Dkd7QBQhs2yFcp3E,14BuunTtFx9HMvgJzJQnxR5VNgjXCKckSv +2025-11-10 21:05:57,bc1qx54ge94tzqtfjzqy0dm9z9q6yqdm75v3ed8g39,166dELRZqWwRjXXVdbahWLi7muS9A2jjJv +2025-11-10 14:59:57,bc1pthxvxwdparr9pg5s9dxl4s4u3f744qwn9atl3gcltvjx76hyg4kqx230gh,bc1pthxvxwdparr9pg5s9dxl4s4u3f744qwn9atl3gcltvjx76hyg4kqx230gh +2025-11-10 14:59:57,38Vfgc9RT5EptuLMQTrnJ65eZgQtDpneom,bc1qeq2flx8mfj3gn3j58uwaj5u2xzpt8qmvet46nl +2025-11-10 14:12:11,bc1q7cyrfmck2ffu2ud3rn5l5a8yv6f0chkp0zpemf,bc1qx0m6mzl4756vwg23jxkdpew03wwfze856phhxl +2025-11-10 17:23:00,bc1qkqw4qj5gplgkns6ysg6xd6986a4xrwpg9fsy36,bc1q9stxzl5x02rrq0cfmlfh4epumn6zvq6asa9n0u +2025-11-10 00:28:09,bc1q6nlqnu6jad5fygq7v5kua2zpr2h5x288szq5km,35AtUZgvWh9uAHhX6fMidJcbVTJTjC4Mcs +2025-11-10 14:56:34,bc1qm2safrrv8e6696sgtksv36yg066qcafc39mu3y,34Yq1C3TS1V5S8w3CNx3SN2W3CdjoGu9QN +2025-11-10 14:12:11,1GbUhB5Aub17yjLaFf1fPNFN8icPzXn3sU,bc1qw3qld7muf6feg5jlypcdfrmq4hzanx92xagswe +2025-11-10 05:47:18,1NfJSiqBw4fb74KgVQrPsk5W5aqitAD1Xv,1L6yJi7TcjztgX7W8ds1zz2NGEnT7GdoAz +2025-11-10 05:47:18,bc1qd9gg9qhkswtp26kujvc29zc6daj4yzv6qsgur3,3MqsziNKsuFwM9unjhAsKYZCQ7mNfYTSv5 +2025-11-10 20:57:25,bc1qdvdt8vjn4lpep3hr6kr90uwmjr7rf906ac5lxn,bc1qqz5ll7nwghwjg9e9d0wjpx54cs5rc6uu48emtf +2025-11-10 12:17:32,37jdMXYbvg3dKzJ4pGSYiABiXoBy4putZq,19WpQ6KYi2SGKRnsgkeX5ALgZjgSMPiqmu +2025-11-10 15:34:09,bc1qz9t7rmkf3wn3yuz48e0fhg8dvzf7hsmf3a6at9,3R1Ku4KBj6y9ekKkCJXnrVkUzGNjZtEXah +2025-11-10 15:38:35,bc1qmkavqx59gg4aucmctctww3nve6x9keln9s07zz,1HzijFYeyDDUKym1siU5hBiuyu9WVRPcVj +2025-11-10 20:57:25,bc1qkplg5aqltwln2ks4shezddemdffh9pmt73xnpd,bc1qgx3c07j5enqz0pfwpkchpx6cwh690zst9l4dz0 +2025-11-10 00:28:09,bc1qt2xxqsy0tnvcvml47t4ugrvm8p8h9skkv886uv,bc1q2sa6jadt6gry89778csh2lmu2xaw2javvq9srn +2025-11-10 20:57:25,bc1qlpygchhl3j07mh9z7gcsqzjfapyyurm9amvhh8,bc1q0z4v89tecuy5e5cr3hdj6ts8zz0ky79mmattsv +2025-11-10 21:05:57,15QKr87rKdX8g7kmJ2DBWEgbWiGrnxBTnM,3Ae1fKTuLvPNTqjAsuXznf6cFySEtcArho +2025-11-10 15:34:09,bc1q2z994nye47fnwxxy4nwukfg9kkq0m5xe7hxpq2,bc1q4saygf3hk4cl0hej8e5rr2wpdza8zga6fqshqt8mrlzgpt348chqgs532d +2025-11-10 20:57:25,bc1qagt6ng896jhghzuxhqzrcmkn0nq7tzpjejghgz,bc1q3rqkcktkqnem5yfzmrzya289m2xjl9vl37fgmxdwkhl3n7f9l35qk2u36p +2025-11-10 00:28:09,bc1qwthqxlv39qwxt6j5c5zdhxprjfkpy4qgre6nvl,bc1qc3h9tdncgalkv3yeaw5fxsn0ktdavxmvzmcm09ge4k59psxxx60qmwcmx6 +2025-11-10 20:57:25,bc1qlll42hhmtn7pwz6srexps6v8zm2tqp2p7tx4pt,bc1qzheza7hkc4jyp67vhtgrkaxpddwl3z0eada6fe +2025-11-10 14:12:11,bc1pah7lzc5rcms23lvnsrsj68atcagcg25j6kzlk7aytjtludwrl5lqclvw3t,bc1pmspgjsaxqfdvzkg7du7sgpclldwf0fajrkn36qgkrat99ugjz2ushjvq57 +2025-11-10 18:01:50,bc1qmgv6leqa0rpu40ycpndxg9hggrqdxcdgtmvp34,18N8SP9Ui6WKLtDWks6DfwMMMeJT4W5TDB +2025-11-10 17:23:00,bc1q65h44f9taql5ew5nzd2xzkmq35wcag2uugvj82,bc1qhzvug7trqm7n36g5azdqa968rl9ww5rvwzt69w +2025-11-10 20:57:25,bc1qrgl0yu3zuglvcgrdsglc5a8zdsud6n023naqaz,bc1qlg7shfvat4p0fg0n9rzap3qxryu45sljuxy4qp +2025-11-10 14:56:34,bc1q4dd592zay7mpw9arzj02pr4hr6e5yj603r29sy,bc1qepmkx8n0flmw3qq9puwwj0rpeq4atxe8tktef2 +2025-11-10 17:23:00,bc1qp68ctlw77lkyqjyxfn200ergk079jqpxny4yj2,bc1qp68ctlw77lkyqjyxfn200ergk079jqpxny4yj2 +2025-11-10 14:12:11,bc1qa3tac82ncmpd34cy2g7a0xvhm2eh9wec5dcnl0,bc1q6f3kdsu9jxvk4uv62nzmdfcmy0pu6k7y7p7w3f +2025-11-10 17:59:43,bc1qdkkaprszrgxmn6umkrs7fzufpzrefdzz7p5heu,bc1qc2qw3a5m035jcaughewp8w2mst0w788fym5fmn +2025-11-10 
13:42:54,bc1qryhgpmfv03qjhhp2dj8nw8g4ewg08jzmgy3cyx,bc1q08hcxrtxl28erd7tmevja57u8af76at6qukmfu +2025-11-10 14:56:34,bc1qjq2l9469dqklhejlvkr3va9qrd343mwrtzu4v4,bc1qlnrcexfn0z6n64zq8jj7ulful6y76gs5yvme3w +2025-11-10 20:29:45,bc1qkhl6epe73jec02jv3sp2lvdfhppyg874wxs9l9,bc1q7t4vyehjsexdme84qhdgd4dawcn54djh0m78fz diff --git a/python/tests/data/btc_dataset/csv_directory/part3.csv b/python/tests/data/btc_dataset/csv_directory/part3.csv new file mode 100644 index 0000000000..a4311629f1 --- /dev/null +++ b/python/tests/data/btc_dataset/csv_directory/part3.csv @@ -0,0 +1,51 @@ +block_timestamp,inputs_address,outputs_address +2025-11-10 21:05:57,bc1qedynrxg6cslv6x09mf08xl3k2ezxj44dnf5lql4tfjtltl5csr5saaxpq7,bc1p3szdkat5d9p5chwy5jre2fz6srsarmhl2vset0v77wd92pz3n2hs5ly07y +2025-11-10 13:42:54,bc1q3vm68x9hgdftwhwx6fxv39l0l9tz3l3vvpln0c,bc1q3vm68x9hgdftwhwx6fxv39l0l9tz3l3vvpln0c +2025-11-10 15:38:35,bc1qul9vy6f3lvr70tmnkpk26gv07vc2j7m8fghu5g9vmlr94w36tqksasw04q,1ysUBxNE2bwbQSDncJHWpuAhK9fdPUTb8 +2025-11-10 12:17:32,bc1qm34lsc65zpw79lxes69zkqmk6ee3ewf0j77s3h,bc1qjccwa37c4eckaj7sym6qc5933g6650yawv7uu8 +2025-11-10 12:17:32,bc1qv4km03y5kmpqan8hhytj2ysw7a9uw3jnz53f7n,bc1qjwm26n2yavglfe0l5335ffkhgla95z5wcfzshc +2025-11-10 20:29:45,bc1p72yx4zjhrrzes32ful3nwzxkteukyt4fq0ulkncpj8aaj32yt80shukvy6,bc1q85ywegp3yn7c7nnavgkfv0gjgsvaz4p3yfck62 +2025-11-10 14:56:34,bc1q5v8yfeevlwrxr6hhf9fvtm3gs39k3rup3x48af,bc1qvjf34jq086xt6k8dlkt40h8sh7qpq4z0rn5g95 +2025-11-10 14:12:11,bc1qsqcwf6pg4ke8ahmy78fvwadseav2lgd3rxj6dc,bc1qf63zft4rmknjpefue92gqnkqppk3jvtty53nr7zanrqd0kathhqsjv8gfz +2025-11-10 14:12:11,bc1q58rmazkczke70g9gwsggpnq9rarnn4mxchmnzq,bc1qzp7dh2r0xkyw4menu7th9k8nk3qp540myml243 +2025-11-10 12:17:32,1r2qgPBgdMNiNvUhWSSD5dAiqNb3pZDDq,bc1qkfaejemkvpze78lw86cwhpnfr62q30vzycgvx4 +2025-11-10 18:01:50,bc1qk35enqhuhtcfslw3h6zguaed4lv5cm57npd6l2,bc1qarwf6apwh52pngnpnlrm8dpzkdw0zvchxz4w7g +2025-11-10 21:05:57,bc1qmy4v6rlj36aal9yu2jfk4k43ef7gfzdvu0xe23,bc1q3r6vhv6r03qgh6jkqns2nls3yzpssagw3st4d3 +2025-11-10 14:59:57,bc1qhaper2xq6ajm4u2398vsxc0x74lqx7ufmy2c0varj8y5usfyx0dq6ndfnf,bc1qvh6jqvnjas2e3ftygrg465r3y57qvy94a8mqaxakanmhwzjmfz3qjmgq7f +2025-11-10 05:47:18,bc1qm34lsc65zpw79lxes69zkqmk6ee3ewf0j77s3h,bc1q9wf2z7rfa5a4uyng2d3lkqpr06ugvsn2grm6r0 +2025-11-10 00:28:09,bc1qv6xz6gvg7654exxm2m65qqyj56a9qayxkvn5yh,3BNb5kNf69bDqU8wRGUXbEUK6WwzwiE2n6 +2025-11-10 14:12:11,bc1q5pa50wkmfqk7zjh3sndq5n95he3ehty5vmlqex,bc1qs7qhzl77j5fd3s0x098ga8uwgnu3jc9ucf3lnjkre9tr70tjg3gs6vlr3p +2025-11-10 21:05:57,bc1q27gjd4z6prxzhd7kw505p0qs58vn49eahhvsft,1FmsCG9dHV1ea3CeyayphxreisWTFDmwUb +2025-11-10 21:05:57,bc1q6mtzzhl2vffy90cmux747dj4v7lkhu9rvfnwh2,bc1q6mtzzhl2vffy90cmux747dj4v7lkhu9rvfnwh2 +2025-11-10 17:59:43,bc1qhp3yg84skqr6f7m43junkhpzcxv02u0uzwyfye,bc1qmtd34z56we7qgdra7k9zu3zqzne6l2z6x6k83u +2025-11-10 17:23:00,bc1qdmyqv5p4e876y3l76c05a636zzg8jam7rcfg8v,bc1qxslzk2hya95w23u3vmzjtxv80lspnfy4fdpx7k +2025-11-10 00:28:09,bc1qamgjuxaywqls56h7rg7afga3m6rgqwfkew688k,bc1qul5ls7sawrl8vh7yt729esad26s47xcz7llz4s +2025-11-10 20:29:45,bc1q4vxcxw7mpg9dcryqu0kav8awrn7qk5e6wgs3hg,bc1q7t0k8uez6zkcm7g0vsqq47kmr8x9ye3xvtawywvqwt7zafnsd6aqhpe7a7 +2025-11-10 18:01:50,bc1q5s4qwh68655xguj43ppxgwhh2r05z3ggx8prfv,1DExp9PkcKvDL4HAqbykBctzkDHWiLE6Mi +2025-11-10 17:59:43,bc1qwx9tp5jmzz9a3mzafnx2htp7x0ueu5q62mxnyn,15WkUpnBrCoRK5v3Phj4aKoMjeBHo4ix8E +2025-11-10 14:56:34,bc1p7mx3edp7h05s57p57cj0wd4jvqvutaw5szwklvruep8sts8xe6uszux6sv,bc1ptp8fyw450mz85a5tf09vy4thw56q0qw0ntm83tvd6ep3hzje52yq0jt965 +2025-11-10 
12:17:32,bc1pprfs443mkx3q3902hgetdkq5rg64vzmn6emnn5f6saxxfe8uk8vqlt7a2d,15E8oCgYt5bJBaExy18h2wJdm7vkjRHKv5 +2025-11-10 20:29:45,bc1q3zph9r2h0e2v29s8phj57u4sd5l2y6pu48a945,bc1qhghnchwd5j5av0vslgmz76l598zr3v8yrrcvaq +2025-11-10 20:29:45,bc1qt3x6hvy29mwsshka8d3zzp5ylf4yjhc0jvlljp,bc1q2x4xyl2fvqnmvy2vt6u24e67lvkmq6hxqkkjte +2025-11-10 15:34:09,bc1qnfw0gkk05qxl38mslc69hc6vc64mksyw6zzxhg,bc1qsqfl939qdz0vpwkynqvawntv2gzv2394wjp8a8 +2025-11-10 13:42:54,bc1qcjqyhhd2mlgm2x76rw7pfqesqxae2kkkp9lrek,bc1qhm8d7ece35rk9umwd4d2myk36rnseq8wwmaury +2025-11-10 20:57:25,bc1qwlwvahdkphj3rwwc6ln6ww8jkdy7lw4vw3ce5z,bc1qstl923xpdarm44573p8jpeh7lu2mp2qztp5pcx +2025-11-10 14:56:34,bc1q760uuh9m67gdnjq594ywkyf6m9axutmxuy5xpr,bc1q48rrl4cdyujdls96yse82e5l7y7lmsyfnjfy6m +2025-11-10 21:05:57,bc1qn3c2efvw6vmwqtmqf0tv45pv7g6ftrms9yhrd5,12sxYzAsmziEjtAvKkxqE4w1uvDaTUXwXW +2025-11-10 00:28:09,bc1qegxth00stfmkuneeg4me6zlvyyl904u0lcaz2c,388JzjgiUn8JDgzaWTHxTZ9UbF3jPFMJg3 +2025-11-10 17:23:00,3H8XNZk9YwcpUAqRYPLcoVNMH3o18UrYhQ,3GKVcNr6xnXmyEpVu7h8kEusryGfy16bU3 +2025-11-10 17:59:43,bc1que6q8d3wt5e7xz6x8qtp30euh70fgvkhyduwqf,3HTtPcoAUpq1Mma2mE5mHUrLDCXahJg9Jn +2025-11-10 20:57:25,bc1qh8aq5wxlrq94m467gd8l40rv3v6ja98fe00f3w,bc1qrh7f96680090mup2njzlasmwlex50l2ntjur4u +2025-11-10 15:38:35,bc1qs0chk7re599jqdr7z3vpsftc9ut7du7scxnyrr,bc1p2ncdhtdzf28enksd2syp4033atm4jq6j9v2py7uqe2n48jf5jelqe5jycq +2025-11-10 21:05:57,bc1qyujm66rpnnfnn6zkm329gj75r6e73tk6zv5j50,1PMS23kYZv4VT3zPBSG6j8w4tjG9VVLr24 +2025-11-10 14:59:57,bc1qsnw4rsdg6tgrnrnxvjq5qnyy2j23w82um58tw9,bc1qd6t23tfrg82zpapv0tepvsm59r030mafdqx4pp +2025-11-10 21:05:57,bc1qr4v4nl2ay04tt2uz6lscy47wj277ntf8cee9rj,3Miq328XhYZVgKv8vbkxF4KxbGrFPnccA4 +2025-11-10 14:12:11,bc1qp74l26w7t27r4jedcsx3lcwn50cljwtqs0yan3kjhx8e42py26uq35f4j8,bc1qhh5ju5mu7w55p4zz3sfl938xuqu6zhdpnx9rlz8m3umqz8vkfltsmfeyt8 +2025-11-10 14:56:34,bc1qu4yaxft2tzuva4x2tukuwuus5d9hdzj02ttxnm,bc1qng6tdu6sc2aw4x42lmg6ntqtxd4ut43357dhhv +2025-11-10 05:47:18,bc1q4euuq7lcs2tnt8fgdxaylm8034q7snaekt7p24,3FN1CiruvagBhricdJMiySa1vD9h3emN3g +2025-11-10 20:57:25,bc1qy9ed5tc7vhs9dkad6hhkw8t4wh56jyg0czxmk5,bc1ql3xnnz4kw3wr080zrk7wysmzjz7sr3rjspt82a +2025-11-10 17:23:00,bc1qa5j77xexgsl46ahrzeuktv82g86qdalxn6e3at,bc1qvhket3vgyyy3yulclmr8efq4luwvy90a4qwjle +2025-11-10 20:57:25,bc1q2zsrq6vhkkzyr35hzkjgur38nhpj86jmez92cs,bc1qeldfg4ytnzsjk553mr2cdlk66usfz22pyz67dh +2025-11-10 14:56:34,bc1pseekhxtjhmvs0xlc5mzmqs0dl8xcjek26ujwpg6hqhkp3gc50f3qjuhlxp,bc1pv2tdl4x285n96a5e9hh36mt9xw098ktqz5hnywe86cph0pplwaqs56yw85 +2025-11-10 17:59:43,bc1q2gchdrgwlh5tzwm0spph77a7mj5pq0rc38aahk,bc1pcsc7utumn86cklrkzyusxdgc9e3fwj6mxu9j327vwts0cv8e2a3s92ddg4 +2025-11-10 12:17:32,bc1q0qfzuge7vr5s2xkczrjkccmxemlyyn8mhx298v,14pXPZVfyeL6gxKZDVagbQnxQhXMpM2Thb diff --git a/python/tests/data/btc_dataset/csv_directory/part4.csv b/python/tests/data/btc_dataset/csv_directory/part4.csv new file mode 100644 index 0000000000..a529f74dd7 --- /dev/null +++ b/python/tests/data/btc_dataset/csv_directory/part4.csv @@ -0,0 +1,51 @@ +block_timestamp,inputs_address,outputs_address +2025-11-10 20:57:25,bc1qhcntczjrk7n83736ww45zrhhtxgwll023qy3eu,1ETdQMChucreiaNBjPDYV3R278ChRG3c7m +2025-11-10 17:23:00,3FkQ5nZWyHs7u63PgafTQ5jBK3TrLDcfRx,15qdGZi9vDYp7cADq2jqzpo6WvpVmX7d4p +2025-11-10 13:42:54,bc1ql6sgtq0uwh67un03dz4nt26n739qyu2xatgz92,bc1q7tpvm5d0yyv9lxme5y73s46n9cv803ue3gwevg +2025-11-10 00:28:09,bc1qv2gc72d0j8dmz530vuxlgyj25j4kz5s0050trj,bc1qv2gc72d0j8dmz530vuxlgyj25j4kz5s0050trj +2025-11-10 17:59:43,bc1qcr4jzax2wjyt5lkpqkhs6nuvrmfhmapsh9j8rp,bc1qe6asyl5njvyqc39qf2y7g5vqzscd9jysjwqs0k 
+2025-11-10 18:01:50,bc1q4y8s9l2yck6dvcejpmn90phdernngxcjwapqge,bc1qyktx2nxpjtrn07ftkpxsjv9c9atx7xh0nmrcdm +2025-11-10 14:12:11,bc1p6qvr6ceqgzmxdqrcrg2nzq39e3sd288sxa2reac6e285qq8epu0qgkdv9g,bc1p6qvr6ceqgzmxdqrcrg2nzq39e3sd288sxa2reac6e285qq8epu0qgkdv9g +2025-11-10 15:34:09,bc1qc5yxr9qkps7gfpkeg9xvptaz494n5eh4s00eru,bc1qcq5r4zg5f75ef0ps66nsvj3m03mjln9rcqjegm +2025-11-10 20:57:25,bc1q28f9lrqqaxly2jf2azfs36dyl7r6efcy8wkwew,bc1qz94dj90ymf87w77wfdpe7xyq6j9ngj6x4agyt2 +2025-11-10 14:56:34,bc1qtjuc2dqz34tkzs4uwame8rhyvpgge0gy6knhmy,1HYadqXeegRjnqDAYBkj92o5hF1pPJu4sb +2025-11-10 20:57:25,12rLYV7AQfpuxZpPXdfUqZCs7VCNp95qq8,bc1qu8xl7f8jkv2a58j0mas98r7gwecqqmnw6lkwt7 +2025-11-10 20:57:25,3FkTTwxagg6p7rs4d3GnQpZbADpatufQzo,bc1qfyfqzgvzxw2s2w7vms4yalys96aeqlq4rm0jxx +2025-11-10 17:23:00,3NhHCdt4RYXPjVQiYiWyRRyqqP7ik8SS9t,1636RnPVv6j8mTyaRmprjSpQCPAjD5UGiA +2025-11-10 17:23:00,bc1q5u709x2l7lsleprw264k5xj4rpmmmhhrurpkq7,bc1q7p54e7uarkxjlgc6qzwmugn7ygvmcwss262l50 +2025-11-10 20:29:45,bc1qrz8a6d4z2xnd2e3lnkd45v9jc5vd65t6a0pgzw,bc1qz4cfzstee7f208cxdca8v70ht5fcv5lypc63rv +2025-11-10 15:38:35,bc1q769n0hz7a9j038jdkwwd4lq3xwrkaskv6y8sp7,bc1qwrgtmyau0h2ar0guws8a9c0sa9vmjzqrlt7620 +2025-11-10 14:59:57,3M2nVoRZJgkxNHG7W7zp48xob11mbCTCKA,bc1p2d25ns4cf85dkk6jyytjeg4fcfv22lja6wwe77eltpwc2zd2yzqqyky6jm +2025-11-10 14:59:57,bc1pq7s7kpp90z2d4s7hzfxj32n72acd0987z3u2wm55ltst9fwelelq4emgpv,bc1pysearh9me9sa4kfkne6hu3jq96shjwdtrja2lf8uk4wn2uq5jjms72j3h4 +2025-11-10 17:23:00,bc1qjnmp4gxpp4u056dapqngwq2asw2ty74htrphx2,17Q8GjyhXZcf5RMmHGU7vznVvKpwGB88PQ +2025-11-10 05:47:18,1Nvv7ihqwz6Hh9rG4Bk64K7nGT7y1g2Wa7,bc1ql5zu6awgrz7wwwq6369kvc8fqz24n2ts6hn7y2 +2025-11-10 15:34:09,bc1q8a42mx0xfeyqy90zfkludfdu3c43w4a0jfw2ps,3QRLixkesAcqc268rikzbaRShSwSVydSSE +2025-11-10 14:56:34,bc1q5gpyv756638njr84s8uzeq3v59e97ha42hj52s,3FZLUQmcFTibssUKCJiNPtu9pXexXgoVun +2025-11-10 18:01:50,1LtjGorQ6FeNuC9S1oThWZQ6b79VvqH6Xp,1MqBMKLfsYq1xMwvwzZe39VAkXV3RYmNmk +2025-11-10 14:56:34,bc1pwjzpf2p4drax2mympx474phluyzjmpl7udnw9hmx43hxgc5w28vspms2c3,bc1qwrcm425acde6757923pxjkvlees9tww5xuqf0y +2025-11-10 14:12:11,bc1qrjkmdkhewjktx059nckpyqqlxazvr7kyeg57sllw7ksgenfwda2szu8kgy,bc1p2pxfzgune7ked0gldvt3j7zzusjv4t2uphhaf7pqg3srqdpuvngse97236 +2025-11-10 00:28:09,bc1qk90plyzwtzweulus6mmf9sd0zndplwqp26u8fy,bc1qeku5u5emyu6lgazyd6zntmfvkzffnk4yxm7l559hl0rku600798q533jtx +2025-11-10 14:12:11,bc1qmlj36ml4nay4tm5gj0h4u769uwpww6ucyqf0p2,bc1qp7ekeam9lauvcmltkhfxlyzd3udzmd39syyhh3 +2025-11-10 14:56:34,bc1q63qkttxua3aw9umnqqptkw2468rs5jne547umd,bc1q793gj5eml2u9hpgqm6y3xjgmfu669xy0aqzfa0 +2025-11-10 05:47:18,bc1qmptzs6czc3mlp6hy3932kka5687vn5fd9cnecu,bc1q9yn6zdkjjlh0z5y6sqpdvwq7pwkeh5r0ka28ad +2025-11-10 15:38:35,bc1q04qcmmnmd47dc8mjn9fcvxcjdk5f6edg6f40mw,3FxCJ2XUyFEup66QDaxgcHF8az6P19wien +2025-11-10 12:17:32,bc1q7t4vyehjsexdme84qhdgd4dawcn54djh0m78fz,bc1qcu0wrnx0002g2ka4sr0nnrldtff6dvq433unh2 +2025-11-10 20:29:45,bc1q3ms0mj7jtt9nd5smhv50uvd27czetxxlnlzkuq,bc1qcu3a5u765fdhddzccjy3k02uzewarlwd7yyn82u0fv42508n650s6w5um7 +2025-11-10 00:28:09,bc1q6dadscrdytuwjeedk9fr80xwmnl5prqvhwy7aga4k3fmxwhzvf6shuzpmh,bc1qdfcstw5dcud0dusq3tscs5khczqc7esn7psa3p +2025-11-10 20:29:45,bc1qtkya7nnflevqx2tgjajycw2gjl0y4w7626lad0,1ACE9sy42uw8Tns84KjueMj8koezrcZRdG +2025-11-10 20:57:25,bc1qm34lsc65zpw79lxes69zkqmk6ee3ewf0j77s3h,bc1p52jjweup7chageaggu4cj8jl3avylha8zrr7lgqkth82tancdrxqlhnvzm +2025-11-10 05:47:18,bc1q3dc2ec45m8s49u9rlv9y8ruyr644utc3hv7ncm,bc1quf7hjdq99rldlyqmxaz9sd3unm6j5fv6yulty2 +2025-11-10 15:38:35,bc1qsh343fpz0mtlfl3k4xzu5qru0uprdyh786lfsu,38qZsSoHitwa7XDicxT8xewyXPu2VAvhhh 
+2025-11-10 17:23:00,bc1qtgvxhy2armx5xajkfmdks5h8y7cqmqgwwfdrp2,bc1qtgvxhy2armx5xajkfmdks5h8y7cqmqgwwfdrp2 +2025-11-10 15:34:09,bc1qr7whs7crqjpqshv7qk9ex4742wagf9gnznjulc,bc1q20n8jugfv9c224fdfxe4vgugyd2gh7uaytt9kc +2025-11-10 00:28:09,bc1qnfw0gkk05qxl38mslc69hc6vc64mksyw6zzxhg,bc1qjhv7fcemmjx4temc4d2z500jv5fud2y3rtwwg3 +2025-11-10 20:29:45,bc1q3jzyfvu60rc3um5rah7y6j3gks3m7jffqpw9ef,bc1qc29w7mwejcuklrhqf0e8l6zjys6x4sqzkns2u4 +2025-11-10 20:29:45,3J8dPt32vzUdQzvwXpicG2XFcaG96dnRZt,bc1qre4tzdx5r8ckzhn9ffrxwusvugdfvee9n0v5y7 +2025-11-10 21:05:57,bc1q2dzekmutn0s8wh5ty9kywgddcl7j796zju8aql,bc1q3u7r770vyc5v6hae8v5pdv846wq430jhdfz40j +2025-11-10 20:57:25,32bZeQ89m2oPeM6wLKeYdvzsPNBDb3bGAP,3Psvpa4LQtEf2tR9i3VJwGRgHQsFPJ8rf1 +2025-11-10 00:28:09,1VEmWQLu9iohP6RMmabnKcDJuCkyk3E85,bc1qchpdg4wnyaswyfggfatrrwz9snasrc92wgzhfy +2025-11-10 20:29:45,bc1qlw4565huuxsr03dz3sepexjv6ujmfy2amye98d,bc1q87s3wsnzdhlclqpymykpkdm44ryv66av5fv08q +2025-11-10 05:47:18,3FfS44EtZhTBb1XXQPXcjiVqxpub9gncz8,bc1qft2zpj0wl4zqghk4lad6qr9www4zrrseuv0y5e +2025-11-10 20:29:45,bc1qsggexuj2xdmne5kvj2mnu4ur2m2qjpwlyrtqtf,bc1qfyaje5au3xzwcdmt2ecct5xneywrqaf4p46m22 +2025-11-10 14:12:11,bc1q4k8t9a9jrzhcnlyretxgz4kqc5hlyruvra5q5p,bc1qlyy7f7cu2rptc9n42khv3lxrc3pzwp58aa8qlp +2025-11-10 00:28:09,bc1qlwjqwjugrv5c5wzg3hmtj9m72tqj5mnqeazrzy,bc1qlac25q65m2wjz9fjzqg382txhycjc495gs6ez2 diff --git a/python/tests/data/btc_dataset/empty_directory/readme.txt b/python/tests/data/btc_dataset/empty_directory/readme.txt new file mode 100644 index 0000000000..0fdc877bba --- /dev/null +++ b/python/tests/data/btc_dataset/empty_directory/readme.txt @@ -0,0 +1 @@ +There are no CSV/Parquet files in this directory. Ingestion should fail. \ No newline at end of file diff --git a/python/tests/data/btc_dataset/flattened_data.csv b/python/tests/data/btc_dataset/flattened_data.csv new file mode 100644 index 0000000000..7d89529e74 --- /dev/null +++ b/python/tests/data/btc_dataset/flattened_data.csv @@ -0,0 +1,201 @@ +block_timestamp,inputs_address,outputs_address +2025-11-10 00:28:09,bc1q40a3a5xlwv0l4n4xs3mpvauscgkj78rcuhvrhs,bc1q9xh9exh34tq566nfvpwwupum6efh3yqc0aja5d +2025-11-10 05:47:18,bc1q2zn0548tc0njq9xxt0uamxxh48s8t7c9dw0a45,3K1t5MthuwcyNn4BWHp4eFSyTY8ifDjTkk +2025-11-10 12:17:32,bc1qxsp4nqu7zq424wdz4v7c6q8jylhtlskap86kpd,36jsyuJHY2Eqs48yjUb6xyzARSPYKKSrTF +2025-11-10 00:28:09,bc1qvpah3cjguuwmjjgwp2g9t52f786jnw7km3jgau,34WkZeqyNBb5FXMxLZJfveHNm2kqngt4w4 +2025-11-10 13:42:54,bc1q6xku4eta99267etxf84yy3qc2pyw04pq8dvqye,bc1q6xku4eta99267etxf84yy3qc2pyw04pq8dvqye +2025-11-10 13:42:54,bc1q50uxvgu5paqj22fg27s352wggev70xs9f8lhnk,bc1qxgxf44zkxrz8ld33kjvf85ekmmkcaexs7xkfyg +2025-11-10 05:47:18,bc1q5v45pufhugrqmp9a7taq92y4uqu8awpqcsrjq2,33bdcPeLq1Mk8TrHZ6sPwjpNbfHf6Y5UCg +2025-11-10 15:38:35,bc1qhkar0wasylsfhwamp8d023e2tqzcjfzjlx6wrd,bc1q4mhkgdfmattuya75rescq5u3ms4u3m56x8nzcz +2025-11-10 18:01:50,bc1qw8wrek2m7nlqldll66ajnwr9mh64syvkt67zlu,bc1qxwea59rqnldu5hl2zg3pt9c0av7eee4dfjnj7u +2025-11-10 17:59:43,bc1qn5yd54kvaa60jqw0ncx93ru7ctlk0727653ynz,bc1qehhvcqru3k3qfn95r9w3nf8jxnlflnw4d9duah +2025-11-10 15:34:09,bc1qr7c9p424qed5mqy33luxpu35wmxdl8vd383xze,bc1q26h4h4v5u9m0nk8mv9mn2pfpmcyemeuntee8xs +2025-11-10 20:29:45,bc1qsaqmj46l3jl42vczyqa34fqukdwdxwndj3f5ah,bc1qsaqmj46l3jl42vczyqa34fqukdwdxwndj3f5ah +2025-11-10 20:29:45,bc1qk4jzsk3qn6dra7re4cs4tjezhz4udr86cvgs0x,1NoeLVMHeSC9gvw1FdAnmHTNrHX9iTNXoK +2025-11-10 00:28:09,bc1qlmgaecwta0h2k3wc0yx5gfeagyj2u4kq7p525x,bc1qvrwer4355chxf9zcyd8lweaesk30r797f96v7v +2025-11-10 
20:57:25,bc1q2dshq3fv3nxpu0h5t06a2drcpl89u9xvncdena,bc1q6vqedrdgh9j97jp4nhamyjqpf27rtldwr5zm07 +2025-11-10 14:59:57,bc1qf4g86wre0gmzxxh2c0p5892ne4rdrlxe28pazd,bc1qvmf3qpe3lt8asy8g86vxvjnzkn366gn2q0qyly +2025-11-10 18:01:50,bc1qex279aerh885fr62m0lnv3htvja2kw9km9c2aa,bc1qex279aerh885fr62m0lnv3htvja2kw9km9c2aa +2025-11-10 00:28:09,bc1qdj7cz87pd3st0ah6yhy5ltmv423ltd0xxycn7r,bc1qyn66t4wue704rz4r37mye65hsqypg0juq89068 +2025-11-10 14:56:34,bc1qjpdqs9fjgapv305p2w7lxh7zk49rjjrqzwqguf,3QNQ49CBeC63Z3r3C2D3s5grMVKZQTLQ1J +2025-11-10 17:23:00,bc1qxw0vxcnwta0n8q4s7apjtsdc72uqc8dt5wvraw,3Hs4uUpc2ntg7ajWjoiHBEksU5182GzDft +2025-11-10 13:42:54,3NtA69Kn6Z6w8A7eQNXyz5ijYZxnynvT7N,3QCCr2s4NHn2BR2oTKNcZPFGEf5cjt4BDP +2025-11-10 17:59:43,bc1qcpl66gg9wpyj4g9umh9ye0mtsgtzmmlmrcjpr5,bc1qdkwxe7cuq0z35tfanuavvmpxrkqlhy72esnj98 +2025-11-10 17:23:00,bc1qlrex5kmzc4kdvwjl0j7sm2pvewvwaylrnrlns7,bc1qs2en2wud3q99wq7ykwdwrwzkpxfluchycw6wvs +2025-11-10 00:28:09,bc1qhazv3u50uvv5lw83sgd2dnlt80j2fn2j39h4rz,3CFP97ZCKiy6dmv2wRuJtUUYZcGMXjzfKA +2025-11-10 17:23:00,bc1qch5udvlvm9luneywcfjxjfvy6wuy6hysku8vw7,bc1q3qsupsucu6sse2uwpc2hdysf3mu5wfl9rnexvu +2025-11-10 14:56:34,bc1qenk9rwm8phgfy0w9zl2fdsejxhxc86sm67y5we,bc1qzclu4epvxu9e6a5c2mk650fzdw4626352lyf2e +2025-11-10 15:38:35,bc1qxgjzdgp0yrh69llsgepm0nfrj2h0l9cjvk9p7w,bc1qzttelzj04e87qcvfujke7wdtfldug4zmugmwh8 +2025-11-10 14:59:57,bc1qdyr4723zc8azryex3weyspdhhdptlwurfzej5v,156kgpUXXggfZeyFdgxxs6Eouok9MscqZr +2025-11-10 20:57:25,bc1qsc5q87uem04xzedure8u90452hgx8h00y96ssa,bc1qmcx2377el75jxxg8y5673srdgj7v0jgjn4y0u2 +2025-11-10 14:59:57,1GbUhB5Aub17yjLaFf1fPNFN8icPzXn3sU,bc1qsj84dmnupc25zhecdxfhqnhtg4g63w8h0q260g +2025-11-10 20:29:45,bc1qpsys7sfk5u7ue3lffwzszzvffhtku78kr0vva4,bc1qt67jql0p02yvjncgqvfjzwnc2fwhjszm3d9jhmgsnkpxkaf3j6sq6cdn7c +2025-11-10 18:01:50,1NKKRqJDwJjdnYL1Wn7QsUjxdDxqJAL6vG,3GEeyC3sDSDQYMnsfYdMUJGXpGoXnz1Q5B +2025-11-10 18:01:50,bc1q77flh60ngm58qzpuhcvuxtzp2z6qc3ww52udm3,12tPKMdmLCReFCjLiif7bng5VX8G7WTHjw +2025-11-10 15:34:09,bc1qr7whs7crqjpqshv7qk9ex4742wagf9gnznjulc,bc1qehv33h4ez6drn55yqwa0v8mwzpzcgqsu2gkejyy3e24m4w2az2jq0mcke9 +2025-11-10 14:12:11,1N6iknsjUx9R9JiarC136TVCJzUSpjr8V5,37exsSwT2HfBqgeM68kzKXvG4aJXW6ERnE +2025-11-10 14:12:11,bc1qtz6tnlusht3la6p6gnjfgd3ad6zneuqk64k5j8,1HcmTmdkbmVbuof1vurum74PMcmC5Lf7PM +2025-11-10 05:47:18,bc1qm34lsc65zpw79lxes69zkqmk6ee3ewf0j77s3h,bc1qq4e2xm40vgmz79rrh4l3zrrf7eg96l2q3tw4kv +2025-11-10 14:59:57,bc1q9r8pgy2xquxj4ph8gjatle587dsvqwjrmyflx4,bc1q0prfw6u83g4pj9vyzpzgxpq5u4xtn3kj8ltg2a +2025-11-10 13:42:54,bc1qemdd6pq7qar7hs33jvvsv4tlt5edasxk996hda,39ohXNyW4mXo652kA5CzsH87HDwuDmmNuY +2025-11-10 15:34:09,bc1qmzhda5986ssuf45x2qzymtuxlc55qs7ddds7ae,bc1qwujtjmpk7eavveax6e2crygzsz59pl7qnudp8a +2025-11-10 14:56:34,bc1qzkunkrkz6q5ut6wwcjktn3fshgshx686d0xqtg,bc1qpzuj9jl3gxarhdt6rzxynk0cs9a7vw0acp5chf +2025-11-10 20:29:45,bc1qnlp6y2cxtnsm4u6xfn6yjvkvjp4u8x08fsp4yq,bc1qkc2quddu4nutdn7p0gcz4kpenex6ukxqjmxj62 +2025-11-10 14:56:34,bc1ph6td9w6mfvj0s00vj3240pclurgmeenw4a2c84vppnswufwmp5mspmp28a,3MP4GcjaoPcFunef59ZxtjNkwb3gEw2ZPS +2025-11-10 12:17:32,12RVBQfdCa4ctMCFrwPGKz1iLSs5wwdyoV,bc1q2nalwjxe947fe4zl4rmd6g97vfrqn3af5ema38 +2025-11-10 20:29:45,bc1qqf89nhg5pkrmzafsranjn6rku5nv484vtagfuy,bc1q8pkcam0ut9e3c2azzj5292q4tdszztqn7mrxfq +2025-11-10 17:59:43,bc1qns9f7yfx3ry9lj6yz7c9er0vwa0ye2eklpzqfw,bc1q8gelljdlq5dyg20hsacga6my0k462k0muewp64 +2025-11-10 12:17:32,bc1q2sc0phtfha7n50sd250g6gpx7w36y6hr75cdd9,bc1qvdsw7yv7e6pz0p8pxfweqhu0qhj89tvyc0wv4j +2025-11-10 
20:29:45,bc1pw86pslqfvc6hhxalwjcsdp24te6hnyfxhvckkf8fjmf3wlp42vwq9yqwtt,bc1pykfp9h5x7umf3ksrgnff44k5yen7kps5gz2ee8znkltp7udth84sq37w06 +2025-11-10 05:47:18,37jdMXYbvg3dKzJ4pGSYiABiXoBy4putZq,bc1qrs2d75werh8nmvr4makktpyuu4l7uex9yr05ja +2025-11-10 15:38:35,bc1qep5xck85tsry58d36xxvdpwwh3s4vm04v7t4cw,bc1q95lyl6wtxs5zt7dna9qwjfvg3k84n5z87fuyuc +2025-11-10 17:59:43,bc1qthv7smt28rt2ctyg5ec3vkvx0hl4ckks9m75c3,bc1q2g6nxqvv2s09plap4x4j6pgnc2y42lmjy8fpxf +2025-11-10 20:29:45,bc1qt9nvg8mwuqmjlruc8z9m4kns2hd3gk2ylsdryp,3JDMdeEtsEfnABR6AYaHNU8fy9B7uDJduH +2025-11-10 12:17:32,bc1q4996ykxqxnhey7sh9fzvn3rre3mswta69myft7,bc1qlxsa2yndh2xt4rpmw5pt7eqaelkzd54psvx4xa +2025-11-10 05:47:18,bc1q449ruh9ga8pzd4uzds83zqckwp6yqxfvra7fvq645ev7sqefj9jsdkpegd,bc1p5xpw59psu6yyzmee8s8ptqy6gvpfls0zsegp5krw0x6dp2mtru2qcxq0tk +2025-11-10 12:17:32,3FXf66gb6NNMA52WSBBjJyMmJ3t75hLRDM,1EHF3NhZPLWEaEppSHqK5AVGB8zX5xTXEm +2025-11-10 17:23:00,bc1q4pmrg0q6ywlsd4vutmmv65g7n4mgp7tf2hz9gx,3Hqj4K5kofSnRa8MjJPt43yw5gd5f5sdKG +2025-11-10 14:59:57,bc1q3aus5ka7pq2wcjuesxqlaqwhxgckxxlx6t6ejy,bc1qwl8jvjvkyeun7wj09xkcpwt4g7sqajlzush8j2 +2025-11-10 15:38:35,bc1qh2cwxv899psevx5yhr9cvuvqpcwqr3ej4ns87a,bc1qkuwtp76a89y40nvp6235xh9u5rpumv0arpm8rq +2025-11-10 14:12:11,bc1qa7a3ndesd0n2srxzdhnkksu0hkc96rp5uyyq95,bc1q3e0xr3pqz0rad4uakctg7j2crnf3v53na9c567 +2025-11-10 14:12:11,bc1qvxs7r6w2z3aguqzdecql7t5vkr44q9gccgetyn,bc1q0qdfdjjp9w872r406tlff34tdy9wa7wvdgutd9 +2025-11-10 14:56:34,bc1qsw6zezpduq9985vxs48uhffv82jzxgl8rm5mpd,bc1qsw6zezpduq9985vxs48uhffv82jzxgl8rm5mpd +2025-11-10 05:47:18,bc1q2alxcv39eku3cvctdudkqf2y2fvqu2t49wlrj2,bc1q2alxcv39eku3cvctdudkqf2y2fvqu2t49wlrj2 +2025-11-10 12:17:32,bc1qm34lsc65zpw79lxes69zkqmk6ee3ewf0j77s3h,bc1qkqd2ewnd8mw358qw24vmwgepz8n8cta0y2pe97 +2025-11-10 05:47:18,1AGD42b91ycvAcMrCCQ1G5iiBZooPP9xqX,1AGD42b91ycvAcMrCCQ1G5iiBZooPP9xqX +2025-11-10 20:57:25,bc1qmv76fxychuzcuvt3uy5uy3seu6jvxrsfz4993w,14s7DCukciJLAFJsYDzPK7ZfFP2thzU1Yt +2025-11-10 14:56:34,35C2L1pCgwzBHNcDcVL1a5RuoefeWqyjAR,bc1qhfsad8wuvke9eczj5g4e287hz93g7t8nwn9gxj +2025-11-10 15:34:09,3HYjQZaytMNixSZwU6Dkd7QBQhs2yFcp3E,14BuunTtFx9HMvgJzJQnxR5VNgjXCKckSv +2025-11-10 21:05:57,bc1qx54ge94tzqtfjzqy0dm9z9q6yqdm75v3ed8g39,166dELRZqWwRjXXVdbahWLi7muS9A2jjJv +2025-11-10 14:59:57,bc1pthxvxwdparr9pg5s9dxl4s4u3f744qwn9atl3gcltvjx76hyg4kqx230gh,bc1pthxvxwdparr9pg5s9dxl4s4u3f744qwn9atl3gcltvjx76hyg4kqx230gh +2025-11-10 14:59:57,38Vfgc9RT5EptuLMQTrnJ65eZgQtDpneom,bc1qeq2flx8mfj3gn3j58uwaj5u2xzpt8qmvet46nl +2025-11-10 14:12:11,bc1q7cyrfmck2ffu2ud3rn5l5a8yv6f0chkp0zpemf,bc1qx0m6mzl4756vwg23jxkdpew03wwfze856phhxl +2025-11-10 17:23:00,bc1qkqw4qj5gplgkns6ysg6xd6986a4xrwpg9fsy36,bc1q9stxzl5x02rrq0cfmlfh4epumn6zvq6asa9n0u +2025-11-10 00:28:09,bc1q6nlqnu6jad5fygq7v5kua2zpr2h5x288szq5km,35AtUZgvWh9uAHhX6fMidJcbVTJTjC4Mcs +2025-11-10 14:56:34,bc1qm2safrrv8e6696sgtksv36yg066qcafc39mu3y,34Yq1C3TS1V5S8w3CNx3SN2W3CdjoGu9QN +2025-11-10 14:12:11,1GbUhB5Aub17yjLaFf1fPNFN8icPzXn3sU,bc1qw3qld7muf6feg5jlypcdfrmq4hzanx92xagswe +2025-11-10 05:47:18,1NfJSiqBw4fb74KgVQrPsk5W5aqitAD1Xv,1L6yJi7TcjztgX7W8ds1zz2NGEnT7GdoAz +2025-11-10 05:47:18,bc1qd9gg9qhkswtp26kujvc29zc6daj4yzv6qsgur3,3MqsziNKsuFwM9unjhAsKYZCQ7mNfYTSv5 +2025-11-10 20:57:25,bc1qdvdt8vjn4lpep3hr6kr90uwmjr7rf906ac5lxn,bc1qqz5ll7nwghwjg9e9d0wjpx54cs5rc6uu48emtf +2025-11-10 12:17:32,37jdMXYbvg3dKzJ4pGSYiABiXoBy4putZq,19WpQ6KYi2SGKRnsgkeX5ALgZjgSMPiqmu +2025-11-10 15:34:09,bc1qz9t7rmkf3wn3yuz48e0fhg8dvzf7hsmf3a6at9,3R1Ku4KBj6y9ekKkCJXnrVkUzGNjZtEXah +2025-11-10 
15:38:35,bc1qmkavqx59gg4aucmctctww3nve6x9keln9s07zz,1HzijFYeyDDUKym1siU5hBiuyu9WVRPcVj +2025-11-10 20:57:25,bc1qkplg5aqltwln2ks4shezddemdffh9pmt73xnpd,bc1qgx3c07j5enqz0pfwpkchpx6cwh690zst9l4dz0 +2025-11-10 00:28:09,bc1qt2xxqsy0tnvcvml47t4ugrvm8p8h9skkv886uv,bc1q2sa6jadt6gry89778csh2lmu2xaw2javvq9srn +2025-11-10 20:57:25,bc1qlpygchhl3j07mh9z7gcsqzjfapyyurm9amvhh8,bc1q0z4v89tecuy5e5cr3hdj6ts8zz0ky79mmattsv +2025-11-10 21:05:57,15QKr87rKdX8g7kmJ2DBWEgbWiGrnxBTnM,3Ae1fKTuLvPNTqjAsuXznf6cFySEtcArho +2025-11-10 15:34:09,bc1q2z994nye47fnwxxy4nwukfg9kkq0m5xe7hxpq2,bc1q4saygf3hk4cl0hej8e5rr2wpdza8zga6fqshqt8mrlzgpt348chqgs532d +2025-11-10 20:57:25,bc1qagt6ng896jhghzuxhqzrcmkn0nq7tzpjejghgz,bc1q3rqkcktkqnem5yfzmrzya289m2xjl9vl37fgmxdwkhl3n7f9l35qk2u36p +2025-11-10 00:28:09,bc1qwthqxlv39qwxt6j5c5zdhxprjfkpy4qgre6nvl,bc1qc3h9tdncgalkv3yeaw5fxsn0ktdavxmvzmcm09ge4k59psxxx60qmwcmx6 +2025-11-10 20:57:25,bc1qlll42hhmtn7pwz6srexps6v8zm2tqp2p7tx4pt,bc1qzheza7hkc4jyp67vhtgrkaxpddwl3z0eada6fe +2025-11-10 14:12:11,bc1pah7lzc5rcms23lvnsrsj68atcagcg25j6kzlk7aytjtludwrl5lqclvw3t,bc1pmspgjsaxqfdvzkg7du7sgpclldwf0fajrkn36qgkrat99ugjz2ushjvq57 +2025-11-10 18:01:50,bc1qmgv6leqa0rpu40ycpndxg9hggrqdxcdgtmvp34,18N8SP9Ui6WKLtDWks6DfwMMMeJT4W5TDB +2025-11-10 17:23:00,bc1q65h44f9taql5ew5nzd2xzkmq35wcag2uugvj82,bc1qhzvug7trqm7n36g5azdqa968rl9ww5rvwzt69w +2025-11-10 20:57:25,bc1qrgl0yu3zuglvcgrdsglc5a8zdsud6n023naqaz,bc1qlg7shfvat4p0fg0n9rzap3qxryu45sljuxy4qp +2025-11-10 14:56:34,bc1q4dd592zay7mpw9arzj02pr4hr6e5yj603r29sy,bc1qepmkx8n0flmw3qq9puwwj0rpeq4atxe8tktef2 +2025-11-10 17:23:00,bc1qp68ctlw77lkyqjyxfn200ergk079jqpxny4yj2,bc1qp68ctlw77lkyqjyxfn200ergk079jqpxny4yj2 +2025-11-10 14:12:11,bc1qa3tac82ncmpd34cy2g7a0xvhm2eh9wec5dcnl0,bc1q6f3kdsu9jxvk4uv62nzmdfcmy0pu6k7y7p7w3f +2025-11-10 17:59:43,bc1qdkkaprszrgxmn6umkrs7fzufpzrefdzz7p5heu,bc1qc2qw3a5m035jcaughewp8w2mst0w788fym5fmn +2025-11-10 13:42:54,bc1qryhgpmfv03qjhhp2dj8nw8g4ewg08jzmgy3cyx,bc1q08hcxrtxl28erd7tmevja57u8af76at6qukmfu +2025-11-10 14:56:34,bc1qjq2l9469dqklhejlvkr3va9qrd343mwrtzu4v4,bc1qlnrcexfn0z6n64zq8jj7ulful6y76gs5yvme3w +2025-11-10 20:29:45,bc1qkhl6epe73jec02jv3sp2lvdfhppyg874wxs9l9,bc1q7t4vyehjsexdme84qhdgd4dawcn54djh0m78fz +2025-11-10 21:05:57,bc1qedynrxg6cslv6x09mf08xl3k2ezxj44dnf5lql4tfjtltl5csr5saaxpq7,bc1p3szdkat5d9p5chwy5jre2fz6srsarmhl2vset0v77wd92pz3n2hs5ly07y +2025-11-10 13:42:54,bc1q3vm68x9hgdftwhwx6fxv39l0l9tz3l3vvpln0c,bc1q3vm68x9hgdftwhwx6fxv39l0l9tz3l3vvpln0c +2025-11-10 15:38:35,bc1qul9vy6f3lvr70tmnkpk26gv07vc2j7m8fghu5g9vmlr94w36tqksasw04q,1ysUBxNE2bwbQSDncJHWpuAhK9fdPUTb8 +2025-11-10 12:17:32,bc1qm34lsc65zpw79lxes69zkqmk6ee3ewf0j77s3h,bc1qjccwa37c4eckaj7sym6qc5933g6650yawv7uu8 +2025-11-10 12:17:32,bc1qv4km03y5kmpqan8hhytj2ysw7a9uw3jnz53f7n,bc1qjwm26n2yavglfe0l5335ffkhgla95z5wcfzshc +2025-11-10 20:29:45,bc1p72yx4zjhrrzes32ful3nwzxkteukyt4fq0ulkncpj8aaj32yt80shukvy6,bc1q85ywegp3yn7c7nnavgkfv0gjgsvaz4p3yfck62 +2025-11-10 14:56:34,bc1q5v8yfeevlwrxr6hhf9fvtm3gs39k3rup3x48af,bc1qvjf34jq086xt6k8dlkt40h8sh7qpq4z0rn5g95 +2025-11-10 14:12:11,bc1qsqcwf6pg4ke8ahmy78fvwadseav2lgd3rxj6dc,bc1qf63zft4rmknjpefue92gqnkqppk3jvtty53nr7zanrqd0kathhqsjv8gfz +2025-11-10 14:12:11,bc1q58rmazkczke70g9gwsggpnq9rarnn4mxchmnzq,bc1qzp7dh2r0xkyw4menu7th9k8nk3qp540myml243 +2025-11-10 12:17:32,1r2qgPBgdMNiNvUhWSSD5dAiqNb3pZDDq,bc1qkfaejemkvpze78lw86cwhpnfr62q30vzycgvx4 +2025-11-10 18:01:50,bc1qk35enqhuhtcfslw3h6zguaed4lv5cm57npd6l2,bc1qarwf6apwh52pngnpnlrm8dpzkdw0zvchxz4w7g +2025-11-10 
21:05:57,bc1qmy4v6rlj36aal9yu2jfk4k43ef7gfzdvu0xe23,bc1q3r6vhv6r03qgh6jkqns2nls3yzpssagw3st4d3 +2025-11-10 14:59:57,bc1qhaper2xq6ajm4u2398vsxc0x74lqx7ufmy2c0varj8y5usfyx0dq6ndfnf,bc1qvh6jqvnjas2e3ftygrg465r3y57qvy94a8mqaxakanmhwzjmfz3qjmgq7f +2025-11-10 05:47:18,bc1qm34lsc65zpw79lxes69zkqmk6ee3ewf0j77s3h,bc1q9wf2z7rfa5a4uyng2d3lkqpr06ugvsn2grm6r0 +2025-11-10 00:28:09,bc1qv6xz6gvg7654exxm2m65qqyj56a9qayxkvn5yh,3BNb5kNf69bDqU8wRGUXbEUK6WwzwiE2n6 +2025-11-10 14:12:11,bc1q5pa50wkmfqk7zjh3sndq5n95he3ehty5vmlqex,bc1qs7qhzl77j5fd3s0x098ga8uwgnu3jc9ucf3lnjkre9tr70tjg3gs6vlr3p +2025-11-10 21:05:57,bc1q27gjd4z6prxzhd7kw505p0qs58vn49eahhvsft,1FmsCG9dHV1ea3CeyayphxreisWTFDmwUb +2025-11-10 21:05:57,bc1q6mtzzhl2vffy90cmux747dj4v7lkhu9rvfnwh2,bc1q6mtzzhl2vffy90cmux747dj4v7lkhu9rvfnwh2 +2025-11-10 17:59:43,bc1qhp3yg84skqr6f7m43junkhpzcxv02u0uzwyfye,bc1qmtd34z56we7qgdra7k9zu3zqzne6l2z6x6k83u +2025-11-10 17:23:00,bc1qdmyqv5p4e876y3l76c05a636zzg8jam7rcfg8v,bc1qxslzk2hya95w23u3vmzjtxv80lspnfy4fdpx7k +2025-11-10 00:28:09,bc1qamgjuxaywqls56h7rg7afga3m6rgqwfkew688k,bc1qul5ls7sawrl8vh7yt729esad26s47xcz7llz4s +2025-11-10 20:29:45,bc1q4vxcxw7mpg9dcryqu0kav8awrn7qk5e6wgs3hg,bc1q7t0k8uez6zkcm7g0vsqq47kmr8x9ye3xvtawywvqwt7zafnsd6aqhpe7a7 +2025-11-10 18:01:50,bc1q5s4qwh68655xguj43ppxgwhh2r05z3ggx8prfv,1DExp9PkcKvDL4HAqbykBctzkDHWiLE6Mi +2025-11-10 17:59:43,bc1qwx9tp5jmzz9a3mzafnx2htp7x0ueu5q62mxnyn,15WkUpnBrCoRK5v3Phj4aKoMjeBHo4ix8E +2025-11-10 14:56:34,bc1p7mx3edp7h05s57p57cj0wd4jvqvutaw5szwklvruep8sts8xe6uszux6sv,bc1ptp8fyw450mz85a5tf09vy4thw56q0qw0ntm83tvd6ep3hzje52yq0jt965 +2025-11-10 12:17:32,bc1pprfs443mkx3q3902hgetdkq5rg64vzmn6emnn5f6saxxfe8uk8vqlt7a2d,15E8oCgYt5bJBaExy18h2wJdm7vkjRHKv5 +2025-11-10 20:29:45,bc1q3zph9r2h0e2v29s8phj57u4sd5l2y6pu48a945,bc1qhghnchwd5j5av0vslgmz76l598zr3v8yrrcvaq +2025-11-10 20:29:45,bc1qt3x6hvy29mwsshka8d3zzp5ylf4yjhc0jvlljp,bc1q2x4xyl2fvqnmvy2vt6u24e67lvkmq6hxqkkjte +2025-11-10 15:34:09,bc1qnfw0gkk05qxl38mslc69hc6vc64mksyw6zzxhg,bc1qsqfl939qdz0vpwkynqvawntv2gzv2394wjp8a8 +2025-11-10 13:42:54,bc1qcjqyhhd2mlgm2x76rw7pfqesqxae2kkkp9lrek,bc1qhm8d7ece35rk9umwd4d2myk36rnseq8wwmaury +2025-11-10 20:57:25,bc1qwlwvahdkphj3rwwc6ln6ww8jkdy7lw4vw3ce5z,bc1qstl923xpdarm44573p8jpeh7lu2mp2qztp5pcx +2025-11-10 14:56:34,bc1q760uuh9m67gdnjq594ywkyf6m9axutmxuy5xpr,bc1q48rrl4cdyujdls96yse82e5l7y7lmsyfnjfy6m +2025-11-10 21:05:57,bc1qn3c2efvw6vmwqtmqf0tv45pv7g6ftrms9yhrd5,12sxYzAsmziEjtAvKkxqE4w1uvDaTUXwXW +2025-11-10 00:28:09,bc1qegxth00stfmkuneeg4me6zlvyyl904u0lcaz2c,388JzjgiUn8JDgzaWTHxTZ9UbF3jPFMJg3 +2025-11-10 17:23:00,3H8XNZk9YwcpUAqRYPLcoVNMH3o18UrYhQ,3GKVcNr6xnXmyEpVu7h8kEusryGfy16bU3 +2025-11-10 17:59:43,bc1que6q8d3wt5e7xz6x8qtp30euh70fgvkhyduwqf,3HTtPcoAUpq1Mma2mE5mHUrLDCXahJg9Jn +2025-11-10 20:57:25,bc1qh8aq5wxlrq94m467gd8l40rv3v6ja98fe00f3w,bc1qrh7f96680090mup2njzlasmwlex50l2ntjur4u +2025-11-10 15:38:35,bc1qs0chk7re599jqdr7z3vpsftc9ut7du7scxnyrr,bc1p2ncdhtdzf28enksd2syp4033atm4jq6j9v2py7uqe2n48jf5jelqe5jycq +2025-11-10 21:05:57,bc1qyujm66rpnnfnn6zkm329gj75r6e73tk6zv5j50,1PMS23kYZv4VT3zPBSG6j8w4tjG9VVLr24 +2025-11-10 14:59:57,bc1qsnw4rsdg6tgrnrnxvjq5qnyy2j23w82um58tw9,bc1qd6t23tfrg82zpapv0tepvsm59r030mafdqx4pp +2025-11-10 21:05:57,bc1qr4v4nl2ay04tt2uz6lscy47wj277ntf8cee9rj,3Miq328XhYZVgKv8vbkxF4KxbGrFPnccA4 +2025-11-10 14:12:11,bc1qp74l26w7t27r4jedcsx3lcwn50cljwtqs0yan3kjhx8e42py26uq35f4j8,bc1qhh5ju5mu7w55p4zz3sfl938xuqu6zhdpnx9rlz8m3umqz8vkfltsmfeyt8 +2025-11-10 14:56:34,bc1qu4yaxft2tzuva4x2tukuwuus5d9hdzj02ttxnm,bc1qng6tdu6sc2aw4x42lmg6ntqtxd4ut43357dhhv +2025-11-10 
05:47:18,bc1q4euuq7lcs2tnt8fgdxaylm8034q7snaekt7p24,3FN1CiruvagBhricdJMiySa1vD9h3emN3g +2025-11-10 20:57:25,bc1qy9ed5tc7vhs9dkad6hhkw8t4wh56jyg0czxmk5,bc1ql3xnnz4kw3wr080zrk7wysmzjz7sr3rjspt82a +2025-11-10 17:23:00,bc1qa5j77xexgsl46ahrzeuktv82g86qdalxn6e3at,bc1qvhket3vgyyy3yulclmr8efq4luwvy90a4qwjle +2025-11-10 20:57:25,bc1q2zsrq6vhkkzyr35hzkjgur38nhpj86jmez92cs,bc1qeldfg4ytnzsjk553mr2cdlk66usfz22pyz67dh +2025-11-10 14:56:34,bc1pseekhxtjhmvs0xlc5mzmqs0dl8xcjek26ujwpg6hqhkp3gc50f3qjuhlxp,bc1pv2tdl4x285n96a5e9hh36mt9xw098ktqz5hnywe86cph0pplwaqs56yw85 +2025-11-10 17:59:43,bc1q2gchdrgwlh5tzwm0spph77a7mj5pq0rc38aahk,bc1pcsc7utumn86cklrkzyusxdgc9e3fwj6mxu9j327vwts0cv8e2a3s92ddg4 +2025-11-10 12:17:32,bc1q0qfzuge7vr5s2xkczrjkccmxemlyyn8mhx298v,14pXPZVfyeL6gxKZDVagbQnxQhXMpM2Thb +2025-11-10 20:57:25,bc1qhcntczjrk7n83736ww45zrhhtxgwll023qy3eu,1ETdQMChucreiaNBjPDYV3R278ChRG3c7m +2025-11-10 17:23:00,3FkQ5nZWyHs7u63PgafTQ5jBK3TrLDcfRx,15qdGZi9vDYp7cADq2jqzpo6WvpVmX7d4p +2025-11-10 13:42:54,bc1ql6sgtq0uwh67un03dz4nt26n739qyu2xatgz92,bc1q7tpvm5d0yyv9lxme5y73s46n9cv803ue3gwevg +2025-11-10 00:28:09,bc1qv2gc72d0j8dmz530vuxlgyj25j4kz5s0050trj,bc1qv2gc72d0j8dmz530vuxlgyj25j4kz5s0050trj +2025-11-10 17:59:43,bc1qcr4jzax2wjyt5lkpqkhs6nuvrmfhmapsh9j8rp,bc1qe6asyl5njvyqc39qf2y7g5vqzscd9jysjwqs0k +2025-11-10 18:01:50,bc1q4y8s9l2yck6dvcejpmn90phdernngxcjwapqge,bc1qyktx2nxpjtrn07ftkpxsjv9c9atx7xh0nmrcdm +2025-11-10 14:12:11,bc1p6qvr6ceqgzmxdqrcrg2nzq39e3sd288sxa2reac6e285qq8epu0qgkdv9g,bc1p6qvr6ceqgzmxdqrcrg2nzq39e3sd288sxa2reac6e285qq8epu0qgkdv9g +2025-11-10 15:34:09,bc1qc5yxr9qkps7gfpkeg9xvptaz494n5eh4s00eru,bc1qcq5r4zg5f75ef0ps66nsvj3m03mjln9rcqjegm +2025-11-10 20:57:25,bc1q28f9lrqqaxly2jf2azfs36dyl7r6efcy8wkwew,bc1qz94dj90ymf87w77wfdpe7xyq6j9ngj6x4agyt2 +2025-11-10 14:56:34,bc1qtjuc2dqz34tkzs4uwame8rhyvpgge0gy6knhmy,1HYadqXeegRjnqDAYBkj92o5hF1pPJu4sb +2025-11-10 20:57:25,12rLYV7AQfpuxZpPXdfUqZCs7VCNp95qq8,bc1qu8xl7f8jkv2a58j0mas98r7gwecqqmnw6lkwt7 +2025-11-10 20:57:25,3FkTTwxagg6p7rs4d3GnQpZbADpatufQzo,bc1qfyfqzgvzxw2s2w7vms4yalys96aeqlq4rm0jxx +2025-11-10 17:23:00,3NhHCdt4RYXPjVQiYiWyRRyqqP7ik8SS9t,1636RnPVv6j8mTyaRmprjSpQCPAjD5UGiA +2025-11-10 17:23:00,bc1q5u709x2l7lsleprw264k5xj4rpmmmhhrurpkq7,bc1q7p54e7uarkxjlgc6qzwmugn7ygvmcwss262l50 +2025-11-10 20:29:45,bc1qrz8a6d4z2xnd2e3lnkd45v9jc5vd65t6a0pgzw,bc1qz4cfzstee7f208cxdca8v70ht5fcv5lypc63rv +2025-11-10 15:38:35,bc1q769n0hz7a9j038jdkwwd4lq3xwrkaskv6y8sp7,bc1qwrgtmyau0h2ar0guws8a9c0sa9vmjzqrlt7620 +2025-11-10 14:59:57,3M2nVoRZJgkxNHG7W7zp48xob11mbCTCKA,bc1p2d25ns4cf85dkk6jyytjeg4fcfv22lja6wwe77eltpwc2zd2yzqqyky6jm +2025-11-10 14:59:57,bc1pq7s7kpp90z2d4s7hzfxj32n72acd0987z3u2wm55ltst9fwelelq4emgpv,bc1pysearh9me9sa4kfkne6hu3jq96shjwdtrja2lf8uk4wn2uq5jjms72j3h4 +2025-11-10 17:23:00,bc1qjnmp4gxpp4u056dapqngwq2asw2ty74htrphx2,17Q8GjyhXZcf5RMmHGU7vznVvKpwGB88PQ +2025-11-10 05:47:18,1Nvv7ihqwz6Hh9rG4Bk64K7nGT7y1g2Wa7,bc1ql5zu6awgrz7wwwq6369kvc8fqz24n2ts6hn7y2 +2025-11-10 15:34:09,bc1q8a42mx0xfeyqy90zfkludfdu3c43w4a0jfw2ps,3QRLixkesAcqc268rikzbaRShSwSVydSSE +2025-11-10 14:56:34,bc1q5gpyv756638njr84s8uzeq3v59e97ha42hj52s,3FZLUQmcFTibssUKCJiNPtu9pXexXgoVun +2025-11-10 18:01:50,1LtjGorQ6FeNuC9S1oThWZQ6b79VvqH6Xp,1MqBMKLfsYq1xMwvwzZe39VAkXV3RYmNmk +2025-11-10 14:56:34,bc1pwjzpf2p4drax2mympx474phluyzjmpl7udnw9hmx43hxgc5w28vspms2c3,bc1qwrcm425acde6757923pxjkvlees9tww5xuqf0y +2025-11-10 14:12:11,bc1qrjkmdkhewjktx059nckpyqqlxazvr7kyeg57sllw7ksgenfwda2szu8kgy,bc1p2pxfzgune7ked0gldvt3j7zzusjv4t2uphhaf7pqg3srqdpuvngse97236 +2025-11-10 
00:28:09,bc1qk90plyzwtzweulus6mmf9sd0zndplwqp26u8fy,bc1qeku5u5emyu6lgazyd6zntmfvkzffnk4yxm7l559hl0rku600798q533jtx +2025-11-10 14:12:11,bc1qmlj36ml4nay4tm5gj0h4u769uwpww6ucyqf0p2,bc1qp7ekeam9lauvcmltkhfxlyzd3udzmd39syyhh3 +2025-11-10 14:56:34,bc1q63qkttxua3aw9umnqqptkw2468rs5jne547umd,bc1q793gj5eml2u9hpgqm6y3xjgmfu669xy0aqzfa0 +2025-11-10 05:47:18,bc1qmptzs6czc3mlp6hy3932kka5687vn5fd9cnecu,bc1q9yn6zdkjjlh0z5y6sqpdvwq7pwkeh5r0ka28ad +2025-11-10 15:38:35,bc1q04qcmmnmd47dc8mjn9fcvxcjdk5f6edg6f40mw,3FxCJ2XUyFEup66QDaxgcHF8az6P19wien +2025-11-10 12:17:32,bc1q7t4vyehjsexdme84qhdgd4dawcn54djh0m78fz,bc1qcu0wrnx0002g2ka4sr0nnrldtff6dvq433unh2 +2025-11-10 20:29:45,bc1q3ms0mj7jtt9nd5smhv50uvd27czetxxlnlzkuq,bc1qcu3a5u765fdhddzccjy3k02uzewarlwd7yyn82u0fv42508n650s6w5um7 +2025-11-10 00:28:09,bc1q6dadscrdytuwjeedk9fr80xwmnl5prqvhwy7aga4k3fmxwhzvf6shuzpmh,bc1qdfcstw5dcud0dusq3tscs5khczqc7esn7psa3p +2025-11-10 20:29:45,bc1qtkya7nnflevqx2tgjajycw2gjl0y4w7626lad0,1ACE9sy42uw8Tns84KjueMj8koezrcZRdG +2025-11-10 20:57:25,bc1qm34lsc65zpw79lxes69zkqmk6ee3ewf0j77s3h,bc1p52jjweup7chageaggu4cj8jl3avylha8zrr7lgqkth82tancdrxqlhnvzm +2025-11-10 05:47:18,bc1q3dc2ec45m8s49u9rlv9y8ruyr644utc3hv7ncm,bc1quf7hjdq99rldlyqmxaz9sd3unm6j5fv6yulty2 +2025-11-10 15:38:35,bc1qsh343fpz0mtlfl3k4xzu5qru0uprdyh786lfsu,38qZsSoHitwa7XDicxT8xewyXPu2VAvhhh +2025-11-10 17:23:00,bc1qtgvxhy2armx5xajkfmdks5h8y7cqmqgwwfdrp2,bc1qtgvxhy2armx5xajkfmdks5h8y7cqmqgwwfdrp2 +2025-11-10 15:34:09,bc1qr7whs7crqjpqshv7qk9ex4742wagf9gnznjulc,bc1q20n8jugfv9c224fdfxe4vgugyd2gh7uaytt9kc +2025-11-10 00:28:09,bc1qnfw0gkk05qxl38mslc69hc6vc64mksyw6zzxhg,bc1qjhv7fcemmjx4temc4d2z500jv5fud2y3rtwwg3 +2025-11-10 20:29:45,bc1q3jzyfvu60rc3um5rah7y6j3gks3m7jffqpw9ef,bc1qc29w7mwejcuklrhqf0e8l6zjys6x4sqzkns2u4 +2025-11-10 20:29:45,3J8dPt32vzUdQzvwXpicG2XFcaG96dnRZt,bc1qre4tzdx5r8ckzhn9ffrxwusvugdfvee9n0v5y7 +2025-11-10 21:05:57,bc1q2dzekmutn0s8wh5ty9kywgddcl7j796zju8aql,bc1q3u7r770vyc5v6hae8v5pdv846wq430jhdfz40j +2025-11-10 20:57:25,32bZeQ89m2oPeM6wLKeYdvzsPNBDb3bGAP,3Psvpa4LQtEf2tR9i3VJwGRgHQsFPJ8rf1 +2025-11-10 00:28:09,1VEmWQLu9iohP6RMmabnKcDJuCkyk3E85,bc1qchpdg4wnyaswyfggfatrrwz9snasrc92wgzhfy +2025-11-10 20:29:45,bc1qlw4565huuxsr03dz3sepexjv6ujmfy2amye98d,bc1q87s3wsnzdhlclqpymykpkdm44ryv66av5fv08q +2025-11-10 05:47:18,3FfS44EtZhTBb1XXQPXcjiVqxpub9gncz8,bc1qft2zpj0wl4zqghk4lad6qr9www4zrrseuv0y5e +2025-11-10 20:29:45,bc1qsggexuj2xdmne5kvj2mnu4ur2m2qjpwlyrtqtf,bc1qfyaje5au3xzwcdmt2ecct5xneywrqaf4p46m22 +2025-11-10 14:12:11,bc1q4k8t9a9jrzhcnlyretxgz4kqc5hlyruvra5q5p,bc1qlyy7f7cu2rptc9n42khv3lxrc3pzwp58aa8qlp +2025-11-10 00:28:09,bc1qlwjqwjugrv5c5wzg3hmtj9m72tqj5mnqeazrzy,bc1qlac25q65m2wjz9fjzqg382txhycjc495gs6ez2 diff --git a/python/tests/data/btc_dataset/flattened_data.csv.bz2 b/python/tests/data/btc_dataset/flattened_data.csv.bz2 new file mode 100644 index 0000000000..bc0f6e04d9 Binary files /dev/null and b/python/tests/data/btc_dataset/flattened_data.csv.bz2 differ diff --git a/python/tests/data/btc_dataset/flattened_data.csv.gz b/python/tests/data/btc_dataset/flattened_data.csv.gz new file mode 100644 index 0000000000..1b8abde845 Binary files /dev/null and b/python/tests/data/btc_dataset/flattened_data.csv.gz differ diff --git a/python/tests/data/btc_dataset/flattened_data.parquet b/python/tests/data/btc_dataset/flattened_data.parquet new file mode 100644 index 0000000000..2102af87e8 Binary files /dev/null and b/python/tests/data/btc_dataset/flattened_data.parquet differ diff --git a/python/tests/data/btc_dataset/malformed_files/extra_field.csv 
b/python/tests/data/btc_dataset/malformed_files/extra_field.csv new file mode 100644 index 0000000000..345e506560 --- /dev/null +++ b/python/tests/data/btc_dataset/malformed_files/extra_field.csv @@ -0,0 +1,2 @@ +block_timestamp,inputs_address,outputs_address +2025-11-10 00:28:09,bc1qabc,bc1qdef,EXTRA \ No newline at end of file diff --git a/python/tests/data/btc_dataset/malformed_files/impossible_date.csv b/python/tests/data/btc_dataset/malformed_files/impossible_date.csv new file mode 100644 index 0000000000..bafdb9c919 --- /dev/null +++ b/python/tests/data/btc_dataset/malformed_files/impossible_date.csv @@ -0,0 +1,2 @@ +block_timestamp,inputs_address,outputs_address +2025-99-99 99:99:99,bc1qabc,bc1qdef \ No newline at end of file diff --git a/python/tests/data/btc_dataset/malformed_files/impossible_date.parquet b/python/tests/data/btc_dataset/malformed_files/impossible_date.parquet new file mode 100644 index 0000000000..e37569cc64 Binary files /dev/null and b/python/tests/data/btc_dataset/malformed_files/impossible_date.parquet differ diff --git a/python/tests/data/btc_dataset/malformed_files/missing_field.csv b/python/tests/data/btc_dataset/malformed_files/missing_field.csv new file mode 100644 index 0000000000..ffd157fb94 --- /dev/null +++ b/python/tests/data/btc_dataset/malformed_files/missing_field.csv @@ -0,0 +1,2 @@ +block_timestamp,inputs_address,outputs_address +2025-11-10 00:28:09,bc1qabc \ No newline at end of file diff --git a/python/tests/data/btc_dataset/malformed_files/missing_field.parquet b/python/tests/data/btc_dataset/malformed_files/missing_field.parquet new file mode 100644 index 0000000000..deaaf7b6f5 Binary files /dev/null and b/python/tests/data/btc_dataset/malformed_files/missing_field.parquet differ diff --git a/python/tests/data/btc_dataset/malformed_files/missing_id_col.csv b/python/tests/data/btc_dataset/malformed_files/missing_id_col.csv new file mode 100644 index 0000000000..5e8f04854f --- /dev/null +++ b/python/tests/data/btc_dataset/malformed_files/missing_id_col.csv @@ -0,0 +1,2 @@ +block_timestamp,outputs_address +2025-11-10 00:28:09,bc1qdef \ No newline at end of file diff --git a/python/tests/data/btc_dataset/malformed_files/missing_id_col.parquet b/python/tests/data/btc_dataset/malformed_files/missing_id_col.parquet new file mode 100644 index 0000000000..66569327bf Binary files /dev/null and b/python/tests/data/btc_dataset/malformed_files/missing_id_col.parquet differ diff --git a/python/tests/data/btc_dataset/malformed_files/missing_prop_col.csv b/python/tests/data/btc_dataset/malformed_files/missing_prop_col.csv new file mode 100644 index 0000000000..54da35f624 --- /dev/null +++ b/python/tests/data/btc_dataset/malformed_files/missing_prop_col.csv @@ -0,0 +1,2 @@ +block_timestamp,inputs_address +2025-11-10 00:28:09,bc1qabc \ No newline at end of file diff --git a/python/tests/data/btc_dataset/malformed_files/missing_prop_col.parquet b/python/tests/data/btc_dataset/malformed_files/missing_prop_col.parquet new file mode 100644 index 0000000000..e05399d4b2 Binary files /dev/null and b/python/tests/data/btc_dataset/malformed_files/missing_prop_col.parquet differ diff --git a/python/tests/data/btc_dataset/malformed_files/missing_timestamp_col.csv b/python/tests/data/btc_dataset/malformed_files/missing_timestamp_col.csv new file mode 100644 index 0000000000..8ef862e666 --- /dev/null +++ b/python/tests/data/btc_dataset/malformed_files/missing_timestamp_col.csv @@ -0,0 +1,11 @@ +inputs_address,outputs_address 
+bc1q40a3a5xlwv0l4n4xs3mpvauscgkj78rcuhvrhs,bc1q9xh9exh34tq566nfvpwwupum6efh3yqc0aja5d +bc1q2zn0548tc0njq9xxt0uamxxh48s8t7c9dw0a45,3K1t5MthuwcyNn4BWHp4eFSyTY8ifDjTkk +bc1qxsp4nqu7zq424wdz4v7c6q8jylhtlskap86kpd,36jsyuJHY2Eqs48yjUb6xyzARSPYKKSrTF +bc1qvpah3cjguuwmjjgwp2g9t52f786jnw7km3jgau,34WkZeqyNBb5FXMxLZJfveHNm2kqngt4w4 +bc1q6xku4eta99267etxf84yy3qc2pyw04pq8dvqye,bc1q6xku4eta99267etxf84yy3qc2pyw04pq8dvqye +bc1q50uxvgu5paqj22fg27s352wggev70xs9f8lhnk,bc1qxgxf44zkxrz8ld33kjvf85ekmmkcaexs7xkfyg +bc1q5v45pufhugrqmp9a7taq92y4uqu8awpqcsrjq2,33bdcPeLq1Mk8TrHZ6sPwjpNbfHf6Y5UCg +bc1qhkar0wasylsfhwamp8d023e2tqzcjfzjlx6wrd,bc1q4mhkgdfmattuya75rescq5u3ms4u3m56x8nzcz +bc1qw8wrek2m7nlqldll66ajnwr9mh64syvkt67zlu,bc1qxwea59rqnldu5hl2zg3pt9c0av7eee4dfjnj7u +bc1qn5yd54kvaa60jqw0ncx93ru7ctlk0727653ynz,bc1qehhvcqru3k3qfn95r9w3nf8jxnlflnw4d9duah diff --git a/python/tests/data/btc_dataset/malformed_files/missing_timestamp_col.parquet b/python/tests/data/btc_dataset/malformed_files/missing_timestamp_col.parquet new file mode 100644 index 0000000000..1edb128ead Binary files /dev/null and b/python/tests/data/btc_dataset/malformed_files/missing_timestamp_col.parquet differ diff --git a/python/tests/data/btc_dataset/malformed_files/null_id.csv b/python/tests/data/btc_dataset/malformed_files/null_id.csv new file mode 100644 index 0000000000..0dbc5578ba --- /dev/null +++ b/python/tests/data/btc_dataset/malformed_files/null_id.csv @@ -0,0 +1,2 @@ +block_timestamp,inputs_address,outputs_address +2025-11-10 00:28:09,,bc1qdef \ No newline at end of file diff --git a/python/tests/data/btc_dataset/malformed_files/null_id.parquet b/python/tests/data/btc_dataset/malformed_files/null_id.parquet new file mode 100644 index 0000000000..bfe976cb3b Binary files /dev/null and b/python/tests/data/btc_dataset/malformed_files/null_id.parquet differ diff --git a/python/tests/data/btc_dataset/malformed_files/null_timestamp.csv b/python/tests/data/btc_dataset/malformed_files/null_timestamp.csv new file mode 100644 index 0000000000..686da66859 --- /dev/null +++ b/python/tests/data/btc_dataset/malformed_files/null_timestamp.csv @@ -0,0 +1,2 @@ +block_timestamp,inputs_address,outputs_address +,bc1qabc,bc1qdef \ No newline at end of file diff --git a/python/tests/data/btc_dataset/malformed_files/null_timestamp.parquet b/python/tests/data/btc_dataset/malformed_files/null_timestamp.parquet new file mode 100644 index 0000000000..b775f4d7ae Binary files /dev/null and b/python/tests/data/btc_dataset/malformed_files/null_timestamp.parquet differ diff --git a/python/tests/data/btc_dataset/malformed_files/out_of_range_timestamp.csv b/python/tests/data/btc_dataset/malformed_files/out_of_range_timestamp.csv new file mode 100644 index 0000000000..4b41edc6a4 --- /dev/null +++ b/python/tests/data/btc_dataset/malformed_files/out_of_range_timestamp.csv @@ -0,0 +1,2 @@ +block_timestamp,inputs_address,outputs_address +999999999999999999999,bc1qabc,bc1qdef \ No newline at end of file diff --git a/python/tests/data/btc_dataset/malformed_files/out_of_range_timestamp.parquet b/python/tests/data/btc_dataset/malformed_files/out_of_range_timestamp.parquet new file mode 100644 index 0000000000..32682d4893 Binary files /dev/null and b/python/tests/data/btc_dataset/malformed_files/out_of_range_timestamp.parquet differ diff --git a/python/tests/data/btc_dataset/malformed_files/semicolon_delimiter.csv b/python/tests/data/btc_dataset/malformed_files/semicolon_delimiter.csv new file mode 100644 index 0000000000..d02492edf6 --- /dev/null +++ 
b/python/tests/data/btc_dataset/malformed_files/semicolon_delimiter.csv @@ -0,0 +1,2 @@ +block_timestamp;inputs_address;outputs_address +2025-11-10 00:28:09;bc1qabc;bc1qdef \ No newline at end of file diff --git a/python/tests/data/btc_dataset/malformed_files/timestamp_malformed.csv b/python/tests/data/btc_dataset/malformed_files/timestamp_malformed.csv new file mode 100644 index 0000000000..9e3ab079f5 --- /dev/null +++ b/python/tests/data/btc_dataset/malformed_files/timestamp_malformed.csv @@ -0,0 +1,11 @@ +block_timestamp,inputs_address +not-a-timestamp,bc1q40a3a5xlwv0l4n4xs3mpvauscgkj78rcuhvrhs +not-a-timestamp,bc1q2zn0548tc0njq9xxt0uamxxh48s8t7c9dw0a45 +not-a-timestamp,bc1qxsp4nqu7zq424wdz4v7c6q8jylhtlskap86kpd +not-a-timestamp,bc1qvpah3cjguuwmjjgwp2g9t52f786jnw7km3jgau +not-a-timestamp,bc1q6xku4eta99267etxf84yy3qc2pyw04pq8dvqye +not-a-timestamp,bc1q50uxvgu5paqj22fg27s352wggev70xs9f8lhnk +not-a-timestamp,bc1q5v45pufhugrqmp9a7taq92y4uqu8awpqcsrjq2 +not-a-timestamp,bc1qhkar0wasylsfhwamp8d023e2tqzcjfzjlx6wrd +not-a-timestamp,bc1qw8wrek2m7nlqldll66ajnwr9mh64syvkt67zlu +not-a-timestamp,bc1qn5yd54kvaa60jqw0ncx93ru7ctlk0727653ynz diff --git a/python/tests/data/btc_dataset/malformed_files/timestamp_malformed.parquet b/python/tests/data/btc_dataset/malformed_files/timestamp_malformed.parquet new file mode 100644 index 0000000000..6547644427 Binary files /dev/null and b/python/tests/data/btc_dataset/malformed_files/timestamp_malformed.parquet differ diff --git a/python/tests/data/btc_dataset/mixed_directory/part1.parquet b/python/tests/data/btc_dataset/mixed_directory/part1.parquet new file mode 100644 index 0000000000..1d361552a3 Binary files /dev/null and b/python/tests/data/btc_dataset/mixed_directory/part1.parquet differ diff --git a/python/tests/data/btc_dataset/mixed_directory/part2.csv b/python/tests/data/btc_dataset/mixed_directory/part2.csv new file mode 100644 index 0000000000..b606b24e44 --- /dev/null +++ b/python/tests/data/btc_dataset/mixed_directory/part2.csv @@ -0,0 +1,51 @@ +block_timestamp,inputs_address,outputs_address +2025-11-10 17:59:43,bc1qthv7smt28rt2ctyg5ec3vkvx0hl4ckks9m75c3,bc1q2g6nxqvv2s09plap4x4j6pgnc2y42lmjy8fpxf +2025-11-10 20:29:45,bc1qt9nvg8mwuqmjlruc8z9m4kns2hd3gk2ylsdryp,3JDMdeEtsEfnABR6AYaHNU8fy9B7uDJduH +2025-11-10 12:17:32,bc1q4996ykxqxnhey7sh9fzvn3rre3mswta69myft7,bc1qlxsa2yndh2xt4rpmw5pt7eqaelkzd54psvx4xa +2025-11-10 05:47:18,bc1q449ruh9ga8pzd4uzds83zqckwp6yqxfvra7fvq645ev7sqefj9jsdkpegd,bc1p5xpw59psu6yyzmee8s8ptqy6gvpfls0zsegp5krw0x6dp2mtru2qcxq0tk +2025-11-10 12:17:32,3FXf66gb6NNMA52WSBBjJyMmJ3t75hLRDM,1EHF3NhZPLWEaEppSHqK5AVGB8zX5xTXEm +2025-11-10 17:23:00,bc1q4pmrg0q6ywlsd4vutmmv65g7n4mgp7tf2hz9gx,3Hqj4K5kofSnRa8MjJPt43yw5gd5f5sdKG +2025-11-10 14:59:57,bc1q3aus5ka7pq2wcjuesxqlaqwhxgckxxlx6t6ejy,bc1qwl8jvjvkyeun7wj09xkcpwt4g7sqajlzush8j2 +2025-11-10 15:38:35,bc1qh2cwxv899psevx5yhr9cvuvqpcwqr3ej4ns87a,bc1qkuwtp76a89y40nvp6235xh9u5rpumv0arpm8rq +2025-11-10 14:12:11,bc1qa7a3ndesd0n2srxzdhnkksu0hkc96rp5uyyq95,bc1q3e0xr3pqz0rad4uakctg7j2crnf3v53na9c567 +2025-11-10 14:12:11,bc1qvxs7r6w2z3aguqzdecql7t5vkr44q9gccgetyn,bc1q0qdfdjjp9w872r406tlff34tdy9wa7wvdgutd9 +2025-11-10 14:56:34,bc1qsw6zezpduq9985vxs48uhffv82jzxgl8rm5mpd,bc1qsw6zezpduq9985vxs48uhffv82jzxgl8rm5mpd +2025-11-10 05:47:18,bc1q2alxcv39eku3cvctdudkqf2y2fvqu2t49wlrj2,bc1q2alxcv39eku3cvctdudkqf2y2fvqu2t49wlrj2 +2025-11-10 12:17:32,bc1qm34lsc65zpw79lxes69zkqmk6ee3ewf0j77s3h,bc1qkqd2ewnd8mw358qw24vmwgepz8n8cta0y2pe97 +2025-11-10 
05:47:18,1AGD42b91ycvAcMrCCQ1G5iiBZooPP9xqX,1AGD42b91ycvAcMrCCQ1G5iiBZooPP9xqX +2025-11-10 20:57:25,bc1qmv76fxychuzcuvt3uy5uy3seu6jvxrsfz4993w,14s7DCukciJLAFJsYDzPK7ZfFP2thzU1Yt +2025-11-10 14:56:34,35C2L1pCgwzBHNcDcVL1a5RuoefeWqyjAR,bc1qhfsad8wuvke9eczj5g4e287hz93g7t8nwn9gxj +2025-11-10 15:34:09,3HYjQZaytMNixSZwU6Dkd7QBQhs2yFcp3E,14BuunTtFx9HMvgJzJQnxR5VNgjXCKckSv +2025-11-10 21:05:57,bc1qx54ge94tzqtfjzqy0dm9z9q6yqdm75v3ed8g39,166dELRZqWwRjXXVdbahWLi7muS9A2jjJv +2025-11-10 14:59:57,bc1pthxvxwdparr9pg5s9dxl4s4u3f744qwn9atl3gcltvjx76hyg4kqx230gh,bc1pthxvxwdparr9pg5s9dxl4s4u3f744qwn9atl3gcltvjx76hyg4kqx230gh +2025-11-10 14:59:57,38Vfgc9RT5EptuLMQTrnJ65eZgQtDpneom,bc1qeq2flx8mfj3gn3j58uwaj5u2xzpt8qmvet46nl +2025-11-10 14:12:11,bc1q7cyrfmck2ffu2ud3rn5l5a8yv6f0chkp0zpemf,bc1qx0m6mzl4756vwg23jxkdpew03wwfze856phhxl +2025-11-10 17:23:00,bc1qkqw4qj5gplgkns6ysg6xd6986a4xrwpg9fsy36,bc1q9stxzl5x02rrq0cfmlfh4epumn6zvq6asa9n0u +2025-11-10 00:28:09,bc1q6nlqnu6jad5fygq7v5kua2zpr2h5x288szq5km,35AtUZgvWh9uAHhX6fMidJcbVTJTjC4Mcs +2025-11-10 14:56:34,bc1qm2safrrv8e6696sgtksv36yg066qcafc39mu3y,34Yq1C3TS1V5S8w3CNx3SN2W3CdjoGu9QN +2025-11-10 14:12:11,1GbUhB5Aub17yjLaFf1fPNFN8icPzXn3sU,bc1qw3qld7muf6feg5jlypcdfrmq4hzanx92xagswe +2025-11-10 05:47:18,1NfJSiqBw4fb74KgVQrPsk5W5aqitAD1Xv,1L6yJi7TcjztgX7W8ds1zz2NGEnT7GdoAz +2025-11-10 05:47:18,bc1qd9gg9qhkswtp26kujvc29zc6daj4yzv6qsgur3,3MqsziNKsuFwM9unjhAsKYZCQ7mNfYTSv5 +2025-11-10 20:57:25,bc1qdvdt8vjn4lpep3hr6kr90uwmjr7rf906ac5lxn,bc1qqz5ll7nwghwjg9e9d0wjpx54cs5rc6uu48emtf +2025-11-10 12:17:32,37jdMXYbvg3dKzJ4pGSYiABiXoBy4putZq,19WpQ6KYi2SGKRnsgkeX5ALgZjgSMPiqmu +2025-11-10 15:34:09,bc1qz9t7rmkf3wn3yuz48e0fhg8dvzf7hsmf3a6at9,3R1Ku4KBj6y9ekKkCJXnrVkUzGNjZtEXah +2025-11-10 15:38:35,bc1qmkavqx59gg4aucmctctww3nve6x9keln9s07zz,1HzijFYeyDDUKym1siU5hBiuyu9WVRPcVj +2025-11-10 20:57:25,bc1qkplg5aqltwln2ks4shezddemdffh9pmt73xnpd,bc1qgx3c07j5enqz0pfwpkchpx6cwh690zst9l4dz0 +2025-11-10 00:28:09,bc1qt2xxqsy0tnvcvml47t4ugrvm8p8h9skkv886uv,bc1q2sa6jadt6gry89778csh2lmu2xaw2javvq9srn +2025-11-10 20:57:25,bc1qlpygchhl3j07mh9z7gcsqzjfapyyurm9amvhh8,bc1q0z4v89tecuy5e5cr3hdj6ts8zz0ky79mmattsv +2025-11-10 21:05:57,15QKr87rKdX8g7kmJ2DBWEgbWiGrnxBTnM,3Ae1fKTuLvPNTqjAsuXznf6cFySEtcArho +2025-11-10 15:34:09,bc1q2z994nye47fnwxxy4nwukfg9kkq0m5xe7hxpq2,bc1q4saygf3hk4cl0hej8e5rr2wpdza8zga6fqshqt8mrlzgpt348chqgs532d +2025-11-10 20:57:25,bc1qagt6ng896jhghzuxhqzrcmkn0nq7tzpjejghgz,bc1q3rqkcktkqnem5yfzmrzya289m2xjl9vl37fgmxdwkhl3n7f9l35qk2u36p +2025-11-10 00:28:09,bc1qwthqxlv39qwxt6j5c5zdhxprjfkpy4qgre6nvl,bc1qc3h9tdncgalkv3yeaw5fxsn0ktdavxmvzmcm09ge4k59psxxx60qmwcmx6 +2025-11-10 20:57:25,bc1qlll42hhmtn7pwz6srexps6v8zm2tqp2p7tx4pt,bc1qzheza7hkc4jyp67vhtgrkaxpddwl3z0eada6fe +2025-11-10 14:12:11,bc1pah7lzc5rcms23lvnsrsj68atcagcg25j6kzlk7aytjtludwrl5lqclvw3t,bc1pmspgjsaxqfdvzkg7du7sgpclldwf0fajrkn36qgkrat99ugjz2ushjvq57 +2025-11-10 18:01:50,bc1qmgv6leqa0rpu40ycpndxg9hggrqdxcdgtmvp34,18N8SP9Ui6WKLtDWks6DfwMMMeJT4W5TDB +2025-11-10 17:23:00,bc1q65h44f9taql5ew5nzd2xzkmq35wcag2uugvj82,bc1qhzvug7trqm7n36g5azdqa968rl9ww5rvwzt69w +2025-11-10 20:57:25,bc1qrgl0yu3zuglvcgrdsglc5a8zdsud6n023naqaz,bc1qlg7shfvat4p0fg0n9rzap3qxryu45sljuxy4qp +2025-11-10 14:56:34,bc1q4dd592zay7mpw9arzj02pr4hr6e5yj603r29sy,bc1qepmkx8n0flmw3qq9puwwj0rpeq4atxe8tktef2 +2025-11-10 17:23:00,bc1qp68ctlw77lkyqjyxfn200ergk079jqpxny4yj2,bc1qp68ctlw77lkyqjyxfn200ergk079jqpxny4yj2 +2025-11-10 14:12:11,bc1qa3tac82ncmpd34cy2g7a0xvhm2eh9wec5dcnl0,bc1q6f3kdsu9jxvk4uv62nzmdfcmy0pu6k7y7p7w3f +2025-11-10 
17:59:43,bc1qdkkaprszrgxmn6umkrs7fzufpzrefdzz7p5heu,bc1qc2qw3a5m035jcaughewp8w2mst0w788fym5fmn +2025-11-10 13:42:54,bc1qryhgpmfv03qjhhp2dj8nw8g4ewg08jzmgy3cyx,bc1q08hcxrtxl28erd7tmevja57u8af76at6qukmfu +2025-11-10 14:56:34,bc1qjq2l9469dqklhejlvkr3va9qrd343mwrtzu4v4,bc1qlnrcexfn0z6n64zq8jj7ulful6y76gs5yvme3w +2025-11-10 20:29:45,bc1qkhl6epe73jec02jv3sp2lvdfhppyg874wxs9l9,bc1q7t4vyehjsexdme84qhdgd4dawcn54djh0m78fz diff --git a/python/tests/data/btc_dataset/mixed_directory/part3.parquet b/python/tests/data/btc_dataset/mixed_directory/part3.parquet new file mode 100644 index 0000000000..ac023bec11 Binary files /dev/null and b/python/tests/data/btc_dataset/mixed_directory/part3.parquet differ diff --git a/python/tests/data/btc_dataset/mixed_directory/part4.csv.gz b/python/tests/data/btc_dataset/mixed_directory/part4.csv.gz new file mode 100644 index 0000000000..210234f12b Binary files /dev/null and b/python/tests/data/btc_dataset/mixed_directory/part4.csv.gz differ diff --git a/python/tests/data/btc_dataset/parquet_directory/part1.parquet b/python/tests/data/btc_dataset/parquet_directory/part1.parquet new file mode 100644 index 0000000000..1d361552a3 Binary files /dev/null and b/python/tests/data/btc_dataset/parquet_directory/part1.parquet differ diff --git a/python/tests/data/btc_dataset/parquet_directory/part2.parquet b/python/tests/data/btc_dataset/parquet_directory/part2.parquet new file mode 100644 index 0000000000..a21974c7ac Binary files /dev/null and b/python/tests/data/btc_dataset/parquet_directory/part2.parquet differ diff --git a/python/tests/data/btc_dataset/parquet_directory/part3.parquet b/python/tests/data/btc_dataset/parquet_directory/part3.parquet new file mode 100644 index 0000000000..ac023bec11 Binary files /dev/null and b/python/tests/data/btc_dataset/parquet_directory/part3.parquet differ diff --git a/python/tests/data/btc_dataset/parquet_directory/part4.parquet b/python/tests/data/btc_dataset/parquet_directory/part4.parquet new file mode 100644 index 0000000000..884b3fe402 Binary files /dev/null and b/python/tests/data/btc_dataset/parquet_directory/part4.parquet differ diff --git a/python/tests/test_base_install/test_graphdb/test_graphdb.py b/python/tests/test_base_install/test_graphdb/test_graphdb.py index 51db3f2c56..6d4ab07448 100644 --- a/python/tests/test_base_install/test_graphdb/test_graphdb.py +++ b/python/tests/test_base_install/test_graphdb/test_graphdb.py @@ -2941,7 +2941,7 @@ def test_NaN_NaT_as_properties(): df = pd.DataFrame(data) g = Graph() - g.load_nodes_from_pandas(time="time", id="id", df=df, properties=["floats"]) + g.load_nodes(time="time", id="id", data=df, properties=["floats"]) @with_disk_graph def check(g): diff --git a/python/tests/test_ingestion_equivalence_df.py b/python/tests/test_ingestion_equivalence_df.py index 5c04b6d728..823a5f3c5c 100644 --- a/python/tests/test_ingestion_equivalence_df.py +++ b/python/tests/test_ingestion_equivalence_df.py @@ -51,18 +51,7 @@ def dataframes(): def test_edge_ingestion_equivalence(dataframes, graph_type): # reference graph g_pd = graph_type() - g_pd.load_edges_from_pandas( - df=dataframes["pandas"]["edges"], - time="timestamp", - src="source", - dst="destination", - properties=["data_size_MB", "transaction_type"], - metadata=["is_encrypted"], - ) - - # Pandas streaming - g_pd_stream = graph_type() - g_pd_stream.load_edges_from_df( + g_pd.load_edges( data=dataframes["pandas"]["edges"], time="timestamp", src="source", @@ -70,13 +59,10 @@ def test_edge_ingestion_equivalence(dataframes, graph_type): 
properties=["data_size_MB", "transaction_type"], metadata=["is_encrypted"], ) - assert ( - g_pd == g_pd_stream - ), "Pandas streaming edge ingestion failed equivalence check" # Polars g_pl = graph_type() - g_pl.load_edges_from_df( + g_pl.load_edges( data=dataframes["polars"]["edges"], time="timestamp", src="source", @@ -88,7 +74,7 @@ def test_edge_ingestion_equivalence(dataframes, graph_type): # Arrow g_arrow = graph_type() - g_arrow.load_edges_from_df( + g_arrow.load_edges( data=dataframes["arrow"]["edges"], time="timestamp", src="source", @@ -100,7 +86,7 @@ def test_edge_ingestion_equivalence(dataframes, graph_type): # DuckDB g_duckdb = graph_type() - g_duckdb.load_edges_from_df( + g_duckdb.load_edges( data=dataframes["duckdb"]["edges"], time="timestamp", src="source", @@ -113,7 +99,7 @@ def test_edge_ingestion_equivalence(dataframes, graph_type): if fpd: # FireDucks g_fd = graph_type() - g_fd.load_edges_from_df( + g_fd.load_edges( data=dataframes["fireducks"]["edges"], time="timestamp", src="source", @@ -128,30 +114,17 @@ def test_edge_ingestion_equivalence(dataframes, graph_type): def test_node_ingestion_equivalence(dataframes, graph_type): # reference graph g_pd = graph_type() - g_pd.load_nodes_from_pandas( - df=dataframes["pandas"]["nodes"], - time="timestamp", - id="server_id", - properties=["OS_version", "uptime_days"], - metadata=["primary_function", "server_name", "hardware_type"], - ) - - # Pandas streaming - g_pd_stream = graph_type() - g_pd_stream.load_nodes_from_df( + g_pd.load_nodes( data=dataframes["pandas"]["nodes"], time="timestamp", id="server_id", properties=["OS_version", "uptime_days"], metadata=["primary_function", "server_name", "hardware_type"], ) - assert ( - g_pd == g_pd_stream - ), "Pandas streaming node ingestion failed equivalence check" # Polars g_pl = graph_type() - g_pl.load_nodes_from_df( + g_pl.load_nodes( data=dataframes["polars"]["nodes"], time="timestamp", id="server_id", @@ -162,7 +135,7 @@ def test_node_ingestion_equivalence(dataframes, graph_type): # Arrow g_arrow = graph_type() - g_arrow.load_nodes_from_df( + g_arrow.load_nodes( data=dataframes["arrow"]["nodes"], time="timestamp", id="server_id", @@ -173,7 +146,7 @@ def test_node_ingestion_equivalence(dataframes, graph_type): # DuckDB g_duckdb = graph_type() - g_duckdb.load_nodes_from_df( + g_duckdb.load_nodes( data=dataframes["duckdb"]["nodes"], time="timestamp", id="server_id", @@ -186,7 +159,7 @@ def test_node_ingestion_equivalence(dataframes, graph_type): # FireDucks print("Testing fireducks...") g_fd = graph_type() - g_fd.load_nodes_from_df( + g_fd.load_nodes( data=dataframes["fireducks"]["nodes"], time="timestamp", id="server_id", @@ -200,79 +173,50 @@ def test_node_ingestion_equivalence(dataframes, graph_type): def test_metadata_update_equivalence(dataframes, graph_type): # reference graph g_pd = graph_type() - g_pd.load_edges_from_pandas( - df=dataframes["pandas"]["edges"], - time="timestamp", - src="source", - dst="destination", - ) - g_pd.load_nodes_from_pandas( - df=dataframes["pandas"]["nodes"], - time="timestamp", - id="server_id", - ) - # update metadata - g_pd.load_node_props_from_pandas( - df=dataframes["pandas"]["nodes"], - id="server_id", - metadata=["primary_function", "server_name", "hardware_type"], - ) - g_pd.load_edge_props_from_pandas( - df=dataframes["pandas"]["edges"], - src="source", - dst="destination", - metadata=["is_encrypted"], - ) - - # Pandas streaming - g_pd_stream = graph_type() - g_pd_stream.load_edges_from_df( + g_pd.load_edges( 
data=dataframes["pandas"]["edges"], time="timestamp", src="source", dst="destination", ) - g_pd_stream.load_nodes_from_df( + g_pd.load_nodes( data=dataframes["pandas"]["nodes"], time="timestamp", id="server_id", ) # update metadata - g_pd_stream.load_node_metadata_from_df( + g_pd.load_node_metadata( data=dataframes["pandas"]["nodes"], id="server_id", metadata=["primary_function", "server_name", "hardware_type"], ) - g_pd_stream.load_edge_metadata_from_df( + g_pd.load_edge_metadata( data=dataframes["pandas"]["edges"], src="source", dst="destination", metadata=["is_encrypted"], ) - assert ( - g_pd == g_pd_stream - ), "Pandas streaming metadata ingestion failed equivalence check" # Polars g_pl = graph_type() - g_pl.load_edges_from_df( + g_pl.load_edges( data=dataframes["polars"]["edges"], time="timestamp", src="source", dst="destination", ) - g_pl.load_nodes_from_df( + g_pl.load_nodes( data=dataframes["polars"]["nodes"], time="timestamp", id="server_id", ) # update metadata - g_pl.load_node_metadata_from_df( + g_pl.load_node_metadata( data=dataframes["polars"]["nodes"], id="server_id", metadata=["primary_function", "server_name", "hardware_type"], ) - g_pl.load_edge_metadata_from_df( + g_pl.load_edge_metadata( data=dataframes["polars"]["edges"], src="source", dst="destination", @@ -282,24 +226,24 @@ def test_metadata_update_equivalence(dataframes, graph_type): # Arrow g_arrow = graph_type() - g_arrow.load_edges_from_df( + g_arrow.load_edges( data=dataframes["arrow"]["edges"], time="timestamp", src="source", dst="destination", ) - g_arrow.load_nodes_from_df( + g_arrow.load_nodes( data=dataframes["arrow"]["nodes"], time="timestamp", id="server_id", ) # update metadata - g_arrow.load_node_metadata_from_df( + g_arrow.load_node_metadata( data=dataframes["arrow"]["nodes"], id="server_id", metadata=["primary_function", "server_name", "hardware_type"], ) - g_arrow.load_edge_metadata_from_df( + g_arrow.load_edge_metadata( data=dataframes["arrow"]["edges"], src="source", dst="destination", @@ -309,24 +253,24 @@ def test_metadata_update_equivalence(dataframes, graph_type): # DuckDB g_duckdb = graph_type() - g_duckdb.load_edges_from_df( + g_duckdb.load_edges( data=dataframes["duckdb"]["edges"], time="timestamp", src="source", dst="destination", ) - g_duckdb.load_nodes_from_df( + g_duckdb.load_nodes( data=dataframes["duckdb"]["nodes"], time="timestamp", id="server_id", ) # update metadata - g_duckdb.load_node_metadata_from_df( + g_duckdb.load_node_metadata( data=dataframes["duckdb"]["nodes"], id="server_id", metadata=["primary_function", "server_name", "hardware_type"], ) - g_duckdb.load_edge_metadata_from_df( + g_duckdb.load_edge_metadata( data=dataframes["duckdb"]["edges"], src="source", dst="destination", @@ -337,24 +281,24 @@ def test_metadata_update_equivalence(dataframes, graph_type): if fpd: # FireDucks g_fd = graph_type() - g_fd.load_edges_from_df( + g_fd.load_edges( data=dataframes["fireducks"]["edges"], time="timestamp", src="source", dst="destination", ) - g_fd.load_nodes_from_df( + g_fd.load_nodes( data=dataframes["fireducks"]["nodes"], time="timestamp", id="server_id", ) # update metadata - g_fd.load_node_metadata_from_df( + g_fd.load_node_metadata( data=dataframes["fireducks"]["nodes"], id="server_id", metadata=["primary_function", "server_name", "hardware_type"], ) - g_fd.load_edge_metadata_from_df( + g_fd.load_edge_metadata( data=dataframes["fireducks"]["edges"], src="source", dst="destination", diff --git a/python/tests/test_load_from_df.py b/python/tests/test_load_from_df.py index 
968876d597..16ec8ffd2a 100644 --- a/python/tests/test_load_from_df.py +++ b/python/tests/test_load_from_df.py @@ -1,5 +1,9 @@ +from pathlib import Path + import polars as pl -from raphtory import Graph, PersistentGraph +import pandas as pd +import pyarrow as pa +from raphtory import Graph, PersistentGraph, PropType import pytest try: @@ -24,12 +28,12 @@ def test_load_edges_from_polars_df(graph_type): ) g_to_pandas = graph_type() - g_to_pandas.load_edges_from_pandas( - df=df.to_pandas(), time="time", src="src", dst="dst", properties=["value"] + g_to_pandas.load_edges( + data=df.to_pandas(), time="time", src="src", dst="dst", properties=["value"] ) g_from_df = graph_type() - g_from_df.load_edges_from_df( + g_from_df.load_edges( data=df, time="time", src="src", dst="dst", properties=["value"] ) @@ -38,6 +42,683 @@ def test_load_edges_from_polars_df(graph_type): assert _collect_edges(g_to_pandas) == expected assert _collect_edges(g_from_df) == expected +def test_different_data_sources(): + nodes_list = [] + + ######### PARQUET ######### + parquet_dir_path_str = str(_btc_root() / "parquet_directory") + parquet_file_path_str = str(_btc_root() / "flattened_data.parquet") + # test path string for parquet file + g = Graph() + g.load_nodes(data=parquet_file_path_str, time="block_timestamp", id="inputs_address") + nodes_list.append(sorted(g.nodes.id.collect())) + del g + + # test Path object for parquet file + file_path_obj = Path(parquet_file_path_str) + g = Graph() + g.load_nodes(data=file_path_obj, time="block_timestamp", id="inputs_address") + nodes_list.append(sorted(g.nodes.id.collect())) + del g + + # test path string for parquet directory + g = Graph() + g.load_nodes(data=parquet_dir_path_str, time="block_timestamp", id="inputs_address") + nodes_list.append(sorted(g.nodes.id.collect())) + del g + + # test Path object for parquet directory + dir_path_obj = Path(parquet_dir_path_str) + g = Graph() + g.load_nodes(data=dir_path_obj, time="block_timestamp", id="inputs_address") + nodes_list.append(sorted(g.nodes.id.collect())) + del g + + ######### CSV ######### + csv_dir_path_str = str(_btc_root() / "csv_directory") + csv_file_path_str = str(_btc_root() / "flattened_data.csv") + # test path string for CSV file + g = Graph() + g.load_nodes(data=csv_file_path_str, time="block_timestamp", id="inputs_address") + nodes_list.append(sorted(g.nodes.id.collect())) + del g + + # test Path object for CSV file + file_path_obj = Path(csv_file_path_str) + g = Graph() + g.load_nodes(data=file_path_obj, time="block_timestamp", id="inputs_address") + nodes_list.append(sorted(g.nodes.id.collect())) + del g + + # test path string for bz2 compressed CSV file + g = Graph() + compressed_file_path = csv_file_path_str + ".bz2" + g.load_nodes(data=compressed_file_path, time="block_timestamp", id="inputs_address") + nodes_list.append(sorted(g.nodes.id.collect())) + del g + + # test Path object for bz2 compressed CSV file + file_path_obj = Path(compressed_file_path) + g = Graph() + g.load_nodes(data=file_path_obj, time="block_timestamp", id="inputs_address") + nodes_list.append(sorted(g.nodes.id.collect())) + del g + + # test path string for gzip compressed CSV file + g = Graph() + compressed_file_path = csv_file_path_str + ".gz" + g.load_nodes(data=compressed_file_path, time="block_timestamp", id="inputs_address") + nodes_list.append(sorted(g.nodes.id.collect())) + del g + + # test Path object for gzip compressed CSV file + file_path_obj = Path(compressed_file_path) + g = Graph() + g.load_nodes(data=file_path_obj, 
time="block_timestamp", id="inputs_address") + nodes_list.append(sorted(g.nodes.id.collect())) + del g + + # test path string for CSV directory + g = Graph() + g.load_nodes(data=csv_dir_path_str, time="block_timestamp", id="inputs_address") + nodes_list.append(sorted(g.nodes.id.collect())) + del g + + # test Path object for CSV directory + dir_path_obj = Path(csv_dir_path_str) + g = Graph() + g.load_nodes(data=dir_path_obj, time="block_timestamp", id="inputs_address") + nodes_list.append(sorted(g.nodes.id.collect())) + del g + + ######### mixed directory ######### + mixed_dir_path_str = str(Path(__file__).parent) + "/data/btc_dataset/mixed_directory" + # test path string + g = Graph() + g.load_nodes(data=mixed_dir_path_str, time="block_timestamp", id="inputs_address") + nodes_list.append(sorted(g.nodes.id.collect())) + del g + + # test Path object + g = Graph() + g.load_nodes(data=Path(mixed_dir_path_str), time="block_timestamp", id="inputs_address") + nodes_list.append(sorted(g.nodes.id.collect())) + del g + + ######### arrow_c_stream ######### + # test pandas + df_pd = pd.read_parquet(parquet_file_path_str) + g = Graph() + g.load_nodes(data=df_pd, time="block_timestamp", id="inputs_address") + nodes_list.append(sorted(g.nodes.id.collect())) + del g, df_pd + + # test polars + df_pl = pl.read_parquet(parquet_file_path_str) + g = Graph() + g.load_nodes(data=df_pl, time="block_timestamp", id="inputs_address") + nodes_list.append(sorted(g.nodes.id.collect())) + del g, df_pl + + # sanity check, make sure we ingested the same nodes each time + print(f"Number of tests ran: {len(nodes_list)}") + for i in range(len(nodes_list)-1): + assert nodes_list[0] == nodes_list[i+1], f"Nodes list assertion failed at item i={i}" + +def test_schema_casting(): + # time/id as regular ints (I64), value column as explicit int32 + df = pd.DataFrame( + { + "time": pd.Series([1, 2, 3], dtype="int64"), + "id": pd.Series([10, 20, 30], dtype="int64"), + "val_i32": pd.Series([1, 2, 3], dtype="int32"), + } + ) + g = Graph() + # No casting + g.load_nodes( + data=df, + time="time", + id="id", + properties=["val_i32"], + ) + n_prop_dtype = g.node(10).properties.get_dtype_of("val_i32") + assert n_prop_dtype == PropType.i32() + del g, n_prop_dtype + + # Cast the val_i32 column to I64 using PropType.i64() + g = Graph() + g.load_nodes( + data=df, + time="time", + id="id", + properties=["val_i32"], + schema=[("val_i32", PropType.i64())], + ) + n_prop_dtype = g.node(10).properties.get_dtype_of("val_i32") + assert n_prop_dtype == PropType.i64() + del g, n_prop_dtype + + # Cast the val_i32 column to I64 using PyArrow int64 DataType + g = Graph() + g.load_nodes( + data=df, + time="time", + id="id", + properties=["val_i32"], + schema=[("val_i32", pa.int64())], + ) + n_prop_dtype = g.node(10).properties.get_dtype_of("val_i32") + assert n_prop_dtype == PropType.i64() + + +def test_list_schema_casting(): + table = pa.Table.from_pydict( + { + "time": pa.array([1, 2, 3], type=pa.int64()), + "id": pa.array([10, 20, 30], type=pa.int64()), + "val_list_i32": pa.array( + [[1, 2], [3, 4], [5, 6]], + type=pa.list_(pa.int32()), + ), + } + ) + + # No casting + g = Graph() + g.load_nodes(data=table, time="time", id="id", properties=["val_list_i32"]) + n_prop_dtype = g.node(10).properties.get_dtype_of("val_list_i32") + assert n_prop_dtype == PropType.list(PropType.i32()) + del g, n_prop_dtype + + # Cast the val_list_i32 column to I64 using PropType.list(PropType.i64()) + g = Graph() + g.load_nodes( + data=table, + time="time", + id="id", + 
properties=["val_list_i32"], + schema=[("val_list_i32", PropType.list(PropType.i64()))], + ) + n_prop_dtype = g.node(10).properties.get_dtype_of("val_list_i32") + assert n_prop_dtype == PropType.list(PropType.i64()) + del g, n_prop_dtype + + # Cast the val_list_i32 column to I64 using PyArrow list DataType + g = Graph() + g.load_nodes( + data=table, + time="time", + id="id", + properties=["val_list_i32"], + schema=[("val_list_i32", pa.list_(pa.int64()))], + ) + n_prop_dtype = g.node(10).properties.get_dtype_of("val_list_i32") + assert n_prop_dtype == PropType.list(PropType.i64()) + +def test_schema_casting_dict(): + # time/id as regular ints (I64), value column as explicit int32 + df = pd.DataFrame( + { + "time": pd.Series([1, 2, 3], dtype="int64"), + "id": pd.Series([10, 20, 30], dtype="int64"), + "val_i32": pd.Series([1, 2, 3], dtype="int32"), + } + ) + + # schema casting as list + g_list = Graph() + g_list.load_nodes( + data=df, + time="time", + id="id", + properties=["val_i32"], + schema=[("val_i32", PropType.i64())], + ) + dtype_list = [g_list.node(10).properties.get_dtype_of("val_i32")] + del g_list + + # schema casting as dict using PropType + g_dict_proptype = Graph() + g_dict_proptype.load_nodes( + data=df, + time="time", + id="id", + properties=["val_i32"], + schema={"val_i32": PropType.i64()}, + ) + dtype_list.append(g_dict_proptype.node(10).properties.get_dtype_of("val_i32")) + del g_dict_proptype + + # schema casting as dict using pyarrow DataType + g_dict_pa = Graph() + g_dict_pa.load_nodes( + data=df, + time="time", + id="id", + properties=["val_i32"], + schema={"val_i32": pa.int64()}, + ) + dtype_list.append(g_dict_pa.node(10).properties.get_dtype_of("val_i32")) + del g_dict_pa + + for dtype in dtype_list: + assert dtype == PropType.i64() + +def test_nested_schema_casting(): + # types to make sure the table is built properly and test the types + struct_type_i32 = pa.struct( + [ + pa.field("a", pa.int32()), + pa.field("b", pa.int32()), + ] + ) + struct_type_i64 = pa.struct( + [ + pa.field("a", pa.int64()), + pa.field("b", pa.int64()), + ] + ) + + table = pa.Table.from_pydict( + { + "time": pa.array([1, 2, 3], type=pa.int64()), + "id": pa.array([10, 20, 30], type=pa.int64()), + "val_struct": pa.array( + [ + {"a": 1, "b": 10}, + {"a": 2, "b": 20}, + {"a": 3, "b": 30}, + ], + type=struct_type_i32, + ), + } + ) + + # no casting + g = Graph() + g.load_nodes( + data=table, + time="time", + id="id", + properties=["val_struct"], + ) + d_type_no_cast = g.node(10).properties.get_dtype_of("val_struct") + del g + + assert d_type_no_cast == struct_type_i32 + assert d_type_no_cast == PropType.map({"a": PropType.i32(), "b": PropType.i32()}) + # also check PropType.map of pyarrow types, mix and match + assert d_type_no_cast == PropType.map({"a": pa.int32(), "b": pa.int32()}) + + # schema is a PropType.map(...) inside a dict + g = Graph() + g.load_nodes( + data=table, + time="time", + id="id", + properties=["val_struct"], + schema={ + "val_struct": PropType.map( + { + "a": PropType.i64(), + "b": PropType.i64(), + } + ) + }, + ) + dtype_proptype = g.node(10).properties.get_dtype_of("val_struct") + del g + + assert dtype_proptype == struct_type_i64 + assert dtype_proptype == PropType.map({"a": PropType.i64(), "b": PropType.i64()}) + # also check PropType.map of pyarrow types, mix and match + assert dtype_proptype == PropType.map({"a": pa.int64(), "b": pa.int64()}) + + # schema is a PropType.map(...) 
with mixed pyarrow and PropType types + g = Graph() + g.load_nodes( + data=table, + time="time", + id="id", + properties=["val_struct"], + schema={ + "val_struct": PropType.map( + { + "a": pa.int64(), + "b": pa.int64(), + } + ) + }, + ) + dtype_mixed = g.node(10).properties.get_dtype_of("val_struct") + del g + + assert dtype_mixed == struct_type_i64 + assert dtype_mixed == PropType.map({"a": PropType.i64(), "b": PropType.i64()}) + # also check PropType.map of pyarrow types, mix and match + assert dtype_mixed == PropType.map({"a": pa.int64(), "b": pa.int64()}) + + # schema is defined using pyarrow + g = Graph() + g.load_nodes( + data=table, + time="time", + id="id", + properties=["val_struct"], + schema={"val_struct": struct_type_i64}, + ) + dtype_pyarrow = g.node(10).properties.get_dtype_of("val_struct") + del g + + assert dtype_pyarrow == dtype_proptype + assert dtype_pyarrow == struct_type_i64 + assert dtype_pyarrow == PropType.map({"a": PropType.i64(), "b": PropType.i64()}) + # also check PropType.map of pyarrow types, mix and match + assert dtype_pyarrow == PropType.map({"a": pa.int64(), "b": pa.int64()}) + +def _btc_root() -> Path: + return Path(__file__).parent / "data" / "btc_dataset" + +def _csv_expected_earliest_dt(paths: list[Path]): + df = pd.concat([pd.read_csv(p) for p in paths], ignore_index=True) + return pd.to_datetime(df["block_timestamp"], utc=True).min().to_pydatetime() + +def _parquet_expected_earliest_dt(paths: list[Path]): + df = pd.concat([pd.read_parquet(p) for p in paths], ignore_index=True) + return pd.to_datetime(df["block_timestamp"], utc=True).min().to_pydatetime() + +@pytest.mark.parametrize("schema_value", [PropType.datetime(), pa.timestamp("ms", tz="UTC")]) +def test_casting_btc_csv_file(schema_value): + csv_path = _btc_root() / "flattened_data.csv" + expected_earliest = _csv_expected_earliest_dt([csv_path]) + + # Pick a node id from the file + df = pd.read_csv(csv_path) + some_node_id = df["inputs_address"].iloc[0] + + g = Graph() + g.load_nodes( + data=str(csv_path), + time="block_timestamp", + id="inputs_address", + properties=["block_timestamp"], + schema={"block_timestamp": schema_value}, + ) + + dtype = g.node(some_node_id).properties.get_dtype_of("block_timestamp") + assert dtype == PropType.datetime() + assert dtype == pa.timestamp("ms", tz="UTC") + assert g.earliest_time.dt == expected_earliest + +@pytest.mark.parametrize("schema_value", [PropType.datetime(), pa.timestamp("ms", tz="UTC")]) +def test_casting_btc_csv_directory(schema_value): + csv_dir = _btc_root() / "csv_directory" + csv_paths = sorted(p for p in csv_dir.iterdir() if p.suffix == ".csv") + expected_earliest = _csv_expected_earliest_dt(csv_paths) + + df0 = pd.read_csv(csv_paths[0]) + some_node_id = df0["inputs_address"].iloc[0] + + g = Graph() + g.load_nodes( + data=str(csv_dir), + time="block_timestamp", + id="inputs_address", + properties=["block_timestamp"], + schema={"block_timestamp": schema_value}, + ) + + dtype = g.node(some_node_id).properties.get_dtype_of("block_timestamp") + assert dtype == PropType.datetime() + assert dtype == pa.timestamp("ms", tz="UTC") + assert g.earliest_time.dt == expected_earliest + +@pytest.mark.parametrize("schema_value", [PropType.datetime(), pa.timestamp("ms", tz="UTC")]) +def test_casting_btc_parquet_file(schema_value): + pq_path = _btc_root() / "flattened_data.parquet" + expected_earliest = _parquet_expected_earliest_dt([pq_path]) + + df = pd.read_parquet(pq_path) + some_node_id = df["inputs_address"].iloc[0] + + g = Graph() + g.load_nodes( + 
data=str(pq_path), + time="block_timestamp", + id="inputs_address", + properties=["block_timestamp"], + schema={"block_timestamp": schema_value}, + ) + + dtype = g.node(some_node_id).properties.get_dtype_of("block_timestamp") + assert dtype == PropType.datetime() + assert dtype == pa.timestamp("ms", tz="UTC") + assert g.earliest_time.dt == expected_earliest + +@pytest.mark.parametrize("schema_value", [PropType.datetime(), pa.timestamp("ms", tz="UTC")]) +def test_casting_btc_parquet_directory(schema_value): + pq_dir = _btc_root() / "parquet_directory" + pq_paths = sorted(p for p in pq_dir.iterdir() if p.suffix == ".parquet") + expected_earliest = _parquet_expected_earliest_dt(pq_paths) + + df0 = pd.read_parquet(pq_paths[0]) + some_node_id = df0["inputs_address"].iloc[0] + + g = Graph() + g.load_nodes( + data=str(pq_dir), + time="block_timestamp", + id="inputs_address", + properties=["block_timestamp"], + schema={"block_timestamp": schema_value}, + ) + + dtype = g.node(some_node_id).properties.get_dtype_of("block_timestamp") + assert dtype == PropType.datetime() + assert dtype == pa.timestamp("ms", tz="UTC") + assert g.earliest_time.dt == expected_earliest + +@pytest.mark.parametrize("schema_value", [PropType.datetime(), pa.timestamp("ms", tz="UTC")]) +def test_casting_btc_mixed_directory(schema_value): + mixed_dir = _btc_root() / "mixed_directory" + csv_paths = sorted(p for p in mixed_dir.iterdir() if p.suffix == ".csv") + pq_paths = sorted(p for p in mixed_dir.iterdir() if p.suffix == ".parquet") + + # Compute expected earliest across both formats + expected_csv = _csv_expected_earliest_dt(csv_paths) + expected_pq = _parquet_expected_earliest_dt(pq_paths) + expected_earliest = min(expected_csv, expected_pq) + + # Use an id from one of the files + some_node_id = pd.read_csv(csv_paths[0])["inputs_address"].iloc[0] + + g = Graph() + g.load_nodes( + data=str(mixed_dir), + time="block_timestamp", + id="inputs_address", + properties=["block_timestamp"], + schema={"block_timestamp": schema_value}, + ) + + dtype = g.node(some_node_id).properties.get_dtype_of("block_timestamp") + assert dtype == PropType.datetime() + assert dtype == pa.timestamp("ms", tz="UTC") + assert g.earliest_time.dt == expected_earliest + +def test_malformed_files_and_directory(): + empty_dir = _btc_root() / "empty_directory" + with pytest.raises(Exception, match="Paths must either point to a Parquet/CSV file, or a directory containing Parquet/CSV files"): + g = Graph() + g.load_nodes( + data=empty_dir, + time="block_timestamp", + id="inputs_address", + properties=["outputs_address"], + ) + + malformed_dir = _btc_root() / "malformed_files" + for malformed_file in malformed_dir.iterdir(): + # couldn't create a parquet file malformed with an extra column in a row + if "extra_field" in malformed_file.name: + with pytest.raises(Exception, match="Encountered unequal lengths between records"): + g = Graph() + g.load_nodes( + data=malformed_file, + time="block_timestamp", + id="inputs_address", + properties=["outputs_address"], + ) + + if "impossible_date" in malformed_file.name: + with pytest.raises(Exception) as e: + g = Graph() + g.load_nodes( + data=malformed_file, + time="block_timestamp", + id="inputs_address", + properties=["outputs_address"], + ) + assert ("Error during parsing of time string" in str(e.value)) or ("Error parsing timestamp from '2025-99-99 99:99:99'" in str(e.value)) + + # csv file raises exception but parquet file doesn't + if "missing_field.csv" in malformed_file.name: + with pytest.raises(Exception, 
match="Encountered unequal lengths between records on CSV file"): + g = Graph() + g.load_nodes( + data=malformed_file, + time="block_timestamp", + id="inputs_address", + properties=["outputs_address"], + ) + + if "missing_field.parquet" in malformed_file.name: + g = Graph() + g.load_nodes( + data=malformed_file, + time="block_timestamp", + id="inputs_address", + properties=["outputs_address"], + ) + n = g.node("bc1qabc") + assert n.history[0] == "2025-11-10 00:28:09" + assert n.properties.get("outputs_address") is None + + if "missing_id_col" in malformed_file.name: + with pytest.raises(Exception, match="columns are not present within the dataframe: inputs_address"): + g = Graph() + g.load_nodes( + data=malformed_file, + time="block_timestamp", + id="inputs_address", + properties=["outputs_address"], + ) + + if "missing_prop_col" in malformed_file.name: + with pytest.raises(Exception, match="columns are not present within the dataframe: outputs_address"): + g = Graph() + g.load_nodes( + data=malformed_file, + time="block_timestamp", + id="inputs_address", + properties=["outputs_address"], + ) + + if "missing_timestamp_col" in malformed_file.name: + with pytest.raises(Exception, match="columns are not present within the dataframe: block_timestamp"): + g = Graph() + g.load_nodes( + data=malformed_file, + time="block_timestamp", + id="inputs_address", + properties=["outputs_address"], + ) + + if "null_id.csv" in malformed_file.name: + with pytest.raises(Exception, match="Null not supported as node id"): + g = Graph() + g.load_nodes( + data=malformed_file, + time="block_timestamp", + id="inputs_address", + properties=["outputs_address"], + ) + + # in parquet, null value gets interpreted as Float64 + if "null_id.parquet" in malformed_file.name: + with pytest.raises(Exception, match="Float64 not supported as node id type"): + g = Graph() + g.load_nodes( + data=malformed_file, + time="block_timestamp", + id="inputs_address", + properties=["outputs_address"], + ) + + if "null_timestamp.csv" in malformed_file.name: + with pytest.raises(Exception, match="Null not supported for time column"): + g = Graph() + g.load_nodes( + data=malformed_file, + time="block_timestamp", + id="inputs_address", + properties=["outputs_address"], + ) + + if "null_timestamp.parquet" in malformed_file.name: + with pytest.raises(Exception, match="Missing value for timestamp"): + g = Graph() + g.load_nodes( + data=malformed_file, + time="block_timestamp", + id="inputs_address", + properties=["outputs_address"], + ) + + if "out_of_range_timestamp" in malformed_file.name: + with pytest.raises(Exception, match="'999999999999999999999' is not a valid datetime"): + g = Graph() + g.load_nodes( + data=malformed_file, + time="block_timestamp", + id="inputs_address", + properties=["outputs_address"], + ) + + # not applicable to csv + if "semicolon_delimiter" in malformed_file.name: + with pytest.raises(Exception, match="the following columns are not present within the dataframe"): + g = Graph() + g.load_nodes( + data=malformed_file, + time="block_timestamp", + id="inputs_address", + properties=["outputs_address"], + ) + g = Graph() + + g.load_nodes( + data=malformed_file, + time="block_timestamp", + id="inputs_address", + properties=["outputs_address"], + csv_options={"delimiter": ';'} + ) + assert g.node("bc1qabc").history[0] == "2025-11-10 00:28:09" + + if "timestamp_malformed" in malformed_file.name: + with pytest.raises(Exception, match="Missing value for timestamp"): + g = Graph() + g.load_nodes( + data=malformed_file, + 
time="block_timestamp", + id="inputs_address", + properties=["block_timestamp"], + schema={"block_timestamp": pa.timestamp("ms", tz="UTC")} + ) + + if fpd: import pandas @@ -55,7 +736,7 @@ def test_load_edges_from_fireducks_df(graph_type): ) g = graph_type() - g.load_edges_from_df( + g.load_edges( data=df, time="time", src="src", dst="dst", properties=["value"] ) assert [(1, 1, 2, 10.0), (2, 2, 3, 20.0), (3, 3, 4, 30.0)] == _collect_edges(g) @@ -80,13 +761,13 @@ def test_fireducks_matches_pandas_for_same_edges(graph_type): ) g_fireducks = graph_type() - g_fireducks.load_edges_from_df( + g_fireducks.load_edges( data=df_fireducks, time="time", src="src", dst="dst", properties=["value"] ) g_pandas = graph_type() - g_pandas.load_edges_from_pandas( - df=df_pandas, time="time", src="src", dst="dst", properties=["value"] + g_pandas.load_edges( + data=df_pandas, time="time", src="src", dst="dst", properties=["value"] ) expected = [(1, 1, 2, 10.0), (2, 2, 3, 20.0), (3, 3, 4, 30.0)] diff --git a/raphtory-api/Cargo.toml b/raphtory-api/Cargo.toml index e1c4b738c7..9c3fafaf45 100644 --- a/raphtory-api/Cargo.toml +++ b/raphtory-api/Cargo.toml @@ -49,7 +49,7 @@ proptest.workspace = true default = [] # Enables generating the pyo3 python bindings python = [ - "dep:pyo3", "dep:pyo3-arrow", "dep:display-error-chain" + "dep:pyo3", "dep:pyo3-arrow", "dep:display-error-chain", "dep:arrow-schema" ] storage = [ diff --git a/raphtory-api/src/core/entities/properties/prop/prop_type.rs b/raphtory-api/src/core/entities/properties/prop/prop_type.rs index 8a72245bf7..9e029f9db2 100644 --- a/raphtory-api/src/core/entities/properties/prop/prop_type.rs +++ b/raphtory-api/src/core/entities/properties/prop/prop_type.rs @@ -1,3 +1,5 @@ +#[cfg(any(feature = "arrow", feature = "storage", feature = "python"))] +use arrow_schema::DataType; use serde::{Deserialize, Serialize}; use std::{ collections::HashMap, @@ -135,6 +137,54 @@ impl PropType { } } +#[cfg(any(feature = "arrow", feature = "storage", feature = "python"))] +pub fn data_type_as_prop_type(dt: &DataType) -> Result { + match dt { + DataType::Boolean => Ok(PropType::Bool), + DataType::Int32 => Ok(PropType::I32), + DataType::Int64 => Ok(PropType::I64), + DataType::UInt8 => Ok(PropType::U8), + DataType::UInt16 => Ok(PropType::U16), + DataType::UInt32 => Ok(PropType::U32), + DataType::UInt64 => Ok(PropType::U64), + DataType::Float32 => Ok(PropType::F32), + DataType::Float64 => Ok(PropType::F64), + DataType::Utf8 => Ok(PropType::Str), + DataType::LargeUtf8 => Ok(PropType::Str), + DataType::Utf8View => Ok(PropType::Str), + DataType::Struct(fields) => Ok(PropType::map(fields.iter().filter_map(|f| { + data_type_as_prop_type(f.data_type()) + .ok() + .map(move |pt| (f.name(), pt)) + }))), + DataType::List(v) => Ok(PropType::List(Box::new(data_type_as_prop_type( + v.data_type(), + )?))), + DataType::FixedSizeList(v, _) => Ok(PropType::List(Box::new(data_type_as_prop_type( + v.data_type(), + )?))), + DataType::LargeList(v) => Ok(PropType::List(Box::new(data_type_as_prop_type( + v.data_type(), + )?))), + DataType::Timestamp(_, v) => match v { + None => Ok(PropType::NDTime), + Some(_) => Ok(PropType::DTime), + }, + DataType::Date32 => Ok(PropType::NDTime), + DataType::Date64 => Ok(PropType::NDTime), + DataType::Decimal128(precision, scale) if *precision <= 38 => Ok(PropType::Decimal { + scale: *scale as i64, + }), + DataType::Null => Ok(PropType::Empty), + _ => Err(InvalidPropertyTypeErr(dt.clone())), + } +} + +#[cfg(any(feature = "arrow", feature = "storage", feature = "python"))] 
+#[derive(thiserror::Error, Debug)]
+#[error("{0:?} not supported as property type")]
+pub struct InvalidPropertyTypeErr(pub DataType);
+
 #[cfg(any(feature = "arrow", feature = "storage"))]
 mod arrow {
     use crate::core::entities::properties::prop::PropType;
diff --git a/raphtory-api/src/python/mod.rs b/raphtory-api/src/python/mod.rs
index ff14487074..6e9ea14307 100644
--- a/raphtory-api/src/python/mod.rs
+++ b/raphtory-api/src/python/mod.rs
@@ -2,5 +2,5 @@ mod arcstr;
 mod direction;
 pub mod error;
 mod gid;
-mod prop;
+pub mod prop;
 pub mod timeindex;
diff --git a/raphtory-api/src/python/prop.rs b/raphtory-api/src/python/prop.rs
index a6875b2876..24f5ac676a 100644
--- a/raphtory-api/src/python/prop.rs
+++ b/raphtory-api/src/python/prop.rs
@@ -1,13 +1,15 @@
-use crate::core::entities::properties::prop::Prop;
+use crate::core::entities::properties::prop::{data_type_as_prop_type, Prop, PropType};
 use bigdecimal::BigDecimal;
 use pyo3::{
     exceptions::PyTypeError,
     prelude::*,
+    pybacked::PyBackedStr,
     sync::GILOnceCell,
     types::{PyBool, PyType},
     Bound, FromPyObject, IntoPyObject, IntoPyObjectExt, Py, PyAny, PyErr, PyResult, Python,
 };
-use std::{ops::Deref, str::FromStr, sync::Arc};
+use pyo3_arrow::PyDataType;
+use std::{collections::HashMap, ops::Deref, str::FromStr, sync::Arc};

 #[cfg(feature = "arrow")]
 mod array_ext {
@@ -126,3 +128,135 @@ impl<'source> FromPyObject<'source> for Prop {
         )))
     }
 }
+
+#[pyclass(name = "PropType", frozen, module = "raphtory")]
+pub struct PyPropType(pub PropType);
+
+#[pymethods]
+impl PyPropType {
+    #[staticmethod]
+    pub fn u8() -> PropType {
+        PropType::U8
+    }
+
+    #[staticmethod]
+    pub fn u16() -> PropType {
+        PropType::U16
+    }
+
+    #[staticmethod]
+    pub fn u32() -> PropType {
+        PropType::U32
+    }
+
+    #[staticmethod]
+    pub fn u64() -> PropType {
+        PropType::U64
+    }
+
+    #[staticmethod]
+    pub fn i32() -> PropType {
+        PropType::I32
+    }
+
+    #[staticmethod]
+    pub fn i64() -> PropType {
+        PropType::I64
+    }
+
+    #[staticmethod]
+    pub fn f32() -> PropType {
+        PropType::F32
+    }
+
+    #[staticmethod]
+    pub fn f64() -> PropType {
+        PropType::F64
+    }
+
+    #[staticmethod]
+    pub fn str() -> PropType {
+        PropType::Str
+    }
+
+    #[staticmethod]
+    pub fn bool() -> PropType {
+        PropType::Bool
+    }
+
+    #[staticmethod]
+    pub fn naive_datetime() -> PropType {
+        PropType::NDTime
+    }
+
+    #[staticmethod]
+    pub fn datetime() -> PropType {
+        PropType::DTime
+    }
+
+    #[staticmethod]
+    pub fn list(p: PropType) -> PropType {
+        PropType::List(Box::new(p))
+    }
+
+    #[staticmethod]
+    pub fn map(hash_map: HashMap<String, PropType>) -> PropType {
+        PropType::Map(Arc::new(hash_map))
+    }
+
+    #[staticmethod]
+    pub fn array(p: PropType) -> PropType {
+        PropType::Array(Box::new(p))
+    }
+
+    fn __repr__(&self) -> String {
+        format!("PropType.{}", self.0)
+    }
+
+    fn __eq__(&self, other: PropType) -> bool {
+        self.0 == other
+    }
+}
+
+impl<'py> IntoPyObject<'py> for PropType {
+    type Target = PyPropType;
+    type Output = Bound<'py, Self::Target>;
+    type Error = <PyPropType as IntoPyObject<'py>>::Error;
+
+    fn into_pyobject(self, py: Python<'py>) -> Result<Self::Output, Self::Error> {
+        PyPropType(self).into_pyobject(py)
+    }
+}
+
+impl<'source> FromPyObject<'source> for PropType {
+    fn extract_bound(ob: &Bound<'source, PyAny>) -> PyResult<Self> {
+        if let Ok(prop_type) = ob.downcast::<PyPropType>() {
+            Ok(prop_type.get().0.clone())
+        } else if let Ok(prop_type_str) = ob.extract::<PyBackedStr>() {
+            match prop_type_str.deref().to_ascii_lowercase().as_str() {
+                "i64" | "int64" | "int" => Ok(PropType::I64),
+                "i32" | "int32" => Ok(PropType::I32),
+                "u64" | "uint64" => Ok(PropType::U64),
+                "u32" | "uint32" => Ok(PropType::U32),
"u16" | "uint16" => Ok(PropType::U16), + "u8" | "uint8" => Ok(PropType::U8), + "f64" | "float64" | "float" | "double" => Ok(PropType::F64), + "f32" | "float32" => Ok(PropType::F32), + "bool" | "boolean" => Ok(PropType::Bool), + "str" | "string" | "utf8" => Ok(PropType::Str), + "ndtime" | "naivedatetime" | "datetime" => Ok(PropType::NDTime), + "dtime" | "datetimetz" => Ok(PropType::DTime), + other => Err(PyTypeError::new_err(format!( + "Unknown type name '{other:?}'" + ))), + } + } else if let Ok(py_datatype) = ob.extract::() { + data_type_as_prop_type(&py_datatype.into_inner()) + .map_err(|e| PyTypeError::new_err(format!("Unsupported Arrow DataType {:?}", e.0))) + } else { + Err(PyTypeError::new_err( + "PropType must be a string or an instance of itself.", + )) + } + } +} diff --git a/raphtory/Cargo.toml b/raphtory/Cargo.toml index 66a566b191..d24dbe6fc8 100644 --- a/raphtory/Cargo.toml +++ b/raphtory/Cargo.toml @@ -69,6 +69,7 @@ prost-types = { workspace = true, optional = true } # arrow otional dependencies parquet = { workspace = true, optional = true } arrow-json = { workspace = true, optional = true } +arrow-csv = { workspace = true, optional = true} #arrow-array = { workspace = true, features = ["chrono-tz"], optional = true } #arrow-buffer = { workspace = true, optional = true } #arrow-cast = { workspace = true, optional = true } @@ -175,6 +176,7 @@ arrow = [ "raphtory-core/arrow", "dep:parquet", "dep:arrow-json", + "dep:arrow-csv", "dep:arrow", ] diff --git a/raphtory/src/errors.rs b/raphtory/src/errors.rs index a52c0fd8ef..c2859f50f9 100644 --- a/raphtory/src/errors.rs +++ b/raphtory/src/errors.rs @@ -29,7 +29,10 @@ use pometry_storage::RAError; use { arrow::{datatypes::DataType, error::ArrowError}, parquet::errors::ParquetError, - raphtory_api::core::entities::{properties::prop::DeserialisationError, GidType, VID}, + raphtory_api::core::entities::{ + properties::prop::{DeserialisationError, InvalidPropertyTypeErr}, + GidType, VID, + }, }; #[cfg(feature = "python")] @@ -501,3 +504,17 @@ impl From for io::Error { io::Error::other(error) } } + +#[cfg(feature = "arrow")] +impl From for LoadError { + fn from(value: InvalidPropertyTypeErr) -> Self { + LoadError::InvalidPropertyType(value.0) + } +} + +#[cfg(feature = "arrow")] +impl From for GraphError { + fn from(value: InvalidPropertyTypeErr) -> Self { + GraphError::from(LoadError::from(value)) + } +} diff --git a/raphtory/src/io/arrow/prop_handler.rs b/raphtory/src/io/arrow/prop_handler.rs index 7e7c2a6667..9537fff3c2 100644 --- a/raphtory/src/io/arrow/prop_handler.rs +++ b/raphtory/src/io/arrow/prop_handler.rs @@ -1,8 +1,4 @@ -use crate::{ - errors::{GraphError, LoadError}, - io::arrow::dataframe::DFChunk, - prelude::Prop, -}; +use crate::{errors::GraphError, io::arrow::dataframe::DFChunk, prelude::Prop}; use arrow::{ array::{ Array, ArrayRef, ArrowPrimitiveType, AsArray, BooleanArray, Decimal128Array, @@ -20,7 +16,7 @@ use arrow::{ use bigdecimal::BigDecimal; use chrono::{DateTime, Utc}; use raphtory_api::core::{ - entities::properties::prop::{IntoPropList, PropType}, + entities::properties::prop::{data_type_as_prop_type, IntoPropList, PropType}, storage::{arc_str::ArcStr, dict_mapper::MaybeNew}, }; use rayon::prelude::*; @@ -62,8 +58,8 @@ where { let dtypes = indices .iter() - .map(|idx| data_type_as_prop_type(df.chunk[*idx].data_type())) - .collect::, _>>()?; + .map(|idx| data_type_as_prop_type(df.chunk[*idx].data_type()).map_err(Into::into)) + .collect::, GraphError>>()?; let cols = indices .iter() .map(|idx| 
         .map(|idx| lift_property_col(&df.chunk[*idx]))
@@ -241,48 +237,6 @@ fn arr_as_prop(arr: ArrayRef) -> Prop {
     }
 }

-fn data_type_as_prop_type(dt: &DataType) -> Result<PropType, GraphError> {
-    match dt {
-        DataType::Boolean => Ok(PropType::Bool),
-        DataType::Int32 => Ok(PropType::I32),
-        DataType::Int64 => Ok(PropType::I64),
-        DataType::UInt8 => Ok(PropType::U8),
-        DataType::UInt16 => Ok(PropType::U16),
-        DataType::UInt32 => Ok(PropType::U32),
-        DataType::UInt64 => Ok(PropType::U64),
-        DataType::Float32 => Ok(PropType::F32),
-        DataType::Float64 => Ok(PropType::F64),
-        DataType::Utf8 => Ok(PropType::Str),
-        DataType::LargeUtf8 => Ok(PropType::Str),
-        DataType::Utf8View => Ok(PropType::Str),
-        DataType::Struct(fields) => Ok(PropType::map(fields.iter().filter_map(|f| {
-            data_type_as_prop_type(f.data_type())
-                .ok()
-                .map(move |pt| (f.name(), pt))
-        }))),
-        DataType::List(v) => Ok(PropType::List(Box::new(data_type_as_prop_type(
-            v.data_type(),
-        )?))),
-        DataType::FixedSizeList(v, _) => Ok(PropType::List(Box::new(data_type_as_prop_type(
-            v.data_type(),
-        )?))),
-        DataType::LargeList(v) => Ok(PropType::List(Box::new(data_type_as_prop_type(
-            v.data_type(),
-        )?))),
-        DataType::Timestamp(_, v) => match v {
-            None => Ok(PropType::NDTime),
-            Some(_) => Ok(PropType::DTime),
-        },
-        DataType::Date32 => Ok(PropType::NDTime),
-        DataType::Date64 => Ok(PropType::NDTime),
-        DataType::Decimal128(precision, scale) if *precision <= 38 => Ok(PropType::Decimal {
-            scale: *scale as i64,
-        }),
-        DataType::Null => Ok(PropType::Empty),
-        _ => Err(LoadError::InvalidPropertyType(dt.clone()).into()),
-    }
-}
-
 trait PropCol: Send + Sync {
     fn get(&self, i: usize) -> Option<Prop>;
 }
diff --git a/raphtory/src/io/parquet_loaders.rs b/raphtory/src/io/parquet_loaders.rs
index 1ade06f350..dacd949958 100644
--- a/raphtory/src/io/parquet_loaders.rs
+++ b/raphtory/src/io/parquet_loaders.rs
@@ -5,17 +5,35 @@ use crate::{
     prelude::{AdditionOps, DeletionOps, PropertyAdditionOps},
     serialise::incremental::InternalCache,
 };
+use arrow::{
+    array::{Array, RecordBatch, StructArray},
+    compute::cast,
+    datatypes::{DataType, Field, Fields},
+};
 use parquet::arrow::{arrow_reader::ParquetRecordBatchReaderBuilder, ProjectionMask};
-use raphtory_api::core::entities::properties::prop::Prop;
+#[cfg(feature = "storage")]
+use pometry_storage::RAError;
+use raphtory_api::core::entities::properties::prop::{arrow_dtype_from_prop_type, Prop, PropType};
 use std::{
     collections::HashMap,
+    ffi::OsStr,
     fs,
     fs::File,
     path::{Path, PathBuf},
+    sync::Arc,
 };
-#[cfg(feature = "storage")]
-use {arrow::array::StructArray, pometry_storage::RAError};
+
+pub(crate) fn is_parquet_path(path: &PathBuf) -> Result<bool, GraphError> {
+    if path.is_dir() {
+        Ok(fs::read_dir(&path)?.any(|entry| {
+            entry.map_or(false, |e| {
+                e.path().extension().and_then(OsStr::to_str) == Some("parquet")
+            })
+        }))
+    } else {
+        Ok(path.extension().and_then(OsStr::to_str) == Some("parquet"))
+    }
+}

 pub fn load_nodes_from_parquet<
     G: StaticGraphViewOps + PropertyAdditionOps + AdditionOps + InternalCache,
 >(
     graph: &G,
@@ -30,6 +48,7 @@
     metadata: &[&str],
     shared_metadata: Option<&HashMap<String, Prop>>,
     batch_size: Option<usize>,
+    schema: Option<Arc<HashMap<String, PropType>>>,
 ) -> Result<(), GraphError> {
     let mut cols_to_check = vec![id, time];
     cols_to_check.extend_from_slice(properties);
@@ -39,7 +58,12 @@
     }

     for path in get_parquet_file_paths(parquet_path)? {
-        let df_view = process_parquet_file_to_df(path.as_path(), Some(&cols_to_check), batch_size)?;
+        let df_view = process_parquet_file_to_df(
+            path.as_path(),
+            Some(&cols_to_check),
+            batch_size,
+            schema.clone(),
+        )?;
         df_view.check_cols_exist(&cols_to_check)?;
         load_nodes_from_df(
             df_view,
@@ -52,7 +76,7 @@
             node_type_col,
             graph,
         )
-        .map_err(|e| GraphError::LoadFailure(format!("Failed to load graph {e:?}")))?;
+        .map_err(|e| GraphError::LoadFailure(e.to_string()))?;
     }

     Ok(())
@@ -72,6 +96,7 @@ pub fn load_edges_from_parquet<
     layer: Option<&str>,
     layer_col: Option<&str>,
     batch_size: Option<usize>,
+    schema: Option<Arc<HashMap<String, PropType>>>,
 ) -> Result<(), GraphError> {
     let parquet_path = parquet_path.as_ref();
     let mut cols_to_check = vec![src, dst, time];
@@ -107,9 +132,13 @@
     let all_df_view = get_parquet_file_paths(parquet_path)?
         .into_iter()
         .flat_map(|file| {
-            let df_view =
-                process_parquet_file_to_df(file.as_path(), Some(&cols_to_check), batch_size)
-                    .expect("Failed to process Parquet file");
+            let df_view = process_parquet_file_to_df(
+                file.as_path(),
+                Some(&cols_to_check),
+                batch_size,
+                schema.clone(),
+            )
+            .expect("Failed to process Parquet file");
             df_view.chunks
         });
@@ -131,12 +160,12 @@
         layer_col,
         graph,
     )
-    .map_err(|e| GraphError::LoadFailure(format!("Failed to load graph {e:?}")))?;
+    .map_err(|e| GraphError::LoadFailure(e.to_string()))?;

     Ok(())
 }

-pub fn load_node_props_from_parquet<
+pub fn load_node_metadata_from_parquet<
     G: StaticGraphViewOps + PropertyAdditionOps + AdditionOps + InternalCache,
 >(
     graph: &G,
@@ -147,6 +176,7 @@
     metadata_properties: &[&str],
     shared_metadata: Option<&HashMap<String, Prop>>,
     batch_size: Option<usize>,
+    schema: Option<Arc<HashMap<String, PropType>>>,
 ) -> Result<(), GraphError> {
     let mut cols_to_check = vec![id];
     cols_to_check.extend_from_slice(metadata_properties);
@@ -156,7 +186,12 @@
     }

     for path in get_parquet_file_paths(parquet_path)? {
-        let df_view = process_parquet_file_to_df(path.as_path(), Some(&cols_to_check), batch_size)?;
+        let df_view = process_parquet_file_to_df(
+            path.as_path(),
+            Some(&cols_to_check),
+            batch_size,
+            schema.clone(),
+        )?;
         df_view.check_cols_exist(&cols_to_check)?;

         load_node_props_from_df(
@@ -168,13 +203,13 @@
             shared_metadata,
             graph,
         )
-        .map_err(|e| GraphError::LoadFailure(format!("Failed to load graph {e:?}")))?;
+        .map_err(|e| GraphError::LoadFailure(e.to_string()))?;
     }

     Ok(())
 }

-pub fn load_edge_props_from_parquet<
+pub fn load_edge_metadata_from_parquet<
     G: StaticGraphViewOps + PropertyAdditionOps + AdditionOps + InternalCache,
 >(
     graph: &G,
@@ -186,6 +221,7 @@
     layer: Option<&str>,
     layer_col: Option<&str>,
     batch_size: Option<usize>,
+    schema: Option<Arc<HashMap<String, PropType>>>,
 ) -> Result<(), GraphError> {
     let mut cols_to_check = vec![src, dst];
     if let Some(ref layer_col) = layer_col {
@@ -195,7 +231,12 @@
         cols_to_check.push(layer_col.as_ref());
     }
     cols_to_check.extend_from_slice(metadata);

     for path in get_parquet_file_paths(parquet_path)? {
-        let df_view = process_parquet_file_to_df(path.as_path(), Some(&cols_to_check), batch_size)?;
+        let df_view = process_parquet_file_to_df(
+            path.as_path(),
+            Some(&cols_to_check),
+            batch_size,
+            schema.clone(),
+        )?;
         df_view.check_cols_exist(&cols_to_check)?;
         load_edges_props_from_df(
             df_view,
@@ -207,7 +248,7 @@
             layer_col,
             graph,
         )
-        .map_err(|e| GraphError::LoadFailure(format!("Failed to load graph {e:?}")))?;
+        .map_err(|e| GraphError::LoadFailure(e.to_string()))?;
     }

     Ok(())
@@ -224,6 +265,7 @@ pub fn load_edge_deletions_from_parquet<
     layer: Option<&str>,
     layer_col: Option<&str>,
     batch_size: Option<usize>,
+    schema: Option<Arc<HashMap<String, PropType>>>,
 ) -> Result<(), GraphError> {
     let mut cols_to_check = vec![src, dst, time];
     if let Some(ref layer_col) = layer_col {
@@ -231,10 +273,15 @@
     }

     for path in get_parquet_file_paths(parquet_path)? {
-        let df_view = process_parquet_file_to_df(path.as_path(), Some(&cols_to_check), batch_size)?;
+        let df_view = process_parquet_file_to_df(
+            path.as_path(),
+            Some(&cols_to_check),
+            batch_size,
+            schema.clone(),
+        )?;
         df_view.check_cols_exist(&cols_to_check)?;
         load_edge_deletions_from_df(df_view, time, src, dst, layer, layer_col, graph)
-            .map_err(|e| GraphError::LoadFailure(format!("Failed to load graph {e:?}")))?;
+            .map_err(|e| GraphError::LoadFailure(e.to_string()))?;
     }
     Ok(())
 }

@@ -246,16 +293,22 @@ pub fn load_graph_props_from_parquet
     batch_size: Option<usize>,
+    schema: Option<Arc<HashMap<String, PropType>>>,
 ) -> Result<(), GraphError> {
     let mut cols_to_check = vec![time];
     cols_to_check.extend_from_slice(properties);
     cols_to_check.extend_from_slice(metadata);

     for path in get_parquet_file_paths(parquet_path)? {
-        let df_view = process_parquet_file_to_df(path.as_path(), Some(&cols_to_check), batch_size)?;
+        let df_view = process_parquet_file_to_df(
+            path.as_path(),
+            Some(&cols_to_check),
+            batch_size,
+            schema.clone(),
+        )?;
         df_view.check_cols_exist(&cols_to_check)?;
         load_graph_props_from_df(df_view, time, Some(properties), Some(metadata), graph)
-            .map_err(|e| GraphError::LoadFailure(format!("Failed to load graph {e:?}")))?;
+            .map_err(|e| GraphError::LoadFailure(e.to_string()))?;
     }

     Ok(())
@@ -265,6 +318,7 @@ pub(crate) fn process_parquet_file_to_df(
     parquet_file_path: &Path,
     col_names: Option<&[&str]>,
     batch_size: Option<usize>,
+    schema: Option<Arc<HashMap<String, PropType>>>,
 ) -> Result<DFView<impl Iterator<Item = Result<DFChunk, GraphError>>>, GraphError> {
     let (names, chunks, num_rows) = read_parquet_file(parquet_file_path, col_names)?;
@@ -278,12 +332,20 @@
         Some(batch_size) => chunks.with_batch_size(batch_size),
     };

-    let chunks = chunks.build()?.into_iter().map(move |result| {
-        result
-            .map(|r| DFChunk {
-                chunk: r.columns().to_vec(),
+    let chunks = chunks.build()?.into_iter().map(move |result| match result {
+        Ok(r) => {
+            let casted_batch = if let Some(schema) = schema.as_deref() {
+                cast_columns(r, schema)?
+            } else {
+                r
+            };
+            Ok(DFChunk {
+                chunk: casted_batch.columns().to_vec(),
             })
-            .map_err(|e| GraphError::LoadFailure(format!("Failed to process Parquet file: {e:?}")))
+        }
+        Err(e) => Err(GraphError::LoadFailure(format!(
+            "Failed to process Parquet file: {e:?}"
+        ))),
     });

     Ok(DFView {
@@ -336,6 +398,50 @@ pub fn get_parquet_file_paths(parquet_path: &Path) -> Result<Vec<PathBuf>, GraphError> {
     Ok(parquet_files)
 }

+pub(crate) fn cast_columns(
+    batch: RecordBatch,
+    schema: &HashMap<String, PropType>,
+) -> Result<RecordBatch, GraphError> {
+    let old_schema_ref = batch.schema();
+    let old_fields = old_schema_ref.fields();
+
+    let mut target_fields: Vec<Field> = Vec::with_capacity(old_fields.len());
+
+    for field in old_fields.iter() {
+        if let Some(target_prop_type) = schema.get(field.name()) {
+            let target_dtype = arrow_dtype_from_prop_type(target_prop_type);
+            target_fields.push(
+                Field::new(field.name(), target_dtype, field.is_nullable())
+                    .with_metadata(field.metadata().clone()),
+            );
+        } else {
+            // schema doesn't say anything about this column
+            target_fields.push(field.as_ref().clone());
+        }
+    }
+    let struct_array = StructArray::from(batch);
+    let target_struct_type = DataType::Struct(Fields::from(target_fields));
+
+    // cast whole RecordBatch at once
+    let casted = cast(&struct_array, &target_struct_type).map_err(|e| {
+        GraphError::LoadFailure(format!(
+            "Failed to cast RecordBatch to target schema {:?}: {e}",
+            target_struct_type
+        ))
+    })?;
+
+    let casted_struct = casted
+        .as_any()
+        .downcast_ref::<StructArray>()
+        .ok_or_else(|| {
+            GraphError::LoadFailure(
+                "Internal error: casting RecordBatch did not return StructArray".to_string(),
+            )
+        })?;
+
+    Ok(RecordBatch::from(casted_struct))
+}
+
 #[cfg(feature = "storage")]
 pub fn read_struct_arrays(
     path: &Path,
diff --git a/raphtory/src/python/graph/disk_graph.rs b/raphtory/src/python/graph/disk_graph.rs
index b80f8aa2bb..f28e74418d 100644
--- a/raphtory/src/python/graph/disk_graph.rs
+++ b/raphtory/src/python/graph/disk_graph.rs
@@ -1,6 +1,5 @@
 //! A columnar temporal graph.
 //!
-use super::io::pandas_loaders::*; use crate::{ db::{ api::storage::graph::storage_ops::disk_storage::IntoGraph, @@ -9,7 +8,10 @@ use crate::{ errors::GraphError, io::parquet_loaders::read_struct_arrays, prelude::Graph, - python::{graph::graph::PyGraph, types::repr::StructReprBuilder}, + python::{ + graph::{graph::PyGraph, io::arrow_loaders::convert_py_prop_args}, + types::repr::StructReprBuilder, + }, }; use arrow::{array::StructArray, datatypes::Field}; use itertools::Itertools; diff --git a/raphtory/src/python/graph/graph.rs b/raphtory/src/python/graph/graph.rs index 56fe6a49ba..5d7791d149 100644 --- a/raphtory/src/python/graph/graph.rs +++ b/raphtory/src/python/graph/graph.rs @@ -17,12 +17,12 @@ use crate::{ edge::PyEdge, graph_with_deletions::PyPersistentGraph, index::PyIndexSpec, - io::{ - arrow_loaders::{ - load_edge_metadata_from_arrow_c_stream, load_edges_from_arrow_c_stream, - load_node_metadata_from_arrow_c_stream, load_nodes_from_arrow_c_stream, - }, - pandas_loaders::*, + io::arrow_loaders::{ + convert_py_prop_args, convert_py_schema, is_csv_path, + load_edge_metadata_from_arrow_c_stream, load_edge_metadata_from_csv_path, + load_edges_from_arrow_c_stream, load_edges_from_csv_path, + load_node_metadata_from_arrow_c_stream, load_node_metadata_from_csv_path, + load_nodes_from_arrow_c_stream, load_nodes_from_csv_path, CsvReadOptions, }, node::PyNode, views::graph_view::PyGraphView, @@ -35,7 +35,7 @@ use crate::{ InternalStableDecode, StableEncode, }, }; -use pyo3::{prelude::*, pybacked::PyBackedStr, types::PyDict}; +use pyo3::{exceptions::PyValueError, prelude::*, pybacked::PyBackedStr, types::PyDict}; use raphtory_api::{ core::{entities::GID, storage::arc_str::ArcStr}, python::timeindex::EventTimeComponent, @@ -45,6 +45,7 @@ use std::{ collections::HashMap, fmt::{Debug, Formatter}, path::PathBuf, + sync::Arc, }; /// A temporal graph with event semantics. @@ -631,9 +632,10 @@ impl PyGraph { PyGraph::py_from_db_graph(self.graph.event_graph()) } - /// Load nodes into the graph from any data source that supports the ArrowStreamExportable protocol (by providing an __arrow_c_stream__() method). - /// This includes, but is not limited to: Pandas dataframes, FireDucks(.pandas) dataframes, - /// Polars dataframes, Arrow tables, DuckDB (eg. DuckDBPyRelation obtained from running an SQL query) + /// Load nodes into the graph from any data source that supports the ArrowStreamExportable protocol (by providing an __arrow_c_stream__() method), + /// a path to a CSV or Parquet file, or a directory containing multiple CSV or Parquet files. + /// The following are known to support the ArrowStreamExportable protocol: Pandas dataframes, FireDucks(.pandas) dataframes, + /// Polars dataframes, Arrow tables, DuckDB (e.g. DuckDBPyRelation obtained from running an SQL query). /// /// Arguments: /// data (Any): The data source containing the nodes. @@ -644,6 +646,8 @@ impl PyGraph { /// properties (List[str], optional): List of node property column names. Defaults to None. /// metadata (List[str], optional): List of node metadata column names. Defaults to None. /// shared_metadata (PropInput, optional): A dictionary of metadata properties that will be added to every node. Defaults to None. + /// schema (list[tuple[str, DataType | PropType | str]] | dict[str, DataType | PropType | str], optional): A list of (column_name, column_type) tuples or dict of {"column_name": column_type} to cast columns to. Defaults to None. 
+ /// csv_options (dict[str, str | bool], optional): A dictionary of CSV reading options such as delimiter, comment, escape, quote, and terminator characters, as well as allow_truncated_rows and has_header flags. Defaults to None. /// /// Returns: /// None: This function does not return a value if the operation is successful. @@ -651,11 +655,11 @@ impl PyGraph { /// Raises: /// GraphError: If the operation fails. #[pyo3( - signature = (data, time, id, node_type = None, node_type_col = None, properties = None, metadata= None, shared_metadata = None) + signature = (data, time, id, node_type = None, node_type_col = None, properties = None, metadata= None, shared_metadata = None, schema = None, csv_options = None) )] - fn load_nodes_from_df<'py>( + fn load_nodes( &self, - data: &Bound<'py, PyAny>, + data: &Bound, time: &str, id: &str, node_type: Option<&str>, @@ -663,129 +667,98 @@ impl PyGraph { properties: Option>, metadata: Option>, shared_metadata: Option>, + schema: Option>, + csv_options: Option, ) -> Result<(), GraphError> { let properties = convert_py_prop_args(properties.as_deref()).unwrap_or_default(); let metadata = convert_py_prop_args(metadata.as_deref()).unwrap_or_default(); - load_nodes_from_arrow_c_stream( - &self.graph, - data, - time, - id, - node_type, - node_type_col, - &properties, - &metadata, - shared_metadata.as_ref(), - ) - } + let column_schema = convert_py_schema(schema)?; + if data.hasattr("__arrow_c_stream__")? { + load_nodes_from_arrow_c_stream( + &self.graph, + data, + time, + id, + node_type, + node_type_col, + &properties, + &metadata, + shared_metadata.as_ref(), + column_schema, + ) + } else if let Ok(path) = data.extract::() { + // extracting PathBuf handles Strings too + let is_parquet = is_parquet_path(&path)?; + let is_csv = is_csv_path(&path)?; - /// Load nodes from a Pandas DataFrame into the graph. - /// - /// Arguments: - /// df (DataFrame): The Pandas DataFrame containing the nodes. - /// time (str): The column name for the timestamps. - /// id (str): The column name for the node IDs. - /// node_type (str, optional): A value to use as the node type for all nodes. Cannot be used in combination with node_type_col. Defaults to None. - /// node_type_col (str, optional): The node type col name in dataframe. Cannot be used in combination with node_type. Defaults to None. - /// properties (List[str], optional): List of node property column names. Defaults to None. - /// metadata (List[str], optional): List of node metadata column names. Defaults to None. - /// shared_metadata (PropInput, optional): A dictionary of metadata properties that will be added to every node. Defaults to None. - /// - /// Returns: - /// None: This function does not return a value, if the operation is successful. - /// - /// Raises: - /// GraphError: If the operation fails. 
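The removed pandas-specific loader below is subsumed by the unified call; a minimal sketch of the replacement, assuming a toy pandas frame (pandas DataFrames expose __arrow_c_stream__, so they take the streaming path):

    import pandas as pd
    from raphtory import Graph

    df = pd.DataFrame({"time": [1, 2], "id": [10, 20], "os": ["linux", "macos"]})
    g = Graph()
    # no pandas-specific method is needed; the arrow stream protocol covers it
    g.load_nodes(data=df, time="time", id="id", properties=["os"])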
- #[pyo3( - signature = (df, time, id, node_type = None, node_type_col = None, properties = None, metadata= None, shared_metadata = None) - )] - fn load_nodes_from_pandas<'py>( - &self, - df: &Bound<'py, PyAny>, - time: &str, - id: &str, - node_type: Option<&str>, - node_type_col: Option<&str>, - properties: Option>, - metadata: Option>, - shared_metadata: Option>, - ) -> Result<(), GraphError> { - let properties = convert_py_prop_args(properties.as_deref()).unwrap_or_default(); - let metadata = convert_py_prop_args(metadata.as_deref()).unwrap_or_default(); - load_nodes_from_pandas( - &self.graph, - df, - time, - id, - node_type, - node_type_col, - &properties, - &metadata, - shared_metadata.as_ref(), - ) - } + // fail before loading anything at all to avoid loading partial data + if !is_csv && csv_options.is_some() { + return Err(GraphError::from(PyValueError::new_err(format!( + "CSV options were passed but no CSV files were detected at {}.", + path.display() + )))); + } - /// Load nodes from a Parquet file into the graph. - /// - /// Arguments: - /// parquet_path (str): Parquet file or directory of Parquet files containing the nodes - /// time (str): The column name for the timestamps. - /// id (str): The column name for the node IDs. - /// node_type (str, optional): A value to use as the node type for all nodes. Cannot be used in combination with node_type_col. Defaults to None. - /// node_type_col (str, optional): The node type col name in dataframe. Cannot be used in combination with node_type. Defaults to None. - /// properties (List[str], optional): List of node property column names. Defaults to None. - /// metadata (List[str], optional): List of node metadata column names. Defaults to None. - /// shared_metadata (PropInput, optional): A dictionary of metadata properties that will be added to every node. Defaults to None. - /// - /// Returns: - /// None: This function does not return a value, if the operation is successful. - /// - /// Raises: - /// GraphError: If the operation fails. 
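The removed parquet loader is likewise folded into load_nodes: the path itself (str or Path, pointing at a file or a directory) becomes the data argument, and schema optionally casts columns on ingestion. A sketch reusing the btc test columns:

    from pathlib import Path
    import pyarrow as pa
    from raphtory import Graph

    g = Graph()
    g.load_nodes(
        data=Path("python/tests/data/btc_dataset/parquet_directory"),
        time="block_timestamp",
        id="inputs_address",
        properties=["block_timestamp"],
        # schema values may be PropType instances, pyarrow DataTypes, or type-name strings
        schema={"block_timestamp": pa.timestamp("ms", tz="UTC")},
    )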
- #[pyo3( - signature = (parquet_path, time, id, node_type = None, node_type_col = None, properties = None, metadata = None, shared_metadata = None) - )] - fn load_nodes_from_parquet( - &self, - parquet_path: PathBuf, - time: &str, - id: &str, - node_type: Option<&str>, - node_type_col: Option<&str>, - properties: Option>, - metadata: Option>, - shared_metadata: Option>, - ) -> Result<(), GraphError> { - let properties = convert_py_prop_args(properties.as_deref()).unwrap_or_default(); - let metadata = convert_py_prop_args(metadata.as_deref()).unwrap_or_default(); - load_nodes_from_parquet( - &self.graph, - parquet_path.as_path(), - time, - id, - node_type, - node_type_col, - &properties, - &metadata, - shared_metadata.as_ref(), - None, - ) + // wrap in Arc to avoid cloning the entire schema for Parquet, CSV, and inner loops in CSV path + let arced_schema = column_schema.map(Arc::new); + + // if-if instead of if-else to support directories with mixed parquet and CSV files + if is_parquet { + load_nodes_from_parquet( + &self.graph, + path.as_path(), + time, + id, + node_type, + node_type_col, + &properties, + &metadata, + shared_metadata.as_ref(), + None, + arced_schema.clone(), + )?; + } + if is_csv { + load_nodes_from_csv_path( + &self.graph, + &path, + time, + id, + node_type, + node_type_col, + &properties, + &metadata, + shared_metadata.as_ref(), + csv_options.as_ref(), + arced_schema, + )?; + } + if !is_parquet && !is_csv { + return Err(GraphError::PythonError(PyValueError::new_err("Argument 'data' contains invalid path. Paths must either point to a Parquet/CSV file, or a directory containing Parquet/CSV files"))); + } + Ok(()) + } else { + Err(GraphError::PythonError(PyValueError::new_err("Argument 'data' invalid. Valid data sources are: a single Parquet or CSV file, a directory containing Parquet or CSV files, and objects that implement an __arrow_c_stream__ method."))) + } } - /// Load edges into the graph from any data source that supports the ArrowStreamExportable protocol (by providing an __arrow_c_stream__() method). - /// This includes, but is not limited to: Pandas dataframes, FireDucks(.pandas) dataframes, - /// Polars dataframes, Arrow tables, DuckDB (eg. DuckDBPyRelation obtained from running an SQL query) + /// Load edges into the graph from any data source that supports the ArrowStreamExportable protocol (by providing an __arrow_c_stream__() method), + /// a path to a CSV or Parquet file, or a directory containing multiple CSV or Parquet files. + /// The following are known to support the ArrowStreamExportable protocol: Pandas dataframes, FireDucks(.pandas) dataframes, + /// Polars dataframes, Arrow tables, DuckDB (e.g. DuckDBPyRelation obtained from running an SQL query). /// /// Arguments: /// data (Any): The data source containing the edges. /// time (str): The column name for the update timestamps. - /// src (str): The column name for the source node ids. - /// dst (str): The column name for the destination node ids. + /// src (str): The column name for the source node IDs. + /// dst (str): The column name for the destination node IDs. /// properties (List[str], optional): List of edge property column names. Defaults to None. /// metadata (List[str], optional): List of edge metadata column names. Defaults to None. /// shared_metadata (PropInput, optional): A dictionary of metadata properties that will be added to every edge. Defaults to None. /// layer (str, optional): A value to use as the layer for all edges. Cannot be used in combination with layer_col. 
Defaults to None. /// layer_col (str, optional): The edge layer column name in a dataframe. Cannot be used in combination with layer. Defaults to None. + /// schema (list[tuple[str, DataType | PropType | str]] | dict[str, DataType | PropType | str], optional): A list of (column_name, column_type) tuples or dict of {"column_name": column_type} to cast columns to. Defaults to None. + /// csv_options (dict[str, str | bool], optional): A dictionary of CSV reading options such as delimiter, comment, escape, quote, and terminator characters, as well as allow_truncated_rows and has_header flags. Defaults to None. /// /// Returns: /// None: This function does not return a value if the operation is successful. @@ -793,9 +766,9 @@ impl PyGraph { /// Raises: /// GraphError: If the operation fails. #[pyo3( - signature = (data, time, src, dst, properties = None, metadata = None, shared_metadata = None, layer = None, layer_col = None) + signature = (data, time, src, dst, properties = None, metadata = None, shared_metadata = None, layer = None, layer_col = None, schema = None, csv_options = None) )] - fn load_edges_from_df( + fn load_edges( &self, data: &Bound, time: &str, @@ -806,125 +779,88 @@ impl PyGraph { shared_metadata: Option>, layer: Option<&str>, layer_col: Option<&str>, + schema: Option>, + csv_options: Option, ) -> Result<(), GraphError> { let properties = convert_py_prop_args(properties.as_deref()).unwrap_or_default(); let metadata = convert_py_prop_args(metadata.as_deref()).unwrap_or_default(); - load_edges_from_arrow_c_stream( - &self.graph, - data, - time, - src, - dst, - &properties, - &metadata, - shared_metadata.as_ref(), - layer, - layer_col, - ) - } + let column_schema = convert_py_schema(schema)?; + if data.hasattr("__arrow_c_stream__")? { + load_edges_from_arrow_c_stream( + &self.graph, + data, + time, + src, + dst, + &properties, + &metadata, + shared_metadata.as_ref(), + layer, + layer_col, + column_schema, + ) + } else if let Ok(path) = data.extract::() { + // extracting PathBuf handles Strings too + let is_parquet = is_parquet_path(&path)?; + let is_csv = is_csv_path(&path)?; - /// Load edges from a Pandas DataFrame into the graph. - /// - /// Arguments: - /// df (DataFrame): The Pandas DataFrame containing the edges. - /// time (str): The column name for the update timestamps. - /// src (str): The column name for the source node ids. - /// dst (str): The column name for the destination node ids. - /// properties (List[str], optional): List of edge property column names. Defaults to None. - /// metadata (List[str], optional): List of edge metadata column names. Defaults to None. - /// shared_metadata (PropInput, optional): A dictionary of metadata properties that will be added to every edge. Defaults to None. - /// layer (str, optional): A value to use as the layer for all edges. Cannot be used in combination with layer_col. Defaults to None. - /// layer_col (str, optional): The edge layer col name in dataframe. Cannot be used in combination with layer. Defaults to None. - /// - /// Returns: - /// None: This function does not return a value, if the operation is successful. - /// - /// Raises: - /// GraphError: If the operation fails. 
- #[pyo3( - signature = (df, time, src, dst, properties = None, metadata = None, shared_metadata = None, layer = None, layer_col = None) - )] - fn load_edges_from_pandas( - &self, - df: &Bound, - time: &str, - src: &str, - dst: &str, - properties: Option>, - metadata: Option>, - shared_metadata: Option>, - layer: Option<&str>, - layer_col: Option<&str>, - ) -> Result<(), GraphError> { - let properties = convert_py_prop_args(properties.as_deref()).unwrap_or_default(); - let metadata = convert_py_prop_args(metadata.as_deref()).unwrap_or_default(); - load_edges_from_pandas( - &self.graph, - df, - time, - src, - dst, - &properties, - &metadata, - shared_metadata.as_ref(), - layer, - layer_col, - ) - } + // fail before loading anything at all to avoid loading partial data + if !is_csv && csv_options.is_some() { + return Err(GraphError::from(PyValueError::new_err(format!( + "CSV options were passed but no CSV files were detected at {}.", + path.display() + )))); + } - /// Load edges from a Parquet file into the graph. - /// - /// Arguments: - /// parquet_path (str): Parquet file or directory of Parquet files path containing edges - /// time (str): The column name for the update timestamps. - /// src (str): The column name for the source node ids. - /// dst (str): The column name for the destination node ids. - /// properties (List[str], optional): List of edge property column names. Defaults to None. - /// metadata (List[str], optional): List of edge metadata column names. Defaults to None. - /// shared_metadata (PropInput, optional): A dictionary of metadata properties that will be added to every edge. Defaults to None. - /// layer (str, optional): A value to use as the layer for all edges. Cannot be used in combination with layer_col. Defaults to None. - /// layer_col (str, optional): The edge layer col name in dataframe. Cannot be used in combination with layer. Defaults to None. - /// - /// Returns: - /// None: This function does not return a value, if the operation is successful. - /// - /// Raises: - /// GraphError: If the operation fails. 
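For the stream branch, any ArrowStreamExportable object works unchanged after the rename from `load_edges_from_df` to `load_edges`. A sketch using a pandas DataFrame (column names and values are illustrative):

import pandas as pd
from raphtory import Graph

g = Graph()
df = pd.DataFrame({
    "time": [1, 2],
    "src": ["a", "b"],
    "dst": ["b", "c"],
    "weight": [0.5, 1.5],
})
# pandas DataFrames expose __arrow_c_stream__, so this takes the Arrow stream branch
g.load_edges(df, time="time", src="src", dst="dst", properties=["weight"])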
- #[pyo3( - signature = (parquet_path, time, src, dst, properties = None, metadata = None, shared_metadata = None, layer = None, layer_col = None) - )] - fn load_edges_from_parquet( - &self, - parquet_path: PathBuf, - time: &str, - src: &str, - dst: &str, - properties: Option>, - metadata: Option>, - shared_metadata: Option>, - layer: Option<&str>, - layer_col: Option<&str>, - ) -> Result<(), GraphError> { - let properties = convert_py_prop_args(properties.as_deref()).unwrap_or_default(); - let metadata = convert_py_prop_args(metadata.as_deref()).unwrap_or_default(); - load_edges_from_parquet( - &self.graph, - parquet_path.as_path(), - time, - src, - dst, - &properties, - &metadata, - shared_metadata.as_ref(), - layer, - layer_col, - None, - ) + // wrap in Arc to avoid cloning the entire schema for Parquet, CSV, and inner loops in CSV path + let arced_schema = column_schema.map(Arc::new); + + // if-if instead of if-else to support directories with mixed parquet and CSV files + if is_parquet { + load_edges_from_parquet( + &self.graph, + &path, + time, + src, + dst, + &properties, + &metadata, + shared_metadata.as_ref(), + layer, + layer_col, + None, + arced_schema.clone(), + )?; + } + if is_csv { + load_edges_from_csv_path( + &self.graph, + &path, + time, + src, + dst, + &properties, + &metadata, + shared_metadata.as_ref(), + layer, + layer_col, + csv_options.as_ref(), + arced_schema.clone(), + )?; + } + if !is_parquet && !is_csv { + return Err(GraphError::PythonError(PyValueError::new_err("Argument 'data' contains invalid path. Paths must either point to a Parquet/CSV file, or a directory containing Parquet/CSV files"))); + } + Ok(()) + } else { + Err(GraphError::PythonError(PyValueError::new_err("Argument 'data' invalid. Valid data sources are: a single Parquet or CSV file, a directory containing Parquet or CSV files, and objects that implement an __arrow_c_stream__ method."))) + } } - /// Load node metadata into the graph from any data source that supports the ArrowStreamExportable protocol (by providing an __arrow_c_stream__() method). - /// This includes, but is not limited to: Pandas dataframes, FireDucks(.pandas) dataframes, - /// Polars dataframes, Arrow tables, DuckDB (eg. DuckDBPyRelation obtained from running an SQL query) + /// Load node metadata into the graph from any data source that supports the ArrowStreamExportable protocol (by providing an __arrow_c_stream__() method), + /// a path to a CSV or Parquet file, or a directory containing multiple CSV or Parquet files. + /// The following are known to support the ArrowStreamExportable protocol: Pandas dataframes, FireDucks(.pandas) dataframes, + /// Polars dataframes, Arrow tables, DuckDB (e.g. DuckDBPyRelation obtained from running an SQL query). /// /// Arguments: /// data (Any): The data source containing node information. @@ -933,6 +869,8 @@ impl PyGraph { /// node_type_col (str, optional): The node type column name in a dataframe. Cannot be used in combination with node_type. Defaults to None. /// metadata (List[str], optional): List of node metadata column names. Defaults to None. /// shared_metadata (PropInput, optional): A dictionary of metadata properties that will be added to every node. Defaults to None. + /// schema (list[tuple[str, DataType | PropType | str]] | dict[str, DataType | PropType | str], optional): A list of (column_name, column_type) tuples or dict of {"column_name": column_type} to cast columns to. Defaults to None. 
+ /// csv_options (dict[str, str | bool], optional): A dictionary of CSV reading options such as delimiter, comment, escape, quote, and terminator characters, as well as allow_truncated_rows and has_header flags. Defaults to None. /// /// Returns: /// None: This function does not return a value if the operation is successful. @@ -940,9 +878,9 @@ impl PyGraph { /// Raises: /// GraphError: If the operation fails. #[pyo3( - signature = (data, id, node_type = None, node_type_col = None, metadata = None, shared_metadata = None) + signature = (data, id, node_type = None, node_type_col = None, metadata = None, shared_metadata = None, schema = None, csv_options = None) )] - fn load_node_metadata_from_df( + fn load_node_metadata( &self, data: &Bound<PyAny>, id: &str, @@ -950,101 +888,78 @@ impl PyGraph { node_type_col: Option<&str>, metadata: Option<Vec<PyBackedStr>>, shared_metadata: Option<HashMap<String, Prop>>, + schema: Option<Bound<PyAny>>, + csv_options: Option<CsvReadOptions>, ) -> Result<(), GraphError> { let metadata = convert_py_prop_args(metadata.as_deref()).unwrap_or_default(); - load_node_metadata_from_arrow_c_stream( - &self.graph, - data, - id, - node_type, - node_type_col, - &metadata, - shared_metadata.as_ref(), - ) - } + let column_schema = convert_py_schema(schema)?; + if data.hasattr("__arrow_c_stream__")? { + load_node_metadata_from_arrow_c_stream( + &self.graph, + data, + id, + node_type, + node_type_col, + &metadata, + shared_metadata.as_ref(), + column_schema, + ) + } else if let Ok(path) = data.extract::<PathBuf>() { + // extracting PathBuf handles Strings too + let is_parquet = is_parquet_path(&path)?; + let is_csv = is_csv_path(&path)?; - /// Load node properties from a Pandas DataFrame. - /// - /// Arguments: - /// df (DataFrame): The Pandas DataFrame containing node information. - /// id(str): The column name for the node IDs. - /// node_type (str, optional): A value to use as the node type for all nodes. Cannot be used in combination with node_type_col. Defaults to None. - /// node_type_col (str, optional): The node type col name in dataframe. Cannot be used in combination with node_type. Defaults to None. - /// metadata (List[str], optional): List of node metadata column names. Defaults to None. - /// shared_metadata (PropInput, optional): A dictionary of metadata properties that will be added to every node. Defaults to None. - /// - /// Returns: - /// None: This function does not return a value, if the operation is successful. - /// - /// Raises: - /// GraphError: If the operation fails. - #[pyo3( - signature = (df, id, node_type = None, node_type_col = None, metadata = None, shared_metadata = None) - )] - fn load_node_props_from_pandas( - &self, - df: &Bound<PyAny>, - id: &str, - node_type: Option<&str>, - node_type_col: Option<&str>, - metadata: Option<Vec<PyBackedStr>>, - shared_metadata: Option<HashMap<String, Prop>>, - ) -> Result<(), GraphError> { - let metadata = convert_py_prop_args(metadata.as_deref()).unwrap_or_default(); - load_node_props_from_pandas( - &self.graph, - df, - id, - node_type, - node_type_col, - &metadata, - shared_metadata.as_ref(), - ) - } + // fail before loading anything at all to avoid loading partial data + if !is_csv && csv_options.is_some() { + return Err(GraphError::from(PyValueError::new_err(format!( + "CSV options were passed but no CSV files were detected at {}.", + path.display() + )))); + } - /// Load node properties from a parquet file. - /// - /// Arguments: - /// parquet_path (str): Parquet file or directory of Parquet files path containing node information. - /// id(str): The column name for the node IDs.
- /// node_type (str, optional): A value to use as the node type for all nodes. Cannot be used in combination with node_type_col. Defaults to None. - /// node_type_col (str, optional): The node type col name in dataframe. Cannot be used in combination with node_type. Defaults to None. - /// metadata (List[str], optional): List of node metadata column names. Defaults to None. - /// shared_metadata (PropInput, optional): A dictionary of metadata properties that will be added to every node. Defaults to None. - /// - /// Returns: - /// None: This function does not return a value, if the operation is successful. - /// - /// Raises: - /// GraphError: If the operation fails. - #[pyo3( - signature = (parquet_path, id, node_type = None, node_type_col = None, metadata = None, shared_metadata= None) - )] - fn load_node_props_from_parquet( - &self, - parquet_path: PathBuf, - id: &str, - node_type: Option<&str>, - node_type_col: Option<&str>, - metadata: Option>, - shared_metadata: Option>, - ) -> Result<(), GraphError> { - let metadata = convert_py_prop_args(metadata.as_deref()).unwrap_or_default(); - load_node_props_from_parquet( - &self.graph, - parquet_path.as_path(), - id, - node_type, - node_type_col, - &metadata, - shared_metadata.as_ref(), - None, - ) + // wrap in Arc to avoid cloning the entire schema for Parquet, CSV, and inner loops in CSV path + let arced_schema = column_schema.map(Arc::new); + + // if-if instead of if-else to support directories with mixed parquet and CSV files + if is_parquet { + load_node_metadata_from_parquet( + &self.graph, + path.as_path(), + id, + node_type, + node_type_col, + &metadata, + shared_metadata.as_ref(), + None, + arced_schema.clone(), + )?; + } + if is_csv { + load_node_metadata_from_csv_path( + &self.graph, + &path, + id, + node_type, + node_type_col, + &metadata, + shared_metadata.as_ref(), + csv_options.as_ref(), + arced_schema, + )?; + } + if !is_parquet && !is_csv { + return Err(GraphError::PythonError(PyValueError::new_err("Argument 'data' contains invalid path. Paths must either point to a Parquet/CSV file, or a directory containing Parquet/CSV files"))); + } + Ok(()) + } else { + Err(GraphError::PythonError(PyValueError::new_err("Argument 'data' invalid. Valid data sources are: a single Parquet or CSV file, a directory containing Parquet or CSV files, and objects that implement an __arrow_c_stream__ method."))) + } } - /// Load edge metadata into the graph from any data source that supports the ArrowStreamExportable protocol (by providing an __arrow_c_stream__() method). - /// This includes, but is not limited to: Pandas dataframes, FireDucks(.pandas) dataframes, - /// Polars dataframes, Arrow tables, DuckDB (eg. DuckDBPyRelation obtained from running an SQL query) + /// Load edge metadata into the graph from any data source that supports the ArrowStreamExportable protocol (by providing an __arrow_c_stream__() method), + /// a path to a CSV or Parquet file, or a directory containing multiple CSV or Parquet files. + /// The following are known to support the ArrowStreamExportable protocol: Pandas dataframes, FireDucks(.pandas) dataframes, + /// Polars dataframes, Arrow tables, DuckDB (e.g. DuckDBPyRelation obtained from running an SQL query). /// /// Arguments: /// data (Any): The data source containing edge information. @@ -1054,6 +969,8 @@ impl PyGraph { /// shared_metadata (PropInput, optional): A dictionary of metadata properties that will be added to every edge. Defaults to None. /// layer (str, optional): The edge layer name. 
Defaults to None. /// layer_col (str, optional): The edge layer column name in a dataframe. Defaults to None. + /// schema (list[tuple[str, DataType | PropType | str]] | dict[str, DataType | PropType | str], optional): A list of (column_name, column_type) tuples or dict of {"column_name": column_type} to cast columns to. Defaults to None. + /// csv_options (dict[str, str | bool], optional): A dictionary of CSV reading options such as delimiter, comment, escape, quote, and terminator characters, as well as allow_truncated_rows and has_header flags. Defaults to None. /// /// Returns: /// None: This function does not return a value if the operation is successful. @@ -1061,9 +978,9 @@ impl PyGraph { /// Raises: /// GraphError: If the operation fails. #[pyo3( - signature = (data, src, dst, metadata = None, shared_metadata = None, layer = None, layer_col = None) + signature = (data, src, dst, metadata = None, shared_metadata = None, layer = None, layer_col = None, schema = None, csv_options = None) )] - fn load_edge_metadata_from_df( + fn load_edge_metadata( &self, data: &Bound, src: &str, @@ -1072,103 +989,75 @@ impl PyGraph { shared_metadata: Option>, layer: Option<&str>, layer_col: Option<&str>, + schema: Option>, + csv_options: Option, ) -> Result<(), GraphError> { let metadata = convert_py_prop_args(metadata.as_deref()).unwrap_or_default(); - load_edge_metadata_from_arrow_c_stream( - &self.graph, - data, - src, - dst, - &metadata, - shared_metadata.as_ref(), - layer, - layer_col, - ) - } + let column_schema = convert_py_schema(schema)?; + if data.hasattr("__arrow_c_stream__")? { + load_edge_metadata_from_arrow_c_stream( + &self.graph, + data, + src, + dst, + &metadata, + shared_metadata.as_ref(), + layer, + layer_col, + column_schema, + ) + } else if let Ok(path) = data.extract::() { + // extracting PathBuf handles Strings too + let is_parquet = is_parquet_path(&path)?; + let is_csv = is_csv_path(&path)?; - /// Load edge properties from a Pandas DataFrame. - /// - /// Arguments: - /// df (DataFrame): The Pandas DataFrame containing edge information. - /// src (str): The column name for the source node. - /// dst (str): The column name for the destination node. - /// metadata (List[str], optional): List of edge metadata column names. Defaults to None. - /// shared_metadata (PropInput, optional): A dictionary of metadata properties that will be added to every edge. Defaults to None. - /// layer (str, optional): The edge layer name. Defaults to None. - /// layer_col (str, optional): The edge layer col name in dataframe. Defaults to None. - /// - /// Returns: - /// None: This function does not return a value, if the operation is successful. - /// - /// Raises: - /// GraphError: If the operation fails. 
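`load_edge_metadata` follows the same dispatch rules. A sketch using a pyarrow Table, which also exposes `__arrow_c_stream__` (the table contents and shared metadata value are illustrative):

import pyarrow as pa
from raphtory import Graph

g = Graph()
tbl = pa.table({"time": [1, 2], "src": ["a", "b"], "dst": ["b", "c"], "relation": ["follows", "likes"]})
g.load_edges(tbl, time="time", src="src", dst="dst")
# Attach static metadata to the edges loaded above; shared_metadata values go on every edge
g.load_edge_metadata(tbl, src="src", dst="dst", metadata=["relation"], shared_metadata={"source": "batch-import"})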
- #[pyo3( - signature = (df, src, dst, metadata = None, shared_metadata = None, layer = None, layer_col = None) - )] - fn load_edge_props_from_pandas( - &self, - df: &Bound, - src: &str, - dst: &str, - metadata: Option>, - shared_metadata: Option>, - layer: Option<&str>, - layer_col: Option<&str>, - ) -> Result<(), GraphError> { - let metadata = convert_py_prop_args(metadata.as_deref()).unwrap_or_default(); - load_edge_props_from_pandas( - &self.graph, - df, - src, - dst, - &metadata, - shared_metadata.as_ref(), - layer, - layer_col, - ) - } + // fail before loading anything at all to avoid loading partial data + if !is_csv && csv_options.is_some() { + return Err(GraphError::from(PyValueError::new_err(format!( + "CSV options were passed but no CSV files were detected at {}.", + path.display() + )))); + } - /// Load edge properties from parquet file - /// - /// Arguments: - /// parquet_path (str): Parquet file or directory of Parquet files path containing edge information. - /// src (str): The column name for the source node. - /// dst (str): The column name for the destination node. - /// metadata (List[str], optional): List of edge metadata column names. Defaults to None. - /// shared_metadata (PropInput, optional): A dictionary of metadata properties that will be added to every edge. Defaults to None. - /// layer (str, optional): The edge layer name. Defaults to None. - /// layer_col (str, optional): The edge layer col name in dataframe. Defaults to None. - /// - /// Returns: - /// None: This function does not return a value, if the operation is successful. - /// - /// Raises: - /// GraphError: If the operation fails. - #[pyo3( - signature = (parquet_path, src, dst, metadata = None, shared_metadata = None, layer = None, layer_col = None) - )] - fn load_edge_props_from_parquet( - &self, - parquet_path: PathBuf, - src: &str, - dst: &str, - metadata: Option>, - shared_metadata: Option>, - layer: Option<&str>, - layer_col: Option<&str>, - ) -> Result<(), GraphError> { - let metadata = convert_py_prop_args(metadata.as_deref()).unwrap_or_default(); - load_edge_props_from_parquet( - &self.graph, - parquet_path.as_path(), - src, - dst, - &metadata, - shared_metadata.as_ref(), - layer, - layer_col, - None, - ) + // wrap in Arc to avoid cloning the entire schema for Parquet, CSV, and inner loops in CSV path + let arced_schema = column_schema.map(Arc::new); + + // if-if instead of if-else to support directories with mixed parquet and CSV files + if is_parquet { + load_edge_metadata_from_parquet( + &self.graph, + path.as_path(), + src, + dst, + &metadata, + shared_metadata.as_ref(), + layer, + layer_col, + None, + arced_schema.clone(), + )?; + } + if is_csv { + load_edge_metadata_from_csv_path( + &self.graph, + &path, + src, + dst, + &metadata, + shared_metadata.as_ref(), + layer, + layer_col, + csv_options.as_ref(), + arced_schema, + )?; + } + if !is_parquet && !is_csv { + return Err(GraphError::PythonError(PyValueError::new_err("Argument 'data' contains invalid path. Paths must either point to a Parquet/CSV file, or a directory containing Parquet/CSV files"))); + } + Ok(()) + } else { + Err(GraphError::PythonError(PyValueError::new_err("Argument 'data' invalid. 
Valid data sources are: a single Parquet or CSV file, a directory containing Parquet or CSV files, and objects that implement an __arrow_c_stream__ method."))) + } } /// Create graph index diff --git a/raphtory/src/python/graph/graph_with_deletions.rs b/raphtory/src/python/graph/graph_with_deletions.rs index a37a588aed..03b1ca0386 100644 --- a/raphtory/src/python/graph/graph_with_deletions.rs +++ b/raphtory/src/python/graph/graph_with_deletions.rs @@ -5,10 +5,7 @@ //! create windows, and query the graph with a variety of algorithms. //! It is a wrapper around a set of shards, which are the actual graph data structures. //! In Python, this class wraps around the rust graph. -use super::{ - graph::{PyGraph, PyGraphEncoder}, - io::pandas_loaders::*, -}; +use super::graph::{PyGraph, PyGraphEncoder}; use crate::{ db::{ api::mutation::{AdditionOps, PropertyAdditionOps}, @@ -22,9 +19,12 @@ use crate::{ edge::PyEdge, index::PyIndexSpec, io::arrow_loaders::{ - load_edge_deletions_from_arrow_c_stream, load_edge_metadata_from_arrow_c_stream, - load_edges_from_arrow_c_stream, load_node_metadata_from_arrow_c_stream, - load_nodes_from_arrow_c_stream, + convert_py_prop_args, convert_py_schema, is_csv_path, + load_edge_deletions_from_arrow_c_stream, load_edge_deletions_from_csv_path, + load_edge_metadata_from_arrow_c_stream, load_edge_metadata_from_csv_path, + load_edges_from_arrow_c_stream, load_edges_from_csv_path, + load_node_metadata_from_arrow_c_stream, load_node_metadata_from_csv_path, + load_nodes_from_arrow_c_stream, load_nodes_from_csv_path, CsvReadOptions, }, node::PyNode, views::graph_view::PyGraphView, @@ -33,7 +33,7 @@ use crate::{ }, serialise::StableEncode, }; -use pyo3::{prelude::*, pybacked::PyBackedStr}; +use pyo3::{exceptions::PyValueError, prelude::*, pybacked::PyBackedStr}; use raphtory_api::{ core::{ entities::{properties::prop::Prop, GID}, @@ -46,6 +46,7 @@ use std::{ collections::HashMap, fmt::{Debug, Formatter}, path::PathBuf, + sync::Arc, }; /// A temporal graph that allows edges and nodes to be deleted. @@ -572,9 +573,10 @@ impl PyPersistentGraph { PyPersistentGraph::py_from_db_graph(self.graph.persistent_graph()) } - /// Load nodes into the graph from any data source that supports the ArrowStreamExportable protocol (by providing an __arrow_c_stream__() method). - /// This includes, but is not limited to: Pandas dataframes, FireDucks(.pandas) dataframes, - /// Polars dataframes, Arrow tables, DuckDB (eg. DuckDBPyRelation obtained from running an SQL query) + /// Load nodes into the graph from any data source that supports the ArrowStreamExportable protocol (by providing an __arrow_c_stream__() method), + /// a path to a CSV or Parquet file, or a directory containing multiple CSV or Parquet files. + /// The following are known to support the ArrowStreamExportable protocol: Pandas dataframes, FireDucks(.pandas) dataframes, + /// Polars dataframes, Arrow tables, DuckDB (e.g. DuckDBPyRelation obtained from running an SQL query). /// /// Arguments: /// data (Any): The data source containing the nodes. @@ -585,6 +587,8 @@ impl PyPersistentGraph { /// properties (List[str], optional): List of node property column names. Defaults to None. /// metadata (List[str], optional): List of node metadata column names. Defaults to None. /// shared_metadata (PropInput, optional): A dictionary of metadata properties that will be added to every node. Defaults to None. 
+ /// schema (list[tuple[str, DataType | PropType | str]] | dict[str, DataType | PropType | str], optional): A list of (column_name, column_type) tuples or dict of {"column_name": column_type} to cast columns to. Defaults to None. + /// csv_options (dict[str, str | bool], optional): A dictionary of CSV reading options such as delimiter, comment, escape, quote, and terminator characters, as well as allow_truncated_rows and has_header flags. Defaults to None. /// /// Returns: /// None: This function does not return a value if the operation is successful. @@ -592,11 +596,11 @@ impl PyPersistentGraph { /// Raises: /// GraphError: If the operation fails. #[pyo3( - signature = (data, time, id, node_type = None, node_type_col = None, properties = None, metadata= None, shared_metadata = None) + signature = (data, time, id, node_type = None, node_type_col = None, properties = None, metadata= None, shared_metadata = None, schema = None, csv_options = None) )] - fn load_nodes_from_df<'py>( + fn load_nodes( &self, - data: &Bound<'py, PyAny>, + data: &Bound, time: &str, id: &str, node_type: Option<&str>, @@ -604,125 +608,98 @@ impl PyPersistentGraph { properties: Option>, metadata: Option>, shared_metadata: Option>, + schema: Option>, + csv_options: Option, ) -> Result<(), GraphError> { let properties = convert_py_prop_args(properties.as_deref()).unwrap_or_default(); let metadata = convert_py_prop_args(metadata.as_deref()).unwrap_or_default(); - load_nodes_from_arrow_c_stream( - &self.graph, - data, - time, - id, - node_type, - node_type_col, - &properties, - &metadata, - shared_metadata.as_ref(), - ) - } + let column_schema = convert_py_schema(schema)?; + if data.hasattr("__arrow_c_stream__")? { + load_nodes_from_arrow_c_stream( + &self.graph, + data, + time, + id, + node_type, + node_type_col, + &properties, + &metadata, + shared_metadata.as_ref(), + column_schema, + ) + } else if let Ok(path) = data.extract::() { + // extracting PathBuf handles Strings too + let is_parquet = is_parquet_path(&path)?; + let is_csv = is_csv_path(&path)?; - /// Load nodes from a Pandas DataFrame into the graph. - /// - /// Arguments: - /// df (DataFrame): The Pandas DataFrame containing the nodes. - /// time (str): The column name for the timestamps. - /// id (str): The column name for the node IDs. - /// node_type (str, optional): A value to use as the node type for all nodes. Cannot be used in combination with node_type_col. Defaults to None. - /// node_type_col (str, optional): The node type col name in dataframe. Cannot be used in combination with node_type. Defaults to None. - /// properties (List[str], optional): List of node property column names. Defaults to None. - /// metadata (List[str], optional): List of node metadata column names. Defaults to None. - /// shared_metadata (PropInput, optional): A dictionary of metadata properties that will be added to every node. Defaults to None. - /// - /// Returns: - /// None: This function does not return a value, if the operation is successful. - /// - /// Raises: - /// GraphError: If the operation fails. 
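The `schema` argument accepts either (column_name, column_type) tuples or a dict, with pyarrow DataTypes, raphtory PropTypes, or type-name strings as values; matching columns are cast before ingestion. A sketch against a hypothetical Parquet file:

import pyarrow as pa
from raphtory import PersistentGraph

g = PersistentGraph()
g.load_nodes(
    "nodes.parquet",                                    # hypothetical file
    time="time",
    id="id",
    properties=["score"],
    schema={"id": pa.string(), "score": pa.float64()},  # cast these columns before loading
)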
- #[pyo3(signature = (df,time,id, node_type = None, node_type_col = None, properties = None, metadata = None, shared_metadata = None))] - fn load_nodes_from_pandas( - &self, - df: &Bound, - time: &str, - id: &str, - node_type: Option<&str>, - node_type_col: Option<&str>, - properties: Option>, - metadata: Option>, - shared_metadata: Option>, - ) -> Result<(), GraphError> { - let properties = convert_py_prop_args(properties.as_deref()).unwrap_or_default(); - let metadata = convert_py_prop_args(metadata.as_deref()).unwrap_or_default(); - load_nodes_from_pandas( - &self.graph, - df, - time, - id, - node_type, - node_type_col, - &properties, - &metadata, - shared_metadata.as_ref(), - ) - } + // fail before loading anything at all to avoid loading partial data + if !is_csv && csv_options.is_some() { + return Err(GraphError::from(PyValueError::new_err(format!( + "CSV options were passed but no CSV files were detected at {}.", + path.display() + )))); + } - /// Load nodes from a Parquet file into the graph. - /// - /// Arguments: - /// parquet_path (str): Parquet file or directory of Parquet files containing the nodes - /// time (str): The column name for the timestamps. - /// id (str): The column name for the node IDs. - /// node_type (str, optional): A value to use as the node type for all nodes. Cannot be used in combination with node_type_col. Defaults to None. - /// node_type_col (str, optional): The node type col name in dataframe. Cannot be used in combination with node_type. Defaults to None. - /// properties (List[str], optional): List of node property column names. Defaults to None. - /// metadata (List[str], optional): List of node metadata column names. Defaults to None. - /// shared_metadata (PropInput, optional): A dictionary of metadata properties that will be added to every node. Defaults to None. - /// - /// Returns: - /// None: This function does not return a value, if the operation is successful. - /// - /// Raises: - /// GraphError: If the operation fails. - #[pyo3(signature = (parquet_path, time,id, node_type = None, node_type_col = None, properties = None, metadata = None, shared_metadata = None))] - fn load_nodes_from_parquet( - &self, - parquet_path: PathBuf, - time: &str, - id: &str, - node_type: Option<&str>, - node_type_col: Option<&str>, - properties: Option>, - metadata: Option>, - shared_metadata: Option>, - ) -> Result<(), GraphError> { - let properties = convert_py_prop_args(properties.as_deref()).unwrap_or_default(); - let metadata = convert_py_prop_args(metadata.as_deref()).unwrap_or_default(); - load_nodes_from_parquet( - &self.graph, - parquet_path.as_path(), - time, - id, - node_type, - node_type_col, - &properties, - &metadata, - shared_metadata.as_ref(), - None, - ) + // wrap in Arc to avoid cloning the entire schema for Parquet, CSV, and inner loops in CSV path + let arced_schema = column_schema.map(Arc::new); + + // if-if instead of if-else to support directories with mixed parquet and CSV files + if is_parquet { + load_nodes_from_parquet( + &self.graph, + path.as_path(), + time, + id, + node_type, + node_type_col, + &properties, + &metadata, + shared_metadata.as_ref(), + None, + arced_schema.clone(), + )?; + } + if is_csv { + load_nodes_from_csv_path( + &self.graph, + &path, + time, + id, + node_type, + node_type_col, + &properties, + &metadata, + shared_metadata.as_ref(), + csv_options.as_ref(), + arced_schema, + )?; + } + if !is_parquet && !is_csv { + return Err(GraphError::PythonError(PyValueError::new_err("Argument 'data' contains invalid path. 
Paths must either point to a Parquet/CSV file, or a directory containing Parquet/CSV files"))); + } + Ok(()) + } else { + Err(GraphError::PythonError(PyValueError::new_err("Argument 'data' invalid. Valid data sources are: a single Parquet or CSV file, a directory containing Parquet or CSV files, and objects that implement an __arrow_c_stream__ method."))) + } } - /// Load edges into the graph from any data source that supports the ArrowStreamExportable protocol (by providing an __arrow_c_stream__() method). - /// This includes, but is not limited to: Pandas dataframes, FireDucks(.pandas) dataframes, - /// Polars dataframes, Arrow tables, DuckDB (eg. DuckDBPyRelation obtained from running an SQL query) + /// Load edges into the graph from any data source that supports the ArrowStreamExportable protocol (by providing an __arrow_c_stream__() method), + /// a path to a CSV or Parquet file, or a directory containing multiple CSV or Parquet files. + /// The following are known to support the ArrowStreamExportable protocol: Pandas dataframes, FireDucks(.pandas) dataframes, + /// Polars dataframes, Arrow tables, DuckDB (e.g. DuckDBPyRelation obtained from running an SQL query). /// /// Arguments: /// data (Any): The data source containing the edges. /// time (str): The column name for the update timestamps. - /// src (str): The column name for the source node ids. - /// dst (str): The column name for the destination node ids. + /// src (str): The column name for the source node IDs. + /// dst (str): The column name for the destination node IDs. /// properties (List[str], optional): List of edge property column names. Defaults to None. /// metadata (List[str], optional): List of edge metadata column names. Defaults to None. /// shared_metadata (PropInput, optional): A dictionary of metadata properties that will be added to every edge. Defaults to None. /// layer (str, optional): A value to use as the layer for all edges. Cannot be used in combination with layer_col. Defaults to None. /// layer_col (str, optional): The edge layer column name in a dataframe. Cannot be used in combination with layer. Defaults to None. + /// schema (list[tuple[str, DataType | PropType | str]] | dict[str, DataType | PropType | str], optional): A list of (column_name, column_type) tuples or dict of {"column_name": column_type} to cast columns to. Defaults to None. + /// csv_options (dict[str, str | bool], optional): A dictionary of CSV reading options such as delimiter, comment, escape, quote, and terminator characters, as well as allow_truncated_rows and has_header flags. Defaults to None. /// /// Returns: /// None: This function does not return a value if the operation is successful. @@ -730,9 +707,9 @@ impl PyPersistentGraph { /// Raises: /// GraphError: If the operation fails. 
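Because the Parquet and CSV branches run as two sequential `if` blocks rather than `if`/`else`, a directory holding both formats is ingested in a single call. A sketch with a hypothetical mixed directory:

from raphtory import PersistentGraph

g = PersistentGraph()
# edge_batches/ might hold edges-2023.parquet alongside edges-2024.csv;
# both groups are loaded because the two format branches are not mutually exclusive
g.load_edges("edge_batches/", time="time", src="src", dst="dst", layer_col="layer")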
#[pyo3( - signature = (data, time, src, dst, properties = None, metadata = None, shared_metadata = None, layer = None, layer_col = None) + signature = (data, time, src, dst, properties = None, metadata = None, shared_metadata = None, layer = None, layer_col = None, schema = None, csv_options = None) )] - fn load_edges_from_df( + fn load_edges( &self, data: &Bound, time: &str, @@ -743,121 +720,88 @@ impl PyPersistentGraph { shared_metadata: Option>, layer: Option<&str>, layer_col: Option<&str>, + schema: Option>, + csv_options: Option, ) -> Result<(), GraphError> { let properties = convert_py_prop_args(properties.as_deref()).unwrap_or_default(); let metadata = convert_py_prop_args(metadata.as_deref()).unwrap_or_default(); - load_edges_from_arrow_c_stream( - &self.graph, - data, - time, - src, - dst, - &properties, - &metadata, - shared_metadata.as_ref(), - layer, - layer_col, - ) - } + let column_schema = convert_py_schema(schema)?; + if data.hasattr("__arrow_c_stream__")? { + load_edges_from_arrow_c_stream( + &self.graph, + data, + time, + src, + dst, + &properties, + &metadata, + shared_metadata.as_ref(), + layer, + layer_col, + column_schema, + ) + } else if let Ok(path) = data.extract::() { + // extracting PathBuf handles Strings too + let is_parquet = is_parquet_path(&path)?; + let is_csv = is_csv_path(&path)?; - /// Load edges from a Pandas DataFrame into the graph. - /// - /// Arguments: - /// df (DataFrame): The Pandas DataFrame containing the edges. - /// time (str): The column name for the update timestamps. - /// src (str): The column name for the source node ids. - /// dst (str): The column name for the destination node ids. - /// properties (List[str], optional): List of edge property column names. Defaults to None. - /// metadata (List[str], optional): List of edge metadata column names. Defaults to None. - /// shared_metadata (PropInput, optional): A dictionary of metadata properties that will be added to every edge. Defaults to None. - /// layer (str, optional): A value to use as the layer for all edges. Cannot be used in combination with layer_col. Defaults to None. - /// layer_col (str, optional): The edge layer col name in dataframe. Cannot be used in combination with layer. Defaults to None. - /// - /// Returns: - /// None: This function does not return a value, if the operation is successful. - /// - /// Raises: - /// GraphError: If the operation fails. - #[pyo3(signature = (df, time, src, dst, properties = None, metadata = None, shared_metadata = None, layer = None, layer_col = None))] - fn load_edges_from_pandas( - &self, - df: &Bound, - time: &str, - src: &str, - dst: &str, - properties: Option>, - metadata: Option>, - shared_metadata: Option>, - layer: Option<&str>, - layer_col: Option<&str>, - ) -> Result<(), GraphError> { - let properties = convert_py_prop_args(properties.as_deref()).unwrap_or_default(); - let metadata = convert_py_prop_args(metadata.as_deref()).unwrap_or_default(); - load_edges_from_pandas( - &self.graph, - df, - time, - src, - dst, - &properties, - &metadata, - shared_metadata.as_ref(), - layer, - layer_col, - ) - } + // fail before loading anything at all to avoid loading partial data + if !is_csv && csv_options.is_some() { + return Err(GraphError::from(PyValueError::new_err(format!( + "CSV options were passed but no CSV files were detected at {}.", + path.display() + )))); + } - /// Load edges from a Parquet file into the graph. 
- /// - /// Arguments: - /// parquet_path (str): Parquet file or directory of Parquet files path containing edges - /// time (str): The column name for the update timestamps. - /// src (str): The column name for the source node ids. - /// dst (str): The column name for the destination node ids. - /// properties (List[str], optional): List of edge property column names. Defaults to None. - /// metadata (List[str], optional): List of edge metadata column names. Defaults to None. - /// shared_metadata (PropInput, optional): A dictionary of metadata properties that will be added to every edge. Defaults to None. - /// layer (str, optional): A value to use as the layer for all edges. Cannot be used in combination with layer_col. Defaults to None. - /// layer_col (str, optional): The edge layer col name in dataframe. Cannot be used in combination with layer. Defaults to None. - /// - /// Returns: - /// None: This function does not return a value, if the operation is successful. - /// - /// Raises: - /// GraphError: If the operation fails. - #[pyo3(signature = (parquet_path, time, src, dst, properties = None, metadata = None, shared_metadata = None, layer = None, layer_col = None))] - fn load_edges_from_parquet( - &self, - parquet_path: PathBuf, - time: &str, - src: &str, - dst: &str, - properties: Option>, - metadata: Option>, - shared_metadata: Option>, - layer: Option<&str>, - layer_col: Option<&str>, - ) -> Result<(), GraphError> { - let properties = convert_py_prop_args(properties.as_deref()).unwrap_or_default(); - let metadata = convert_py_prop_args(metadata.as_deref()).unwrap_or_default(); - load_edges_from_parquet( - &self.graph, - parquet_path.as_path(), - time, - src, - dst, - &properties, - &metadata, - shared_metadata.as_ref(), - layer, - layer_col, - None, - ) + // wrap in Arc to avoid cloning the entire schema for Parquet, CSV, and inner loops in CSV path + let arced_schema = column_schema.map(Arc::new); + + // if-if instead of if-else to support directories with mixed parquet and CSV files + if is_parquet { + load_edges_from_parquet( + &self.graph, + &path, + time, + src, + dst, + &properties, + &metadata, + shared_metadata.as_ref(), + layer, + layer_col, + None, + arced_schema.clone(), + )?; + } + if is_csv { + load_edges_from_csv_path( + &self.graph, + &path, + time, + src, + dst, + &properties, + &metadata, + shared_metadata.as_ref(), + layer, + layer_col, + csv_options.as_ref(), + arced_schema.clone(), + )?; + } + if !is_parquet && !is_csv { + return Err(GraphError::PythonError(PyValueError::new_err("Argument 'data' contains invalid path. Paths must either point to a Parquet/CSV file, or a directory containing Parquet/CSV files"))); + } + Ok(()) + } else { + Err(GraphError::PythonError(PyValueError::new_err("Argument 'data' invalid. Valid data sources are: a single Parquet or CSV file, a directory containing Parquet or CSV files, and objects that implement an __arrow_c_stream__ method."))) + } } - /// Load edge deletions into the graph from any data source that supports the ArrowStreamExportable protocol (by providing an __arrow_c_stream__() method). - /// This includes, but is not limited to: Pandas dataframes, FireDucks(.pandas) dataframes, - /// Polars dataframes, Arrow tables, DuckDB (eg. 
DuckDBPyRelation obtained from running an SQL query) + /// Load edge deletions into the graph from any data source that supports the ArrowStreamExportable protocol (by providing an __arrow_c_stream__() method), + /// a path to a CSV or Parquet file, or a directory containing multiple CSV or Parquet files. + /// The following are known to support the ArrowStreamExportable protocol: Pandas dataframes, FireDucks(.pandas) dataframes, + /// Polars dataframes, Arrow tables, DuckDB (e.g. DuckDBPyRelation obtained from running an SQL query). /// /// Arguments: /// data (Any): The data source containing the edges. @@ -866,14 +810,16 @@ impl PyPersistentGraph { /// dst (str): The column name for the destination node ids. /// layer (str, optional): A value to use as the layer for all edges. Cannot be used in combination with layer_col. Defaults to None. /// layer_col (str, optional): The edge layer col name in the data source. Cannot be used in combination with layer. Defaults to None. + /// schema (list[tuple[str, DataType | PropType | str]] | dict[str, DataType | PropType | str], optional): A list of (column_name, column_type) tuples or dict of {"column_name": column_type} to cast columns to. Defaults to None. + /// csv_options (dict[str, str | bool], optional): A dictionary of CSV reading options such as delimiter, comment, escape, quote, and terminator characters, as well as allow_truncated_rows and has_header flags. Defaults to None. /// /// Returns: /// None: This function does not return a value, if the operation is successful. /// /// Raises: /// GraphError: If the operation fails. - #[pyo3(signature = (data, time, src, dst, layer = None, layer_col = None))] - fn load_edge_deletions_from_df( + #[pyo3(signature = (data, time, src, dst, layer = None, layer_col = None, schema = None, csv_options = None))] + fn load_edge_deletions( &self, data: &Bound<PyAny>, time: &str, @@ -881,78 +827,77 @@ dst: &str, layer: Option<&str>, layer_col: Option<&str>, + schema: Option<Bound<PyAny>>, + csv_options: Option<CsvReadOptions>, ) -> Result<(), GraphError> { - load_edge_deletions_from_arrow_c_stream(&self.graph, data, time, src, dst, layer, layer_col) - } + let column_schema = convert_py_schema(schema)?; + if data.hasattr("__arrow_c_stream__")? { + load_edge_deletions_from_arrow_c_stream( + &self.graph, + data, + time, + src, + dst, + layer, + layer_col, + column_schema, + ) + } else if let Ok(path) = data.extract::<PathBuf>() { + // extracting PathBuf handles Strings too + let is_parquet = is_parquet_path(&path)?; + let is_csv = is_csv_path(&path)?; - /// Load edge deletions from a Pandas DataFrame into the graph. - /// - /// Arguments: - /// df (DataFrame): The Pandas DataFrame containing the edges. - /// time (str): The column name for the update timestamps. - /// src (str): The column name for the source node ids. - /// dst (str): The column name for the destination node ids. - /// layer (str, optional): A value to use as the layer for all edges. Cannot be used in combination with layer_col. Defaults to None. - /// layer_col (str, optional): The edge layer col name in dataframe. Cannot be used in combination with layer. Defaults to None. - /// - /// Returns: - /// None: This function does not return a value, if the operation is successful. - /// - /// Raises: - /// GraphError: If the operation fails.
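`load_edge_deletions` gains the same three data sources as the other loaders. A sketch on the persistent graph (file names are hypothetical):

from raphtory import PersistentGraph

g = PersistentGraph()
g.load_edges("edges.csv", time="time", src="src", dst="dst")
g.load_edge_deletions("deletions.csv", time="time", src="src", dst="dst")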
- #[pyo3(signature = (df, time, src, dst, layer = None, layer_col = None))] - fn load_edge_deletions_from_pandas( - &self, - df: &Bound, - time: &str, - src: &str, - dst: &str, - layer: Option<&str>, - layer_col: Option<&str>, - ) -> Result<(), GraphError> { - load_edge_deletions_from_pandas(&self.graph, df, time, src, dst, layer, layer_col) - } + // fail before loading anything at all to avoid loading partial data + if !is_csv && csv_options.is_some() { + return Err(GraphError::from(PyValueError::new_err(format!( + "CSV options were passed but no CSV files were detected at {}.", + path.display() + )))); + } - /// Load edges deletions from a Parquet file into the graph. - /// - /// Arguments: - /// parquet_path (str): Parquet file or directory of Parquet files path containing node information. - /// src (str): The column name for the source node ids. - /// dst (str): The column name for the destination node ids. - /// time (str): The column name for the update timestamps. - /// layer (str, optional): A value to use as the layer for all edges. Cannot be used in combination with layer_col. Defaults to None. - /// layer_col (str, optional): The edge layer col name in dataframe. Cannot be used in combination with layer. Defaults to None. - /// - /// Returns: - /// None: This function does not return a value, if the operation is successful. - /// - /// Raises: - /// GraphError: If the operation fails. - #[pyo3(signature = (parquet_path, time, src, dst, layer = None, layer_col = None))] - fn load_edge_deletions_from_parquet( - &self, - parquet_path: PathBuf, - time: &str, - src: &str, - dst: &str, - layer: Option<&str>, - layer_col: Option<&str>, - ) -> Result<(), GraphError> { - load_edge_deletions_from_parquet( - &self.graph, - parquet_path.as_path(), - time, - src, - dst, - layer, - layer_col, - None, - ) + // wrap in Arc to avoid cloning the entire schema for Parquet, CSV, and inner loops in CSV path + let arced_schema = column_schema.map(Arc::new); + + // if-if instead of if-else to support directories with mixed parquet and CSV files + if is_parquet { + load_edge_deletions_from_parquet( + &self.graph, + path.as_path(), + time, + src, + dst, + layer, + layer_col, + None, + arced_schema.clone(), + )?; + } + if is_csv { + load_edge_deletions_from_csv_path( + &self.graph, + &path, + time, + src, + dst, + layer, + layer_col, + csv_options.as_ref(), + arced_schema, + )?; + } + if !is_parquet && !is_csv { + return Err(GraphError::PythonError(PyValueError::new_err("Argument 'data' contains invalid path. Paths must either point to a Parquet/CSV file, or a directory containing Parquet/CSV files"))); + } + Ok(()) + } else { + Err(GraphError::PythonError(PyValueError::new_err("Argument 'data' invalid. Valid data sources are: a single Parquet or CSV file, a directory containing Parquet or CSV files, and objects that implement an __arrow_c_stream__ method."))) + } } - /// Load node metadata into the graph from any data source that supports the ArrowStreamExportable protocol (by providing an __arrow_c_stream__() method). - /// This includes, but is not limited to: Pandas dataframes, FireDucks(.pandas) dataframes, - /// Polars dataframes, Arrow tables, DuckDB (eg. DuckDBPyRelation obtained from running an SQL query) + /// Load node metadata into the graph from any data source that supports the ArrowStreamExportable protocol (by providing an __arrow_c_stream__() method), + /// a path to a CSV or Parquet file, or a directory containing multiple CSV or Parquet files. 
+ /// The following are known to support the ArrowStreamExportable protocol: Pandas dataframes, FireDucks(.pandas) dataframes, + /// Polars dataframes, Arrow tables, DuckDB (e.g. DuckDBPyRelation obtained from running an SQL query). /// /// Arguments: /// data (Any): The data source containing node information. @@ -961,6 +906,8 @@ impl PyPersistentGraph { /// node_type_col (str, optional): The node type column name in a dataframe. Cannot be used in combination with node_type. Defaults to None. /// metadata (List[str], optional): List of node metadata column names. Defaults to None. /// shared_metadata (PropInput, optional): A dictionary of metadata properties that will be added to every node. Defaults to None. + /// schema (list[tuple[str, DataType | PropType | str]] | dict[str, DataType | PropType | str], optional): A list of (column_name, column_type) tuples or dict of {"column_name": column_type} to cast columns to. Defaults to None. + /// csv_options (dict[str, str | bool], optional): A dictionary of CSV reading options such as delimiter, comment, escape, quote, and terminator characters, as well as allow_truncated_rows and has_header flags. Defaults to None. /// /// Returns: /// None: This function does not return a value if the operation is successful. @@ -968,9 +915,9 @@ impl PyPersistentGraph { /// Raises: /// GraphError: If the operation fails. #[pyo3( - signature = (data, id, node_type = None, node_type_col = None, metadata = None, shared_metadata = None) + signature = (data, id, node_type = None, node_type_col = None, metadata = None, shared_metadata = None, schema = None, csv_options = None) )] - fn load_node_metadata_from_df( + fn load_node_metadata( &self, data: &Bound, id: &str, @@ -978,97 +925,78 @@ impl PyPersistentGraph { node_type_col: Option<&str>, metadata: Option>, shared_metadata: Option>, + schema: Option>, + csv_options: Option, ) -> Result<(), GraphError> { let metadata = convert_py_prop_args(metadata.as_deref()).unwrap_or_default(); - load_node_metadata_from_arrow_c_stream( - &self.graph, - data, - id, - node_type, - node_type_col, - &metadata, - shared_metadata.as_ref(), - ) - } + let column_schema = convert_py_schema(schema)?; + if data.hasattr("__arrow_c_stream__")? { + load_node_metadata_from_arrow_c_stream( + &self.graph, + data, + id, + node_type, + node_type_col, + &metadata, + shared_metadata.as_ref(), + column_schema, + ) + } else if let Ok(path) = data.extract::() { + // extracting PathBuf handles Strings too + let is_parquet = is_parquet_path(&path)?; + let is_csv = is_csv_path(&path)?; - /// Load node properties from a Pandas DataFrame. - /// - /// Arguments: - /// df (DataFrame): The Pandas DataFrame containing node information. - /// id(str): The column name for the node IDs. - /// node_type (str, optional): A value to use as the node type for all nodes. Cannot be used in combination with node_type_col. Defaults to None. - /// node_type_col (str, optional): The node type col name in dataframe. Cannot be used in combination with node_type. Defaults to None. - /// metadata (List[str], optional): List of node metadata column names. Defaults to None. - /// shared_metadata (PropInput, optional): A dictionary of metadata properties that will be added to every node. Defaults to None. - /// - /// Returns: - /// None: This function does not return a value, if the operation is successful. - /// - /// Raises: - /// GraphError: If the operation fails. 
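The early `csv_options` guard rejects mismatched arguments before any rows are read, so a failed call leaves the graph untouched. A sketch of the failure mode (file name hypothetical; the exact exception surface depends on how GraphError maps into Python):

from raphtory import PersistentGraph

g = PersistentGraph()
try:
    # csv_options alongside a Parquet-only path fails up front, before anything is loaded
    g.load_node_metadata("nodes.parquet", id="id", csv_options={"delimiter": ","})
except Exception as err:
    print(err)  # CSV options were passed but no CSV files were detected at nodes.parquet.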
- #[pyo3(signature = (df, id, node_type=None, node_type_col=None, metadata = None, shared_metadata = None))] - fn load_node_props_from_pandas( - &self, - df: &Bound, - id: &str, - node_type: Option<&str>, - node_type_col: Option<&str>, - metadata: Option>, - shared_metadata: Option>, - ) -> Result<(), GraphError> { - let metadata = convert_py_prop_args(metadata.as_deref()).unwrap_or_default(); - load_node_props_from_pandas( - &self.graph, - df, - id, - node_type, - node_type_col, - &metadata, - shared_metadata.as_ref(), - ) - } + // fail before loading anything at all to avoid loading partial data + if !is_csv && csv_options.is_some() { + return Err(GraphError::from(PyValueError::new_err(format!( + "CSV options were passed but no CSV files were detected at {}.", + path.display() + )))); + } - /// Load node properties from a parquet file. - /// - /// Arguments: - /// parquet_path (str): Parquet file or directory of Parquet files path containing node information. - /// id(str): The column name for the node IDs. - /// node_type (str, optional): A value to use as the node type for all nodes. Cannot be used in combination with node_type_col. Defaults to None. - /// node_type_col (str, optional): The node type col name in dataframe. Cannot be used in combination with node_type. Defaults to None. - /// metadata (List[str], optional): List of node metadata column names. Defaults to None. - /// shared_metadata (PropInput, optional): A dictionary of metadata properties that will be added to every node. Defaults to None. - /// - /// Returns: - /// None: This function does not return a value, if the operation is successful. - /// - /// Raises: - /// GraphError: If the operation fails. - #[pyo3(signature = (parquet_path, id, node_type = None, node_type_col=None, metadata = None, shared_metadata = None))] - fn load_node_props_from_parquet( - &self, - parquet_path: PathBuf, - id: &str, - node_type: Option<&str>, - node_type_col: Option<&str>, - metadata: Option>, - shared_metadata: Option>, - ) -> Result<(), GraphError> { - let metadata = convert_py_prop_args(metadata.as_deref()).unwrap_or_default(); - load_node_props_from_parquet( - &self.graph, - parquet_path.as_path(), - id, - node_type, - node_type_col, - &metadata, - shared_metadata.as_ref(), - None, - ) + // wrap in Arc to avoid cloning the entire schema for Parquet, CSV, and inner loops in CSV path + let arced_schema = column_schema.map(Arc::new); + + // if-if instead of if-else to support directories with mixed parquet and CSV files + if is_parquet { + load_node_metadata_from_parquet( + &self.graph, + path.as_path(), + id, + node_type, + node_type_col, + &metadata, + shared_metadata.as_ref(), + None, + arced_schema.clone(), + )?; + } + if is_csv { + load_node_metadata_from_csv_path( + &self.graph, + &path, + id, + node_type, + node_type_col, + &metadata, + shared_metadata.as_ref(), + csv_options.as_ref(), + arced_schema, + )?; + } + if !is_parquet && !is_csv { + return Err(GraphError::PythonError(PyValueError::new_err("Argument 'data' contains invalid path. Paths must either point to a Parquet/CSV file, or a directory containing Parquet/CSV files"))); + } + Ok(()) + } else { + Err(GraphError::PythonError(PyValueError::new_err("Argument 'data' invalid. 
+ }
 }
- /// Load edge metadata into the graph from any data source that supports the ArrowStreamExportable protocol (by providing an __arrow_c_stream__() method).
- /// This includes, but is not limited to: Pandas dataframes, FireDucks(.pandas) dataframes,
- /// Polars dataframes, Arrow tables, DuckDB (eg. DuckDBPyRelation obtained from running an SQL query)
+ /// Load edge metadata into the graph from any data source that supports the ArrowStreamExportable protocol (by providing an __arrow_c_stream__() method),
+ /// a path to a CSV or Parquet file, or a directory containing multiple CSV or Parquet files.
+ /// The following are known to support the ArrowStreamExportable protocol: Pandas dataframes, FireDucks(.pandas) dataframes,
+ /// Polars dataframes, Arrow tables, and DuckDB relations (e.g. a DuckDBPyRelation obtained by running an SQL query).
 ///
 /// Arguments:
 /// data (Any): The data source containing edge information.
@@ -1078,6 +1006,8 @@ impl PyPersistentGraph {
 /// shared_metadata (PropInput, optional): A dictionary of metadata properties that will be added to every edge. Defaults to None.
 /// layer (str, optional): The edge layer name. Defaults to None.
 /// layer_col (str, optional): The edge layer column name in a dataframe. Defaults to None.
+ /// schema (list[tuple[str, DataType | PropType | str]] | dict[str, DataType | PropType | str], optional): A list of (column_name, column_type) tuples or a dict of {"column_name": column_type} to cast columns to. Defaults to None.
+ /// csv_options (dict[str, str | bool], optional): A dictionary of CSV reading options such as delimiter, comment, escape, quote, and terminator characters, as well as allow_truncated_rows and has_header flags. Defaults to None.
 ///
 /// Returns:
 /// None: This function does not return a value if the operation is successful.
@@ -1085,9 +1015,9 @@ impl PyPersistentGraph {
 /// Raises:
 /// GraphError: If the operation fails.
 #[pyo3(
- signature = (data, src, dst, metadata = None, shared_metadata = None, layer = None, layer_col = None)
+ signature = (data, src, dst, metadata = None, shared_metadata = None, layer = None, layer_col = None, schema = None, csv_options = None)
 )]
- fn load_edge_metadata_from_df(
+ fn load_edge_metadata(
 &self,
 data: &Bound,
 src: &str,
@@ -1096,99 +1026,75 @@ impl PyPersistentGraph {
 shared_metadata: Option>,
 layer: Option<&str>,
 layer_col: Option<&str>,
+ schema: Option>,
+ csv_options: Option,
 ) -> Result<(), GraphError> {
 let metadata = convert_py_prop_args(metadata.as_deref()).unwrap_or_default();
- load_edge_metadata_from_arrow_c_stream(
- &self.graph,
- data,
- src,
- dst,
- &metadata,
- shared_metadata.as_ref(),
- layer,
- layer_col,
- )
- }
+ let column_schema = convert_py_schema(schema)?;
+ if data.hasattr("__arrow_c_stream__")? {
+ load_edge_metadata_from_arrow_c_stream(
+ &self.graph,
+ data,
+ src,
+ dst,
+ &metadata,
+ shared_metadata.as_ref(),
+ layer,
+ layer_col,
+ column_schema,
+ )
+ } else if let Ok(path) = data.extract::() {
+ // extracting PathBuf handles Strings too
+ let is_parquet = is_parquet_path(&path)?;
+ let is_csv = is_csv_path(&path)?;
- /// Load edge properties from a Pandas DataFrame.
- ///
- /// Arguments:
- /// df (DataFrame): The Pandas DataFrame containing edge information.
- /// src (str): The column name for the source node.
- /// dst (str): The column name for the destination node.
- /// metadata (List[str], optional): List of edge metadata column names. Defaults to None. - /// shared_metadata (PropInput, optional): A dictionary of metadata properties that will be added to every edge. Defaults to None. - /// layer (str, optional): The edge layer name. Defaults to None. - /// layer_col (str, optional): The edge layer col name in dataframe. Defaults to None. - /// - /// Returns: - /// None: This function does not return a value, if the operation is successful. - /// - /// Raises: - /// GraphError: If the operation fails. - #[pyo3(signature = (df, src, dst, metadata = None, shared_metadata = None, layer = None, layer_col = None))] - fn load_edge_props_from_pandas( - &self, - df: &Bound, - src: &str, - dst: &str, - metadata: Option>, - shared_metadata: Option>, - layer: Option<&str>, - layer_col: Option<&str>, - ) -> Result<(), GraphError> { - let metadata = convert_py_prop_args(metadata.as_deref()).unwrap_or_default(); - load_edge_props_from_pandas( - &self.graph, - df, - src, - dst, - &metadata, - shared_metadata.as_ref(), - layer, - layer_col, - ) - } + // fail before loading anything at all to avoid loading partial data + if !is_csv && csv_options.is_some() { + return Err(GraphError::from(PyValueError::new_err(format!( + "CSV options were passed but no CSV files were detected at {}.", + path.display() + )))); + } - /// Load edge properties from parquet file - /// - /// Arguments: - /// parquet_path (str): Parquet file or directory of Parquet files path containing edge information. - /// src (str): The column name for the source node. - /// dst (str): The column name for the destination node. - /// metadata (List[str], optional): List of edge metadata column names. Defaults to None. - /// shared_metadata (PropInput, optional): A dictionary of metadata properties that will be added to every edge. Defaults to None. - /// layer (str, optional): The edge layer name. Defaults to None. - /// layer_col (str, optional): The edge layer col name in dataframe. Defaults to None. - /// - /// Returns: - /// None: This function does not return a value, if the operation is successful. - /// - /// Raises: - /// GraphError: If the operation fails. - #[pyo3(signature = (parquet_path, src, dst, metadata = None, shared_metadata = None, layer = None, layer_col = None))] - fn load_edge_props_from_parquet( - &self, - parquet_path: PathBuf, - src: &str, - dst: &str, - metadata: Option>, - shared_metadata: Option>, - layer: Option<&str>, - layer_col: Option<&str>, - ) -> Result<(), GraphError> { - let metadata = convert_py_prop_args(metadata.as_deref()).unwrap_or_default(); - load_edge_props_from_parquet( - &self.graph, - parquet_path.as_path(), - src, - dst, - &metadata, - shared_metadata.as_ref(), - layer, - layer_col, - None, - ) + // wrap in Arc to avoid cloning the entire schema for Parquet, CSV, and inner loops in CSV path + let arced_schema = column_schema.map(Arc::new); + + // if-if instead of if-else to support directories with mixed parquet and CSV files + if is_parquet { + load_edge_metadata_from_parquet( + &self.graph, + path.as_path(), + src, + dst, + &metadata, + shared_metadata.as_ref(), + layer, + layer_col, + None, + arced_schema.clone(), + )?; + } + if is_csv { + load_edge_metadata_from_csv_path( + &self.graph, + &path, + src, + dst, + &metadata, + shared_metadata.as_ref(), + layer, + layer_col, + csv_options.as_ref(), + arced_schema, + )?; + } + if !is_parquet && !is_csv { + return Err(GraphError::PythonError(PyValueError::new_err("Argument 'data' contains invalid path. 
Paths must either point to a Parquet/CSV file, or a directory containing Parquet/CSV files"))); + } + Ok(()) + } else { + Err(GraphError::PythonError(PyValueError::new_err("Argument 'data' invalid. Valid data sources are: a single Parquet or CSV file, a directory containing Parquet or CSV files, and objects that implement an __arrow_c_stream__ method."))) + } } /// Create graph index diff --git a/raphtory/src/python/graph/io/arrow_loaders.rs b/raphtory/src/python/graph/io/arrow_loaders.rs index 403a084600..37ab3c0a8f 100644 --- a/raphtory/src/python/graph/io/arrow_loaders.rs +++ b/raphtory/src/python/graph/io/arrow_loaders.rs @@ -1,28 +1,67 @@ use crate::{ db::api::view::StaticGraphViewOps, errors::GraphError, - io::arrow::{ - dataframe::{DFChunk, DFView}, - df_loaders::{ - load_edge_deletions_from_df, load_edges_from_df, load_edges_props_from_df, - load_node_props_from_df, load_nodes_from_df, + io::{ + arrow::{ + dataframe::{DFChunk, DFView}, + df_loaders::{ + load_edge_deletions_from_df, load_edges_from_df, load_edges_props_from_df, + load_node_props_from_df, load_nodes_from_df, + }, }, + parquet_loaders::cast_columns, }, prelude::{AdditionOps, PropertyAdditionOps}, - python::graph::io::pandas_loaders::is_jupyter, serialise::incremental::InternalCache, }; use arrow::{ array::{RecordBatch, RecordBatchReader}, datatypes::SchemaRef, }; -use pyo3::{prelude::*, types::PyCapsule}; +use arrow_csv::{reader::Format, ReaderBuilder}; +use bzip2::read::BzDecoder; +use flate2::read::GzDecoder; +use pyo3::{ + exceptions::PyValueError, + ffi::c_str, + prelude::*, + pybacked::PyBackedStr, + types::{PyCapsule, PyDict}, +}; use pyo3_arrow::PyRecordBatchReader; -use raphtory_api::core::entities::properties::prop::Prop; -use std::{cmp::min, collections::HashMap}; +use raphtory_api::core::entities::properties::prop::{Prop, PropType}; +use std::{ + cmp::min, + collections::HashMap, + fs, + fs::File, + iter, + ops::Deref, + path::{Path, PathBuf}, + sync::Arc, +}; +use tracing::error; const CHUNK_SIZE: usize = 1_000_000; // split large chunks so progress bar updates reasonably +pub(crate) fn convert_py_prop_args(properties: Option<&[PyBackedStr]>) -> Option> { + properties.map(|p| p.iter().map(|p| p.deref()).collect()) +} + +pub(crate) fn convert_py_schema( + schema: Option>, +) -> Result>, GraphError> { + schema.map(|s| { + if let Ok(list) = s.extract::>() { + Ok(list.into_iter().collect::>()) + } else if let Ok(map) = s.extract::>() { + Ok(map) + } else { + Err(GraphError::from(PyValueError::new_err("Argument 'schema' must either be a list of (column_name, column_type) tuples or a dict mapping {'column_name' : column_type}"))) + } + }).transpose() +} + pub(crate) fn load_nodes_from_arrow_c_stream< 'py, G: StaticGraphViewOps + PropertyAdditionOps + AdditionOps + InternalCache, @@ -36,6 +75,7 @@ pub(crate) fn load_nodes_from_arrow_c_stream< properties: &[&str], metadata: &[&str], shared_metadata: Option<&HashMap>, + schema: Option>, ) -> Result<(), GraphError> { let mut cols_to_check = vec![id, time]; cols_to_check.extend_from_slice(properties); @@ -43,7 +83,7 @@ pub(crate) fn load_nodes_from_arrow_c_stream< if let Some(ref node_type_col) = node_type_col { cols_to_check.push(node_type_col.as_ref()); } - let df_view = process_arrow_c_stream_df(data, cols_to_check.clone())?; + let df_view = process_arrow_c_stream_df(data, cols_to_check.clone(), schema)?; df_view.check_cols_exist(&cols_to_check)?; load_nodes_from_df( df_view, @@ -72,6 +112,7 @@ pub(crate) fn load_edges_from_arrow_c_stream< shared_metadata: 
Option<&HashMap>, layer: Option<&str>, layer_col: Option<&str>, + schema: Option>, ) -> Result<(), GraphError> { let mut cols_to_check = vec![src, dst, time]; cols_to_check.extend_from_slice(properties); @@ -79,7 +120,7 @@ pub(crate) fn load_edges_from_arrow_c_stream< if let Some(layer_col) = layer_col { cols_to_check.push(layer_col.as_ref()); } - let df_view = process_arrow_c_stream_df(data, cols_to_check.clone())?; + let df_view = process_arrow_c_stream_df(data, cols_to_check.clone(), schema)?; df_view.check_cols_exist(&cols_to_check)?; load_edges_from_df( df_view, @@ -106,13 +147,14 @@ pub(crate) fn load_node_metadata_from_arrow_c_stream< node_type_col: Option<&str>, metadata: &[&str], shared_metadata: Option<&HashMap>, + schema: Option>, ) -> Result<(), GraphError> { let mut cols_to_check = vec![id]; cols_to_check.extend_from_slice(metadata); if let Some(ref node_type_col) = node_type_col { cols_to_check.push(node_type_col.as_ref()); } - let df_view = process_arrow_c_stream_df(data, cols_to_check.clone())?; + let df_view = process_arrow_c_stream_df(data, cols_to_check.clone(), schema)?; df_view.check_cols_exist(&cols_to_check)?; load_node_props_from_df( df_view, @@ -137,13 +179,14 @@ pub(crate) fn load_edge_metadata_from_arrow_c_stream< shared_metadata: Option<&HashMap>, layer: Option<&str>, layer_col: Option<&str>, + schema: Option>, ) -> Result<(), GraphError> { let mut cols_to_check = vec![src, dst]; if let Some(ref layer_col) = layer_col { cols_to_check.push(layer_col.as_ref()); } cols_to_check.extend_from_slice(metadata); - let df_view = process_arrow_c_stream_df(data, cols_to_check.clone())?; + let df_view = process_arrow_c_stream_df(data, cols_to_check.clone(), schema)?; df_view.check_cols_exist(&cols_to_check)?; load_edges_props_from_df( df_view, @@ -157,7 +200,7 @@ pub(crate) fn load_edge_metadata_from_arrow_c_stream< ) } -pub fn load_edge_deletions_from_arrow_c_stream< +pub(crate) fn load_edge_deletions_from_arrow_c_stream< 'py, G: StaticGraphViewOps + PropertyAdditionOps + AdditionOps, >( @@ -168,13 +211,14 @@ pub fn load_edge_deletions_from_arrow_c_stream< dst: &str, layer: Option<&str>, layer_col: Option<&str>, + schema: Option>, ) -> Result<(), GraphError> { let mut cols_to_check = vec![src, dst, time]; if let Some(ref layer_col) = layer_col { cols_to_check.push(layer_col.as_ref()); } - let df_view = process_arrow_c_stream_df(data, cols_to_check.clone())?; + let df_view = process_arrow_c_stream_df(data, cols_to_check.clone(), schema)?; df_view.check_cols_exist(&cols_to_check)?; load_edge_deletions_from_df( df_view, @@ -191,6 +235,7 @@ pub fn load_edge_deletions_from_arrow_c_stream< pub(crate) fn process_arrow_c_stream_df<'a>( data: &Bound<'a, PyAny>, col_names: Vec<&str>, + schema: Option>, ) -> PyResult> + 'a>> { let py = data.py(); is_jupyter(py); @@ -225,11 +270,10 @@ pub(crate) fn process_arrow_c_stream_df<'a>( })?; // Get column names and indices once only - let schema: SchemaRef = reader.schema(); let mut names: Vec = Vec::with_capacity(col_names.len()); let mut indices: Vec = Vec::with_capacity(col_names.len()); - for (idx, field) in schema.fields().iter().enumerate() { + for (idx, field) in reader.schema().fields().iter().enumerate() { if col_names.contains(&field.name().as_str()) { names.push(field.name().clone()); indices.push(idx); @@ -245,7 +289,7 @@ pub(crate) fn process_arrow_c_stream_df<'a>( let chunks = reader .into_iter() .flat_map(move |batch_res: Result| { - let batch = match batch_res.map_err(|e| { + let batch: RecordBatch = match batch_res.map_err(|e| 
{ GraphError::LoadFailure(format!( "Arrow stream error while reading a batch: {}", e.to_string() @@ -254,33 +298,531 @@ pub(crate) fn process_arrow_c_stream_df<'a>( Ok(batch) => batch, Err(e) => return vec![Err(e)], }; - let num_rows = batch.num_rows(); - - // many times, all the data will be passed as a single RecordBatch, meaning the progress bar - // will not update properly (only updates at the end of each batch). Splitting into smaller batches - // means the progress bar will update reasonably (every CHUNK_SIZE rows) - if num_rows > CHUNK_SIZE { - let num_chunks = (num_rows + CHUNK_SIZE - 1) / CHUNK_SIZE; - let mut result = Vec::with_capacity(num_chunks); - for i in 0..num_chunks { - let offset = i * CHUNK_SIZE; - let length = min(CHUNK_SIZE, num_rows - offset); - let sliced_batch = batch.slice(offset, length); - let chunk_arrays = indices - .iter() - .map(|&idx| sliced_batch.column(idx).clone()) - .collect::>(); - result.push(Ok(DFChunk::new(chunk_arrays))); + let casted_batch = if let Some(schema) = &schema { + match cast_columns(batch, schema) { + Ok(casted_batch) => casted_batch, + Err(e) => return vec![Err(e)], } - result } else { - let chunk_arrays = indices - .iter() - .map(|&idx| batch.column(idx).clone()) - .collect::>(); - vec![Ok(DFChunk::new(chunk_arrays))] - } + batch + }; + + split_into_chunks(&casted_batch, &indices) }); Ok(DFView::new(names, chunks, len_from_python)) } + +/// Splits a RecordBatch into chunks of CHUNK_SIZE owned by DFChunk objects +fn split_into_chunks(batch: &RecordBatch, indices: &[usize]) -> Vec> { + // many times, all the data will be passed as a single RecordBatch, meaning the progress bar + // will not update properly (only updates at the end of each batch). Splitting into smaller batches + // means the progress bar will update reasonably (every CHUNK_SIZE rows) + let num_rows = batch.num_rows(); + if num_rows > CHUNK_SIZE { + let num_chunks = (num_rows + CHUNK_SIZE - 1) / CHUNK_SIZE; + let mut result = Vec::with_capacity(num_chunks); + for i in 0..num_chunks { + let offset = i * CHUNK_SIZE; + let length = min(CHUNK_SIZE, num_rows - offset); + let sliced_batch = batch.slice(offset, length); + let chunk_arrays = indices + .iter() + .map(|&idx| sliced_batch.column(idx).clone()) + .collect::>(); + result.push(Ok(DFChunk::new(chunk_arrays))); + } + result + } else { + let chunk_arrays = indices + .iter() + .map(|&idx| batch.column(idx).clone()) + .collect::>(); + vec![Ok(DFChunk::new(chunk_arrays))] + } +} + +pub(crate) fn is_csv_path(path: &PathBuf) -> Result { + if path.is_dir() { + Ok(fs::read_dir(&path)?.any(|entry| { + entry.map_or(false, |e| { + let p = e.path(); + let s = p.to_string_lossy(); + s.ends_with(".csv") || s.ends_with(".csv.gz") || s.ends_with(".csv.bz2") + }) + })) + } else { + let path_str = path.to_string_lossy(); + Ok(path_str.ends_with(".csv") + || path_str.ends_with(".csv.gz") + || path_str.ends_with(".csv.bz2")) + } +} + +/// CSV options we support, passed as Python dict +pub(crate) struct CsvReadOptions { + delimiter: Option, + comment: Option, + escape: Option, + quote: Option, + terminator: Option, + allow_truncated_rows: Option, + has_header: Option, +} + +impl<'a> FromPyObject<'a> for CsvReadOptions { + fn extract_bound(ob: &Bound<'a, PyAny>) -> PyResult { + let dict = ob.downcast::().map_err(|e| { + PyValueError::new_err(format!("CSV options should be passed as a dict: {e}")) + })?; + let get_char = |option: &str| match dict.get_item(option)? 
{ + None => Ok(None), + Some(val) => { + if let Ok(s) = val.extract::() { + if s.len() != 1 { + return Err(PyValueError::new_err(format!( + "CSV option '{option}' must be a single character string or int 0-255", + ))); + } + Ok(Some(s.as_bytes()[0])) + } else if let Ok(b) = val.extract::() { + Ok(Some(b)) + } else { + return Err(PyValueError::new_err(format!( + "CSV option '{option}' must be a single character string or int 0-255", + ))); + } + } + }; + let get_bool = |option: &str| { + dict.get_item(option)? + .map(|val| val.extract::()) + .transpose() + .map_err(|_| PyValueError::new_err(format!("CSV option '{option}' must be a bool"))) + }; + + Ok(CsvReadOptions { + delimiter: get_char("delimiter")?, + comment: get_char("comment")?, + escape: get_char("escape")?, + quote: get_char("quote")?, + terminator: get_char("terminator")?, + allow_truncated_rows: get_bool("allow_truncated_rows")?, + has_header: get_bool("has_header")?, + }) + } +} + +fn collect_csv_paths(path: &PathBuf) -> Result, GraphError> { + let mut csv_paths = Vec::new(); + if path.is_dir() { + for entry in fs::read_dir(path)? { + let entry = entry?; + let p = entry.path(); + let s = p.to_string_lossy(); + if s.ends_with(".csv") || s.ends_with(".csv.gz") || s.ends_with(".csv.bz2") { + csv_paths.push(p); + } + } + } else { + csv_paths.push(path.clone()); + } + + if csv_paths.is_empty() { + return Err(GraphError::LoadFailure(format!( + "No CSV files found at path '{}'", + path.display() + ))); + } + Ok(csv_paths) +} + +// Load from CSV files using arrow-csv +pub(crate) fn load_nodes_from_csv_path< + 'py, + G: StaticGraphViewOps + PropertyAdditionOps + AdditionOps + InternalCache, +>( + graph: &G, + path: &PathBuf, + time: &str, + id: &str, + node_type: Option<&str>, + node_type_col: Option<&str>, + properties: &[&str], + metadata: &[&str], + shared_metadata: Option<&HashMap>, + csv_options: Option<&CsvReadOptions>, + schema: Option>>, +) -> Result<(), GraphError> { + let mut cols_to_check = vec![id, time]; + cols_to_check.extend_from_slice(properties); + cols_to_check.extend_from_slice(metadata); + if let Some(ref node_type_col) = node_type_col { + cols_to_check.push(node_type_col.as_ref()); + } + + // get the CSV file paths + let csv_paths = collect_csv_paths(path)?; + + let df_view = process_csv_paths_df(&csv_paths, cols_to_check.clone(), csv_options, schema)?; + df_view.check_cols_exist(&cols_to_check)?; + load_nodes_from_df( + df_view, + time, + id, + properties, + metadata, + shared_metadata, + node_type, + node_type_col, + graph, + ) +} + +pub(crate) fn load_edges_from_csv_path< + 'py, + G: StaticGraphViewOps + PropertyAdditionOps + AdditionOps + InternalCache, +>( + graph: &G, + path: &PathBuf, + time: &str, + src: &str, + dst: &str, + properties: &[&str], + metadata: &[&str], + shared_metadata: Option<&HashMap>, + layer: Option<&str>, + layer_col: Option<&str>, + csv_options: Option<&CsvReadOptions>, + schema: Option>>, +) -> Result<(), GraphError> { + let mut cols_to_check = vec![src, dst, time]; + cols_to_check.extend_from_slice(properties); + cols_to_check.extend_from_slice(metadata); + if let Some(layer_col) = layer_col { + cols_to_check.push(layer_col.as_ref()); + } + + // get the CSV file paths + let csv_paths = collect_csv_paths(path)?; + + let df_view = process_csv_paths_df(&csv_paths, cols_to_check.clone(), csv_options, schema)?; + df_view.check_cols_exist(&cols_to_check)?; + load_edges_from_df( + df_view, + time, + src, + dst, + properties, + metadata, + shared_metadata, + layer, + layer_col, + graph, + ) +} 
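For orientation, a minimal sketch of how these CSV loaders surface through the `load_node_metadata` / `load_edge_metadata` bindings shown earlier. The paths, column names, and prior loading steps are illustrative assumptions, and the nodes/edges referenced by the metadata files are assumed to already exist in the graph:

```python
# Sketch: loading metadata from CSV (.csv/.csv.gz/.csv.bz2) or Parquet files,
# or a directory containing them (illustrative file layout and columns).
import pyarrow as pa
from raphtory import PersistentGraph

g = PersistentGraph()
# ... load nodes and edges first, e.g. from an __arrow_c_stream__ source ...

g.load_node_metadata(
    data="./metadata/nodes",                 # a file or a directory of files
    id="node_id",
    metadata=["team", "region"],
    csv_options={"delimiter": ";", "has_header": True},
    schema={"region": pa.string()},          # cast before loading; PropType/str specs also accepted
)
g.load_edge_metadata(
    data="./metadata/edges.csv",
    src="src",
    dst="dst",
    metadata=["weight_cap"],
    layer_col="layer",
)
```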
+ +pub(crate) fn load_node_metadata_from_csv_path< + 'py, + G: StaticGraphViewOps + PropertyAdditionOps + AdditionOps + InternalCache, +>( + graph: &G, + path: &PathBuf, + id: &str, + node_type: Option<&str>, + node_type_col: Option<&str>, + metadata: &[&str], + shared_metadata: Option<&HashMap>, + csv_options: Option<&CsvReadOptions>, + schema: Option>>, +) -> Result<(), GraphError> { + let mut cols_to_check = vec![id]; + cols_to_check.extend_from_slice(metadata); + if let Some(ref node_type_col) = node_type_col { + cols_to_check.push(node_type_col.as_ref()); + } + + // get the CSV file paths + let csv_paths = collect_csv_paths(path)?; + + let df_view = process_csv_paths_df(&csv_paths, cols_to_check.clone(), csv_options, schema)?; + df_view.check_cols_exist(&cols_to_check)?; + load_node_props_from_df( + df_view, + id, + node_type, + node_type_col, + metadata, + shared_metadata, + graph, + ) +} + +pub(crate) fn load_edge_metadata_from_csv_path< + 'py, + G: StaticGraphViewOps + PropertyAdditionOps + AdditionOps + InternalCache, +>( + graph: &G, + path: &PathBuf, + src: &str, + dst: &str, + metadata: &[&str], + shared_metadata: Option<&HashMap>, + layer: Option<&str>, + layer_col: Option<&str>, + csv_options: Option<&CsvReadOptions>, + schema: Option>>, +) -> Result<(), GraphError> { + let mut cols_to_check = vec![src, dst]; + if let Some(ref layer_col) = layer_col { + cols_to_check.push(layer_col.as_ref()); + } + cols_to_check.extend_from_slice(metadata); + + // get the CSV file paths + let csv_paths = collect_csv_paths(path)?; + + let df_view = process_csv_paths_df(&csv_paths, cols_to_check.clone(), csv_options, schema)?; + df_view.check_cols_exist(&cols_to_check)?; + load_edges_props_from_df( + df_view, + src, + dst, + metadata, + shared_metadata, + layer, + layer_col, + graph, + ) +} + +pub(crate) fn load_edge_deletions_from_csv_path< + 'py, + G: StaticGraphViewOps + PropertyAdditionOps + AdditionOps, +>( + graph: &G, + path: &PathBuf, + time: &str, + src: &str, + dst: &str, + layer: Option<&str>, + layer_col: Option<&str>, + csv_options: Option<&CsvReadOptions>, + schema: Option>>, +) -> Result<(), GraphError> { + let mut cols_to_check = vec![src, dst, time]; + if let Some(ref layer_col) = layer_col { + cols_to_check.push(layer_col.as_ref()); + } + + // get the CSV file paths + let csv_paths = collect_csv_paths(path)?; + + let df_view = process_csv_paths_df(&csv_paths, cols_to_check.clone(), csv_options, schema)?; + df_view.check_cols_exist(&cols_to_check)?; + load_edge_deletions_from_df( + df_view, + time, + src, + dst, + layer, + layer_col, + graph.core_graph(), + ) +} + +fn get_csv_reader(filename: &str, file: File) -> Box { + // Support bz2 and gz compression + if filename.ends_with(".csv.gz") { + Box::new(GzDecoder::new(file)) + } else if filename.ends_with(".csv.bz2") { + Box::new(BzDecoder::new(file)) + } else { + // no need for a BufReader because ReaderBuilder::build internally wraps into BufReader + Box::new(file) + } +} + +fn build_csv_reader( + path: &Path, + csv_options: Option<&CsvReadOptions>, +) -> Result>, GraphError> { + let file = File::open(path)?; + let path_str = path.to_string_lossy(); + + let mut format = Format::default(); + + let has_header = csv_options.and_then(|o| o.has_header).unwrap_or(true); + format = format.with_header(has_header); + + if let Some(delim) = csv_options.and_then(|o| o.delimiter) { + format = format.with_delimiter(delim); + } + + if let Some(comment) = csv_options.and_then(|o| o.comment) { + format = format.with_comment(comment); + } + + 
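+ // each option set on `format` in this block is mirrored on the `ReaderBuilder`
+ // below, so the rows sampled for schema inference are parsed exactly like the
+ // rows that are subsequently loaded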
if let Some(escape) = csv_options.and_then(|o| o.escape) { + format = format.with_escape(escape); + } + + if let Some(quote) = csv_options.and_then(|o| o.quote) { + format = format.with_quote(quote); + } + + if let Some(terminator) = csv_options.and_then(|o| o.terminator) { + format = format.with_terminator(terminator); + } + + if let Some(allow_truncated_rows) = csv_options.and_then(|o| o.allow_truncated_rows) { + format = format.with_truncated_rows(allow_truncated_rows); + } + + // infer schema + let reader = get_csv_reader(path_str.as_ref(), file); + let (schema, _) = format.infer_schema(reader, Some(100)).map_err(|e| { + GraphError::LoadFailure(format!( + "Arrow CSV error while inferring schema from '{}': {e}", + path.display() + )) + })?; + let schema_ref: SchemaRef = Arc::new(schema); + + // we need another reader because the first one gets consumed + let file = File::open(path)?; + let reader = get_csv_reader(path_str.as_ref(), file); + + let mut reader_builder = ReaderBuilder::new(schema_ref) + .with_header(has_header) + .with_batch_size(CHUNK_SIZE); + + if let Some(delimiter) = csv_options.and_then(|o| o.delimiter) { + reader_builder = reader_builder.with_delimiter(delimiter); + } + + if let Some(comment) = csv_options.and_then(|o| o.comment) { + reader_builder = reader_builder.with_comment(comment); + } + + if let Some(escape) = csv_options.and_then(|o| o.escape) { + reader_builder = reader_builder.with_escape(escape); + } + + if let Some(quote) = csv_options.and_then(|o| o.quote) { + reader_builder = reader_builder.with_quote(quote); + } + + if let Some(terminator) = csv_options.and_then(|o| o.terminator) { + reader_builder = reader_builder.with_terminator(terminator); + } + + if let Some(allow_truncated_rows) = csv_options.and_then(|o| o.allow_truncated_rows) { + reader_builder = reader_builder.with_truncated_rows(allow_truncated_rows); + } + + reader_builder.build(reader).map_err(|e| { + GraphError::LoadFailure(format!( + "Arrow CSV error while reading '{}': {e}", + path.display() + )) + }) +} + +fn process_csv_paths_df<'a>( + paths: &'a [PathBuf], + col_names: Vec<&'a str>, + csv_options: Option<&'a CsvReadOptions>, + schema: Option>>, +) -> Result> + 'a>, GraphError> { + if paths.is_empty() { + return Err(GraphError::LoadFailure( + "No CSV files found at the provided path".to_string(), + )); + } + // BoxedLIter couldn't be used because it has Send + Sync bound + type ChunkIter<'b> = Box> + 'b>; + + let names = col_names.iter().map(|&name| name.to_string()).collect(); + let chunks = paths.iter().flat_map(move |path| { + let schema = schema.clone(); + let csv_reader = match build_csv_reader(path.as_path(), csv_options) { + Ok(r) => r, + Err(e) => return Box::new(iter::once(Err(e))) as ChunkIter<'a>, + }; + let mut indices = Vec::with_capacity(col_names.len()); + for required_col in &col_names { + if let Some((idx, _)) = csv_reader + .schema() + .fields() + .iter() + .enumerate() + .find(|(_, f)| f.name() == required_col) + { + indices.push(idx); + } else { + return Box::new(iter::once(Err(GraphError::ColumnDoesNotExist( + required_col.to_string(), + )))) as ChunkIter<'a>; + } + } + Box::new( + csv_reader + .into_iter() + .map(move |batch_res| match batch_res { + Ok(batch) => { + let casted_batch = if let Some(schema) = schema.as_deref() { + cast_columns(batch, schema)? 
+ } else { + batch + }; + let arrays = indices + .iter() + .map(|&idx| casted_batch.column(idx).clone()) + .collect::>(); + Ok(DFChunk::new(arrays)) + } + Err(e) => Err(GraphError::LoadFailure(format!( + "Arrow CSV error while reading a batch from '{}': {e}", + path.display() + ))), + }), + ) as ChunkIter<'a> + }); + + // we don't know the total number of rows until we read all files + Ok(DFView::new(names, chunks, None)) +} + +pub(crate) fn is_jupyter(py: Python) { + let code = c_str!( + r#" +try: + shell = get_ipython().__class__.__name__ + if shell == 'ZMQInteractiveShell': + result = True # Jupyter notebook or qtconsole + elif shell == 'TerminalInteractiveShell': + result = False # Terminal running IPython + else: + result = False # Other type, assuming not a Jupyter environment +except NameError: + result = False # Probably standard Python interpreter +"# + ); + + if let Err(e) = py.run(code, None, None) { + error!("Error checking if running in a jupyter notebook: {}", e); + return; + } + + match py.eval(c_str!("result"), None, None) { + Ok(x) => { + if let Ok(x) = x.extract() { + kdam::set_notebook(x); + } + } + Err(e) => { + error!("Error checking if running in a jupyter notebook: {}", e); + } + }; +} diff --git a/raphtory/src/python/graph/io/mod.rs b/raphtory/src/python/graph/io/mod.rs index b3b8faa385..187456c137 100644 --- a/raphtory/src/python/graph/io/mod.rs +++ b/raphtory/src/python/graph/io/mod.rs @@ -1,7 +1,6 @@ use pyo3::{create_exception, exceptions::PyException}; pub mod arrow_loaders; -pub mod pandas_loaders; create_exception!(exceptions, ArrowErrorException, PyException); create_exception!(exceptions, GraphLoadException, PyException); diff --git a/raphtory/src/python/graph/io/pandas_loaders.rs b/raphtory/src/python/graph/io/pandas_loaders.rs deleted file mode 100644 index 1bac0e00db..0000000000 --- a/raphtory/src/python/graph/io/pandas_loaders.rs +++ /dev/null @@ -1,287 +0,0 @@ -use crate::{ - db::api::view::StaticGraphViewOps, - errors::GraphError, - io::arrow::{dataframe::*, df_loaders::*}, - prelude::{AdditionOps, PropertyAdditionOps}, - serialise::incremental::InternalCache, -}; -use arrow::array::ArrayRef; -use pyo3::{ - ffi::c_str, - prelude::*, - pybacked::PyBackedStr, - types::{IntoPyDict, PyDict}, -}; -use pyo3_arrow::PyArray; -use raphtory_api::core::entities::properties::prop::Prop; -use std::{collections::HashMap, ops::Deref}; -use tracing::error; - -pub(crate) fn convert_py_prop_args(properties: Option<&[PyBackedStr]>) -> Option> { - properties.map(|p| p.iter().map(|p| p.deref()).collect()) -} - -pub(crate) fn load_nodes_from_pandas< - 'py, - G: StaticGraphViewOps + PropertyAdditionOps + AdditionOps + InternalCache, ->( - graph: &G, - df: &Bound<'py, PyAny>, - time: &str, - id: &str, - node_type: Option<&str>, - node_type_col: Option<&str>, - properties: &[&str], - metadata: &[&str], - shared_metadata: Option<&HashMap>, -) -> Result<(), GraphError> { - let mut cols_to_check = vec![id, time]; - cols_to_check.extend_from_slice(properties); - cols_to_check.extend_from_slice(metadata); - if let Some(ref node_type_col) = node_type_col { - cols_to_check.push(node_type_col.as_ref()); - } - - let df_view = process_pandas_py_df(df, cols_to_check.clone())?; - df_view.check_cols_exist(&cols_to_check)?; - load_nodes_from_df( - df_view, - time, - id, - properties, - metadata, - shared_metadata, - node_type, - node_type_col, - graph, - ) -} - -pub(crate) fn load_edges_from_pandas< - 'py, - G: StaticGraphViewOps + PropertyAdditionOps + AdditionOps + InternalCache, ->( - 
graph: &G, - df: &Bound<'py, PyAny>, - time: &str, - src: &str, - dst: &str, - properties: &[&str], - metadata: &[&str], - shared_metadata: Option<&HashMap>, - layer: Option<&str>, - layer_col: Option<&str>, -) -> Result<(), GraphError> { - let mut cols_to_check = vec![src, dst, time]; - cols_to_check.extend_from_slice(properties); - cols_to_check.extend_from_slice(metadata); - if let Some(layer_col) = layer_col { - cols_to_check.push(layer_col.as_ref()); - } - - let df_view = process_pandas_py_df(df, cols_to_check.clone())?; - df_view.check_cols_exist(&cols_to_check)?; - load_edges_from_df( - df_view, - time, - src, - dst, - properties, - metadata, - shared_metadata, - layer, - layer_col, - graph, - ) -} - -pub(crate) fn load_node_props_from_pandas< - 'py, - G: StaticGraphViewOps + PropertyAdditionOps + AdditionOps + InternalCache, ->( - graph: &G, - df: &Bound<'py, PyAny>, - id: &str, - node_type: Option<&str>, - node_type_col: Option<&str>, - metadata: &[&str], - shared_metadata: Option<&HashMap>, -) -> Result<(), GraphError> { - let mut cols_to_check = vec![id]; - cols_to_check.extend_from_slice(metadata); - if let Some(ref node_type_col) = node_type_col { - cols_to_check.push(node_type_col.as_ref()); - } - let df_view = process_pandas_py_df(df, cols_to_check.clone())?; - df_view.check_cols_exist(&cols_to_check)?; - load_node_props_from_df( - df_view, - id, - node_type, - node_type_col, - metadata, - shared_metadata, - graph, - ) -} - -pub(crate) fn load_edge_props_from_pandas< - 'py, - G: StaticGraphViewOps + PropertyAdditionOps + AdditionOps + InternalCache, ->( - graph: &G, - df: &Bound<'py, PyAny>, - src: &str, - dst: &str, - metadata: &[&str], - shared_metadata: Option<&HashMap>, - layer: Option<&str>, - layer_col: Option<&str>, -) -> Result<(), GraphError> { - let mut cols_to_check = vec![src, dst]; - if let Some(ref layer_col) = layer_col { - cols_to_check.push(layer_col.as_ref()); - } - cols_to_check.extend_from_slice(metadata); - let df_view = process_pandas_py_df(df, cols_to_check.clone())?; - df_view.check_cols_exist(&cols_to_check)?; - load_edges_props_from_df( - df_view, - src, - dst, - metadata, - shared_metadata, - layer, - layer_col, - graph, - ) -} - -pub fn load_edge_deletions_from_pandas< - 'py, - G: StaticGraphViewOps + PropertyAdditionOps + AdditionOps, ->( - graph: &G, - df: &Bound<'py, PyAny>, - time: &str, - src: &str, - dst: &str, - layer: Option<&str>, - layer_col: Option<&str>, -) -> Result<(), GraphError> { - let mut cols_to_check = vec![src, dst, time]; - if let Some(ref layer_col) = layer_col { - cols_to_check.push(layer_col.as_ref()); - } - - let df_view = process_pandas_py_df(df, cols_to_check.clone())?; - df_view.check_cols_exist(&cols_to_check)?; - load_edge_deletions_from_df( - df_view, - time, - src, - dst, - layer, - layer_col, - graph.core_graph(), - ) -} - -pub(crate) fn process_pandas_py_df<'a>( - df: &Bound<'a, PyAny>, - col_names: Vec<&str>, -) -> PyResult> + 'a>> { - let py = df.py(); - is_jupyter(py); - py.import("pandas")?; - let module = py.import("pyarrow")?; - let pa_table = module.getattr("Table")?; - - let df_columns: Vec = df.getattr("columns")?.extract()?; - - let cols_to_drop: Vec = df_columns - .into_iter() - .filter(|x| !col_names.contains(&x.as_str())) - .collect(); - - let dropped_df = if !cols_to_drop.is_empty() { - let drop_method = df.getattr("drop")?; - &drop_method.call((cols_to_drop,), Some(&vec![("axis", 1)].into_py_dict(py)?))? 
- } else { - df - }; - - let table = pa_table.call_method("from_pandas", (dropped_df.clone(),), None)?; - let kwargs = PyDict::new(py); - kwargs.set_item("max_chunksize", 1000000)?; - let rb = table - .call_method("to_batches", (), Some(&kwargs))? - .extract::>>()?; - let names: Vec = if let Some(batch0) = rb.first() { - let schema = batch0.getattr("schema")?; - schema.getattr("names")?.extract::>()? - } else { - vec![] - } - .into_iter() - .filter(|x| col_names.contains(&x.as_str())) - .collect(); - - let names_len = names.len(); - let chunks = rb.into_iter().map(move |rb| { - let chunk = (0..names_len) - .map(|i| { - let array = rb.call_method1("column", (i,)).map_err(GraphError::from)?; - let arr = array_to_rust(&array).map_err(GraphError::from)?; - Ok::<_, GraphError>(arr) - }) - .collect::, GraphError>>()?; - - Ok(DFChunk { chunk }) - }); - let num_rows: usize = dropped_df.call_method0("__len__")?.extract()?; - - Ok(DFView { - names, - chunks, - num_rows: Some(num_rows), - }) -} - -pub fn array_to_rust(obj: &Bound) -> PyResult { - let (array, _) = PyArray::extract_bound(obj)?.into_inner(); - Ok(array) -} - -pub(crate) fn is_jupyter(py: Python) { - let code = c_str!( - r#" -try: - shell = get_ipython().__class__.__name__ - if shell == 'ZMQInteractiveShell': - result = True # Jupyter notebook or qtconsole - elif shell == 'TerminalInteractiveShell': - result = False # Terminal running IPython - else: - result = False # Other type, assuming not a Jupyter environment -except NameError: - result = False # Probably standard Python interpreter -"# - ); - - if let Err(e) = py.run(code, None, None) { - error!("Error checking if running in a jupyter notebook: {}", e); - return; - } - - match py.eval(c_str!("result"), None, None) { - Ok(x) => { - if let Ok(x) = x.extract() { - kdam::set_notebook(x); - } - } - Err(e) => { - error!("Error checking if running in a jupyter notebook: {}", e); - } - }; -} diff --git a/raphtory/src/python/graph/properties/props.rs b/raphtory/src/python/graph/properties/props.rs index e250a08572..3772f966d1 100644 --- a/raphtory/src/python/graph/properties/props.rs +++ b/raphtory/src/python/graph/properties/props.rs @@ -24,7 +24,10 @@ use pyo3::{ exceptions::{PyKeyError, PyTypeError}, prelude::*, }; -use raphtory_api::core::{entities::properties::prop::Prop, storage::arc_str::ArcStr}; +use raphtory_api::core::{ + entities::properties::prop::{Prop, PropType}, + storage::arc_str::ArcStr, +}; use std::{collections::HashMap, ops::Deref, sync::Arc}; #[derive(Clone, Debug)] @@ -98,6 +101,18 @@ impl PyProperties { self.props.get(key) } + /// Get the PropType of a property. Specifically, returns the PropType of the latest value for this property if it exists. + /// If not, it returns the PropType for the static property matching this name. + /// + /// Arguments: + /// key (str): the name of the property. + /// + /// Returns: + /// PropType: + pub fn get_dtype_of(&self, key: &str) -> Option { + self.props.get(key).map(|p| p.dtype()) + } + /// Check if property `key` exists. 
/// /// Returns: diff --git a/raphtory/src/python/packages/base_modules.rs b/raphtory/src/python/packages/base_modules.rs index 7d22e19fc2..4451ec625d 100644 --- a/raphtory/src/python/packages/base_modules.rs +++ b/raphtory/src/python/packages/base_modules.rs @@ -59,7 +59,10 @@ use crate::{ }, }; use pyo3::prelude::*; -use raphtory_api::python::timeindex::{PyEventTime, PyOptionalEventTime}; +use raphtory_api::python::{ + prop::PyPropType, + timeindex::{PyEventTime, PyOptionalEventTime}, +}; pub fn add_raphtory_classes(m: &Bound) -> PyResult<()> { //Graph classes @@ -81,6 +84,7 @@ pub fn add_raphtory_classes(m: &Bound) -> PyResult<()> { PyMutableEdge, PyProperties, PyPropValueList, + PyPropType, PyMetadata, MetadataView, PyTemporalProperties, diff --git a/raphtory/src/serialise/parquet/mod.rs b/raphtory/src/serialise/parquet/mod.rs index 83966bd597..b0a1a426d1 100644 --- a/raphtory/src/serialise/parquet/mod.rs +++ b/raphtory/src/serialise/parquet/mod.rs @@ -5,8 +5,8 @@ use crate::{ }, errors::GraphError, io::parquet_loaders::{ - load_edge_deletions_from_parquet, load_edge_props_from_parquet, load_edges_from_parquet, - load_graph_props_from_parquet, load_node_props_from_parquet, load_nodes_from_parquet, + load_edge_deletions_from_parquet, load_edge_metadata_from_parquet, load_edges_from_parquet, + load_graph_props_from_parquet, load_node_metadata_from_parquet, load_nodes_from_parquet, }, prelude::*, serialise::parquet::{ @@ -260,7 +260,15 @@ fn decode_graph_storage( let exclude = vec![TIME_COL]; let (c_props, g_type) = collect_prop_columns(&c_graph_path, &exclude)?; let c_props = c_props.iter().map(|s| s.as_str()).collect::>(); - load_graph_props_from_parquet(&g, &c_graph_path, TIME_COL, &[], &c_props, batch_size)?; + load_graph_props_from_parquet( + &g, + &c_graph_path, + TIME_COL, + &[], + &c_props, + batch_size, + None, + )?; g_type.ok_or_else(|| GraphError::LoadFailure("Graph type not found".to_string()))? }; @@ -278,7 +286,15 @@ fn decode_graph_storage( let exclude = vec![TIME_COL]; let (t_props, _) = collect_prop_columns(&t_graph_path, &exclude)?; let t_props = t_props.iter().map(|s| s.as_str()).collect::>(); - load_graph_props_from_parquet(&g, &t_graph_path, TIME_COL, &t_props, &[], batch_size)?; + load_graph_props_from_parquet( + &g, + &t_graph_path, + TIME_COL, + &t_props, + &[], + batch_size, + None, + )?; } let t_node_path = path.as_ref().join(NODES_T_PATH); @@ -301,6 +317,7 @@ fn decode_graph_storage( &[], None, batch_size, + None, )?; } @@ -313,7 +330,7 @@ fn decode_graph_storage( .map(|s| s.as_str()) .collect::>(); - load_node_props_from_parquet( + load_node_metadata_from_parquet( &g, &c_node_path, NODE_ID, @@ -322,6 +339,7 @@ fn decode_graph_storage( &c_prop_columns, None, batch_size, + None, )?; } @@ -346,6 +364,7 @@ fn decode_graph_storage( None, Some(LAYER_COL), batch_size, + None, )?; } @@ -360,6 +379,7 @@ fn decode_graph_storage( None, Some(LAYER_COL), batch_size, + None, )?; } @@ -371,7 +391,7 @@ fn decode_graph_storage( .map(|s| s.as_str()) .collect::>(); - load_edge_props_from_parquet( + load_edge_metadata_from_parquet( &g, &c_edge_path, SRC_COL, @@ -381,6 +401,7 @@ fn decode_graph_storage( None, Some(LAYER_COL), batch_size, + None, )?; }
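To close the loop on the `PyPropType` registration above, a short sketch of the Python-side behaviour of the new `get_dtype_of` method (the graph contents and names are illustrative):

```python
# Sketch: inspecting property types via the newly exposed PropType binding.
from raphtory import Graph

g = Graph()
g.add_node(0, "server-1", properties={"load": 0.73})

props = g.node("server-1").properties
print(props.get_dtype_of("load"))     # PropType of the latest value for "load"
print(props.get_dtype_of("missing"))  # None: no property with that name
```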