kuzu

Kuzu Python API bindings.

This package provides a Python API for the Kuzu graph database management system.

To install the package, run:

```
python3 -m pip install kuzu
```

Example usage:

```python
import kuzu

db = kuzu.Database("./test")
conn = kuzu.Connection(db)

# Define the schema
conn.execute("CREATE NODE TABLE User(name STRING, age INT64, PRIMARY KEY (name))")
conn.execute("CREATE NODE TABLE City(name STRING, population INT64, PRIMARY KEY (name))")
conn.execute("CREATE REL TABLE Follows(FROM User TO User, since INT64)")
conn.execute("CREATE REL TABLE LivesIn(FROM User TO City)")

# Load some data
conn.execute('COPY User FROM "user.csv"')
conn.execute('COPY City FROM "city.csv"')
conn.execute('COPY Follows FROM "follows.csv"')
conn.execute('COPY LivesIn FROM "lives-in.csv"')

# Query the data
results = conn.execute("MATCH (u:User) RETURN u.name, u.age;")
while results.has_next():
    print(results.get_next())
```

The dataset used in this example can be found [here](https://github.com/kuzudb/kuzu/tree/master/dataset/demo-db/csv).

 1"""
 2# Kuzu Python API bindings.
 3
 4This package provides a Python API for Kuzu graph database management system.
 5
 6To install the package, run:
 7```
 8python3 -m pip install kuzu
 9```
10
11Example usage:
12```python
13import kuzu
14
15db = kuzu.Database("./test")
16conn = kuzu.Connection(db)
17
18# Define the schema
19conn.execute("CREATE NODE TABLE User(name STRING, age INT64, PRIMARY KEY (name))")
20conn.execute("CREATE NODE TABLE City(name STRING, population INT64, PRIMARY KEY (name))")
21conn.execute("CREATE REL TABLE Follows(FROM User TO User, since INT64)")
22conn.execute("CREATE REL TABLE LivesIn(FROM User TO City)")
23
24# Load some data
25conn.execute('COPY User FROM "user.csv"')
26conn.execute('COPY City FROM "city.csv"')
27conn.execute('COPY Follows FROM "follows.csv"')
28conn.execute('COPY LivesIn FROM "lives-in.csv"')
29
30# Query the data
31results = conn.execute("MATCH (u:User) RETURN u.name, u.age;")
32while results.has_next():
33    print(results.get_next())
34```
35
36The dataset used in this example can be found [here](https://github.com/kuzudb/kuzu/tree/master/dataset/demo-db/csv).
37
38"""
39
40from __future__ import annotations
41
42import os
43import sys
44
45# Set RTLD_GLOBAL and RTLD_LAZY flags on Linux to fix the issue with loading
46# extensions
47if sys.platform == "linux":
48    original_dlopen_flags = sys.getdlopenflags()
49    sys.setdlopenflags(os.RTLD_GLOBAL | os.RTLD_LAZY)
50
51from .async_connection import AsyncConnection
52from .connection import Connection
53from .database import Database
54from .prepared_statement import PreparedStatement
55from .query_result import QueryResult
56from .types import Type
57
58
59def __getattr__(name: str) -> str | int:
60    if name in ("version", "__version__"):
61        return Database.get_version()
62    elif name == "storage_version":
63        return Database.get_storage_version()
64    else:
65        msg = f"module {__name__!r} has no attribute {name!r}"
66        raise AttributeError(msg)
67
68
69# Restore the original dlopen flags
70if sys.platform == "linux":
71    sys.setdlopenflags(original_dlopen_flags)
72
73__all__ = [
74    "AsyncConnection",
75    "Connection",
76    "Database",
77    "PreparedStatement",
78    "QueryResult",
79    "Type",
80    "__version__",  # noqa: F822
81    "storage_version",  # noqa: F822
82    "version",  # noqa: F822
83]
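The `__getattr__` hook above exposes version information lazily at the package level. A small sketch of reading it (printed values are illustrative):

```python
import kuzu

print(kuzu.__version__)      # package version string, resolved via Database.get_version()
print(kuzu.storage_version)  # integer storage-format version
```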
class AsyncConnection:
class AsyncConnection:
    """AsyncConnection enables asynchronous execution of queries with a pool of connections and threads."""

    def __init__(
        self,
        database: Database,
        max_concurrent_queries: int = 4,
        max_threads_per_query: int = 0,
    ) -> None:
        """
        Initialise the async connection.

        Parameters
        ----------
        database : Database
            Database to connect to.

        max_concurrent_queries : int
            Maximum number of concurrent queries to execute. This corresponds to the
            number of connections and thread pool size. Default is 4.

        max_threads_per_query : int
            Controls the maximum number of threads per connection that can be used
            to execute one query. Default is 0, which means no limit.
        """
        self.database = database
        self.connections = [Connection(database) for _ in range(max_concurrent_queries)]
        self.connections_counter = [0 for _ in range(max_concurrent_queries)]
        self.lock = threading.Lock()

        for conn in self.connections:
            conn.init_connection()
            conn.set_max_threads_for_exec(max_threads_per_query)

        self.executor = ThreadPoolExecutor(max_workers=max_concurrent_queries)

    def __enter__(self) -> Self:
        return self

    def __exit__(
        self,
        exc_type: type[BaseException] | None,
        exc_value: BaseException | None,
        exc_traceback: TracebackType | None,
    ) -> None:
        self.close()

    def __del__(self) -> None:
        self.close()

    def __get_connection_with_least_queries(self) -> tuple[Connection, int]:
        with self.lock:
            conn_index = self.connections_counter.index(min(self.connections_counter))
            self.connections_counter[conn_index] += 1
        return self.connections[conn_index], conn_index

    def __decrement_connection_counter(self, conn_index: int) -> None:
        """Decrement the query counter for a connection."""
        with self.lock:
            self.connections_counter[conn_index] -= 1
            if self.connections_counter[conn_index] < 0:
                self.connections_counter[conn_index] = 0

    def acquire_connection(self) -> Connection:
        """
        Acquire a connection from the connection pool for temporary synchronous
        calls. If the connection pool is oversubscribed, the method returns the
        connection with the fewest queued queries. The connection must be
        released by calling `release_connection` once it is no longer needed.

        Returns
        -------
        Connection
            A connection object.
        """
        conn, _ = self.__get_connection_with_least_queries()
        return conn

    def release_connection(self, conn: Connection) -> None:
        """
        Release a connection acquired by `acquire_connection` back to the
        connection pool. Calling this method is required when the connection is
        no longer needed.

        Parameters
        ----------
        conn : Connection
            Connection object to release.
        """
        for i, existing_conn in enumerate(self.connections):
            if existing_conn == conn:
                self.__decrement_connection_counter(i)
                break

    def set_query_timeout(self, timeout_in_ms: int) -> None:
        """
        Set the query timeout value in ms for executing queries.

        Parameters
        ----------
        timeout_in_ms : int
            query timeout value in ms for executing queries.

        """
        for conn in self.connections:
            conn.set_query_timeout(timeout_in_ms)

    async def execute(
        self, query: str | PreparedStatement, parameters: dict[str, Any] | None = None
    ) -> QueryResult | list[QueryResult]:
        """
        Execute a query asynchronously.

        Parameters
        ----------
        query : str | PreparedStatement
            A prepared statement or a query string.
            If a query string is given, a prepared statement will be created
            automatically.

        parameters : dict[str, Any]
            Parameters for the query.

        Returns
        -------
        QueryResult
            Query result. If the query string contains multiple statements, a
            list of query results is returned, one per statement.

        """
        loop = asyncio.get_running_loop()
        # If the query is a prepared statement, use the connection associated with it
        if isinstance(query, PreparedStatement):
            conn = query._connection
            for i, existing_conn in enumerate(self.connections):
                if existing_conn == conn:
                    conn_index = i
                    with self.lock:
                        self.connections_counter[conn_index] += 1
                    break
        else:
            conn, conn_index = self.__get_connection_with_least_queries()

        try:
            return await loop.run_in_executor(self.executor, conn.execute, query, parameters)
        except asyncio.CancelledError:
            conn.interrupt()
            raise  # re-raise so the cancellation propagates to the caller
        finally:
            self.__decrement_connection_counter(conn_index)

    async def _prepare(self, query: str, parameters: dict[str, Any] | None = None) -> PreparedStatement:
        """
        The only parameters supported during prepare are dataframes.
        Any remaining parameters will be ignored and should be passed to execute().
        """  # noqa: D401
        loop = asyncio.get_running_loop()
        conn, conn_index = self.__get_connection_with_least_queries()

        try:
            return await loop.run_in_executor(self.executor, conn.prepare, query, parameters)
        finally:
            self.__decrement_connection_counter(conn_index)

    async def prepare(self, query: str, parameters: dict[str, Any] | None = None) -> PreparedStatement:
        """
        Create a prepared statement for a query asynchronously.

        Parameters
        ----------
        query : str
            Query to prepare.
        parameters : dict[str, Any]
            Parameters for the query.

        Returns
        -------
        PreparedStatement
            Prepared statement.

        """
        warnings.warn(
            "The use of separate prepare + execute of queries is deprecated. "
            "Please use a single call to the execute() API instead.",
            DeprecationWarning,
            stacklevel=2,
        )
        return await self._prepare(query, parameters)

    def close(self) -> None:
        """
        Close all connections and shut down the thread pool.

        Note: Calling this method is optional. The connections and thread pool
        will be closed automatically when the instance is garbage collected.
        """
        for conn in self.connections:
            conn.close()

        self.executor.shutdown(wait=True)

AsyncConnection enables asynchronous execution of queries with a pool of connections and threads.

AsyncConnection( database: Database, max_concurrent_queries: int = 4, max_threads_per_query: int = 0)

Initialise the async connection.

Parameters
  • database (Database): Database to connect to.
  • max_concurrent_queries (int): Maximum number of concurrent queries to execute. This corresponds to the number of connections and thread pool size. Default is 4.
  • max_threads_per_query (int): Controls the maximum number of threads per connection that can be used to execute one query. Default is 0, which means no limit.
database
connections
connections_counter
lock
executor
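A minimal usage sketch, assuming the `User` table from the module example above already exists:

```python
import asyncio

import kuzu

async def main() -> None:
    db = kuzu.Database("./test")
    async_conn = kuzu.AsyncConnection(db, max_concurrent_queries=4)

    # The query runs on one of the pool's worker threads.
    result = await async_conn.execute("MATCH (u:User) RETURN u.name, u.age;")
    while result.has_next():
        print(result.get_next())

    async_conn.close()

asyncio.run(main())
```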
def acquire_connection(self) -> Connection:

Acquire a connection from the connection pool for temporary synchronous calls. If the connection pool is oversubscribed, the method returns the connection with the fewest queued queries. The connection must be released by calling release_connection once it is no longer needed.

Returns
  • Connection: A connection object.
def release_connection(self, conn: Connection) -> None:

Release a connection acquired by acquire_connection back to the connection pool. Calling this method is required when the connection is no longer needed.

Parameters
  • conn (Connection): Connection object to release.
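A sketch of the acquire/release pattern with a try/finally so the connection is always returned (reusing `async_conn` from the sketch above):

```python
conn = async_conn.acquire_connection()
try:
    # Temporary synchronous use of a pooled connection.
    result = conn.execute("MATCH (u:User) RETURN count(u);")
    print(result.get_next())
finally:
    # Return the connection so its queue counter is decremented.
    async_conn.release_connection(conn)
```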
def set_query_timeout(self, timeout_in_ms: int) -> None:

Set the query timeout value in ms for executing queries.

Parameters
  • timeout_in_ms (int): query timeout value in ms for executing queries.
async def execute( self, query: str | PreparedStatement, parameters: dict[str, typing.Any] | None = None) -> QueryResult | list[QueryResult]:

Execute a query asynchronously.

Parameters
  • query (str | PreparedStatement): A prepared statement or a query string. If a query string is given, a prepared statement will be created automatically.
  • parameters (dict[str, Any]): Parameters for the query.
Returns
  • QueryResult | list[QueryResult]: Query result; a list of query results is returned if the query string contains multiple statements.
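Because each call is dispatched to a pooled connection, independent queries can be awaited concurrently from within an async function. A sketch using asyncio.gather (reusing `async_conn` from the earlier sketch):

```python
users, cities = await asyncio.gather(
    async_conn.execute("MATCH (u:User) RETURN u.name;"),
    async_conn.execute("MATCH (c:City) RETURN c.name;"),
)
```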
async def prepare( self, query: str, parameters: dict[str, typing.Any] | None = None) -> PreparedStatement:

Create a prepared statement for a query asynchronously.

Parameters
  • query (str): Query to prepare.
  • parameters (dict[str, Any]): Parameters for the query.
Returns
  • PreparedStatement: Prepared statement.
def close(self) -> None:

Close all connections and shut down the thread pool.

Note: Calling this method is optional. The connections and thread pool will be closed automatically when the instance is garbage collected.
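AsyncConnection also implements the context-manager protocol, so close() can be left to a with block; a sketch (reusing `db` from above):

```python
with kuzu.AsyncConnection(db) as async_conn:
    result = asyncio.run(async_conn.execute("RETURN 1;"))
```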

class Connection:
class Connection:
    """Connection to a database."""

    def __init__(self, database: Database, num_threads: int = 0):
        """
        Initialise kuzu database connection.

        Parameters
        ----------
        database : Database
            Database to connect to.

        num_threads : int
            Maximum number of threads to use for executing queries.

        """
        self._connection: Any = None  # (type: _kuzu.Connection from pybind11)
        self.database = database
        self.num_threads = num_threads
        self.is_closed = False
        self.init_connection()

    def __getstate__(self) -> dict[str, Any]:
        state = {
            "database": self.database,
            "num_threads": self.num_threads,
            "_connection": None,
        }
        return state

    def init_connection(self) -> None:
        """Establish a connection to the database, if not already initialised."""
        if self.is_closed:
            error_msg = "Connection is closed."
            raise RuntimeError(error_msg)
        self.database.init_database()
        if self._connection is None:
            self._connection = _kuzu.Connection(self.database._database, self.num_threads)  # type: ignore[union-attr]

    def set_max_threads_for_exec(self, num_threads: int) -> None:
        """
        Set the maximum number of threads for executing queries.

        Parameters
        ----------
        num_threads : int
            Maximum number of threads to use for executing queries.

        """
        self.init_connection()
        self._connection.set_max_threads_for_exec(num_threads)

    def close(self) -> None:
        """
        Close the connection.

        Note: Calling this method is optional. The connection will be closed
        automatically when the object goes out of scope.
        """
        if self._connection is not None:
            self._connection.close()
        self._connection = None
        self.is_closed = True

    def __enter__(self) -> Self:
        return self

    def __exit__(
        self,
        exc_type: type[BaseException] | None,
        exc_value: BaseException | None,
        exc_traceback: TracebackType | None,
    ) -> None:
        self.close()

    def execute(
        self,
        query: str | PreparedStatement,
        parameters: dict[str, Any] | None = None,
    ) -> QueryResult | list[QueryResult]:
        """
        Execute a query.

        Parameters
        ----------
        query : str | PreparedStatement
            A prepared statement or a query string.
            If a query string is given, a prepared statement will be created
            automatically.

        parameters : dict[str, Any]
            Parameters for the query.

        Returns
        -------
        QueryResult
            Query result. If the query string contains multiple statements,
            a list of query results is returned, one per statement.

        """
        if parameters is None:
            parameters = {}

        self.init_connection()
        if not isinstance(parameters, dict):
            msg = f"Parameters must be a dict; found {type(parameters)}."
            raise RuntimeError(msg)  # noqa: TRY004

        if len(parameters) == 0 and isinstance(query, str):
            query_result_internal = self._connection.query(query)
        else:
            prepared_statement = self._prepare(query, parameters) if isinstance(query, str) else query
            query_result_internal = self._connection.execute(prepared_statement._prepared_statement, parameters)
        if not query_result_internal.isSuccess():
            raise RuntimeError(query_result_internal.getErrorMessage())
        current_query_result = QueryResult(self, query_result_internal)
        if not query_result_internal.hasNextQueryResult():
            return current_query_result
        all_query_results = [current_query_result]
        while query_result_internal.hasNextQueryResult():
            query_result_internal = query_result_internal.getNextQueryResult()
            if not query_result_internal.isSuccess():
                raise RuntimeError(query_result_internal.getErrorMessage())
            all_query_results.append(QueryResult(self, query_result_internal))
        return all_query_results

    def _prepare(
        self,
        query: str,
        parameters: dict[str, Any] | None = None,
    ) -> PreparedStatement:
        """
        The only parameters supported during prepare are dataframes.
        Any remaining parameters will be ignored and should be passed to execute().
        """  # noqa: D401
        return PreparedStatement(self, query, parameters)

    def prepare(
        self,
        query: str,
        parameters: dict[str, Any] | None = None,
    ) -> PreparedStatement:
        """
        Create a prepared statement for a query.

        Parameters
        ----------
        query : str
            Query to prepare.

        parameters : dict[str, Any]
            Parameters for the query.

        Returns
        -------
        PreparedStatement
            Prepared statement.

        """
        warnings.warn(
            "The use of separate prepare + execute of queries is deprecated. "
            "Please use a single call to the execute() API instead.",
            DeprecationWarning,
            stacklevel=2,
        )
        return self._prepare(query, parameters)

    def _get_node_property_names(self, table_name: str) -> dict[str, Any]:
        LIST_START_SYMBOL = "["
        LIST_END_SYMBOL = "]"
        self.init_connection()
        query_result = self.execute(f"CALL table_info('{table_name}') RETURN *;")
        results = {}
        while query_result.has_next():
            row = query_result.get_next()
            prop_name = row[1]
            prop_type = row[2]
            is_primary_key = row[4] is True
            # LIST/ARRAY types are written as e.g. INT64[3][2]; the number of
            # "[" characters gives the dimension, the bracketed sizes the shape.
            dimension = prop_type.count(LIST_START_SYMBOL)
            splitted = prop_type.split(LIST_START_SYMBOL)
            shape = []
            for s in splitted:
                if LIST_END_SYMBOL not in s:
                    continue
                s = s.split(LIST_END_SYMBOL)[0]
                if s != "":
                    shape.append(int(s))
            prop_type = splitted[0]
            results[prop_name] = {
                "type": prop_type,
                "dimension": dimension,
                "is_primary_key": is_primary_key,
            }
            if len(shape) > 0:
                results[prop_name]["shape"] = tuple(shape)
        return results

    def _get_node_table_names(self) -> list[Any]:
        results = []
        self.init_connection()
        query_result = self.execute("CALL show_tables() RETURN *;")
        while query_result.has_next():
            row = query_result.get_next()
            if row[2] == "NODE":
                results.append(row[1])
        return results

    def _get_rel_table_names(self) -> list[dict[str, Any]]:
        results = []
        self.init_connection()
        tables_result = self.execute("CALL show_tables() RETURN *;")
        while tables_result.has_next():
            row = tables_result.get_next()
            if row[2] == "REL":
                name = row[1]
                connections_result = self.execute(f"CALL show_connection({name!r}) RETURN *;")
                src_dst_row = connections_result.get_next()
                src_node = src_dst_row[0]
                dst_node = src_dst_row[1]
                results.append({"name": name, "src": src_node, "dst": dst_node})
        return results

    def set_query_timeout(self, timeout_in_ms: int) -> None:
        """
        Set the query timeout value in ms for executing queries.

        Parameters
        ----------
        timeout_in_ms : int
            query timeout value in ms for executing queries.

        """
        self.init_connection()
        self._connection.set_query_timeout(timeout_in_ms)

    def interrupt(self) -> None:
        """
        Interrupts execution of the current query.

        If there is no currently executing query, this function does nothing.
        """
        self._connection.interrupt()

    def create_function(
        self,
        name: str,
        udf: Callable[..., Any],
        params_type: list[Type | str] | None = None,
        return_type: Type | str = "",
        *,
        default_null_handling: bool = True,
        catch_exceptions: bool = False,
    ) -> None:
        """
        Set a User Defined Function (UDF) for use in Cypher queries.

        Parameters
        ----------
        name: str
            name of function

        udf: Callable[..., Any]
            function to be executed

        params_type: Optional[list[Type]]
            list of Type enums to describe the input parameters

        return_type: Optional[Type]
            a Type enum to describe the returned value

        default_null_handling: Optional[bool]
            if true, when any parameter is null, the resulting value will be null

        catch_exceptions: Optional[bool]
            if true, when an exception is thrown from Python, the function output
            will be null; otherwise, the exception will be rethrown
        """
        if params_type is None:
            params_type = []
        # Type enums are passed to the native layer as their string values.
        parsed_params_type = [x if type(x) is str else x.value for x in params_type]
        if type(return_type) is not str:
            return_type = return_type.value

        self._connection.create_function(
            name=name,
            udf=udf,
            params_type=parsed_params_type,
            return_value=return_type,
            default_null=default_null_handling,
            catch_exceptions=catch_exceptions,
        )

    def remove_function(self, name: str) -> None:
        """
        Remove a User Defined Function (UDF).

        Parameters
        ----------
        name: str
            name of function to be removed.
        """
        self._connection.remove_function(name)

Connection to a database.

Connection(database: Database, num_threads: int = 0)

Initialise kuzu database connection.

Parameters
  • database (Database): Database to connect to.
  • num_threads (int): Maximum number of threads to use for executing queries.
database
num_threads
is_closed
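A short sketch of typical synchronous use, managing the connection with a with block (assuming the `User` table from the module example exists):

```python
import kuzu

db = kuzu.Database("./test")
with kuzu.Connection(db) as conn:
    result = conn.execute("MATCH (u:User) RETURN u.name;")
    while result.has_next():
        print(result.get_next())
```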
def init_connection(self) -> None:

Establish a connection to the database, if not already initialised.

def set_max_threads_for_exec(self, num_threads: int) -> None:

Set the maximum number of threads for executing queries.

Parameters
  • num_threads (int): Maximum number of threads to use for executing queries.
def close(self) -> None:

Close the connection.

Note: Calling this method is optional. The connection will be closed automatically when the object goes out of scope.

def execute( self, query: str | PreparedStatement, parameters: dict[str, typing.Any] | None = None) -> QueryResult | list[QueryResult]:

Execute a query.

Parameters
  • query (str | PreparedStatement): A prepared statement or a query string. If a query string is given, a prepared statement will be created automatically.
  • parameters (dict[str, Any]): Parameters for the query.
Returns
  • QueryResult | list[QueryResult]: Query result; a list of query results is returned if the query string contains multiple statements.
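Query parameters are supplied as a dict and referenced with `$name` placeholders in the query string; a sketch (reusing `conn` from the example above):

```python
result = conn.execute(
    "MATCH (u:User) WHERE u.age > $min_age RETURN u.name;",
    {"min_age": 25},
)
```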
def prepare( self, query: str, parameters: dict[str, typing.Any] | None = None) -> PreparedStatement:

Create a prepared statement for a query.

Parameters
  • query (str): Query to prepare.
  • parameters (dict[str, Any]): Parameters for the query.
Returns
  • PreparedStatement: Prepared statement.
def set_query_timeout(self, timeout_in_ms: int) -> None:

Set the query timeout value in ms for executing queries.

Parameters
  • timeout_in_ms (int): query timeout value in ms for executing queries.
def interrupt(self) -> None:

Interrupts execution of the current query.

If there is no currently executing query, this function does nothing.

def create_function( self, name: str, udf: Callable[..., Any], params_type: list[Type | str] | None = None, return_type: Type | str = '', *, default_null_handling: bool = True, catch_exceptions: bool = False) -> None:

Set a User Defined Function (UDF) for use in Cypher queries.

Parameters
  • name (str): name of function
  • udf (Callable[..., Any]): function to be executed
  • params_type (Optional[list[Type]]): list of Type enums to describe the input parameters
  • return_type (Optional[Type]): a Type enum to describe the returned value
  • default_null_handling (Optional[bool]): if true, when any parameter is null, the resulting value will be null
  • catch_exceptions (Optional[bool]): if true, when an exception is thrown from Python, the function output will be null; otherwise, the exception will be rethrown
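A sketch of registering and calling a simple UDF, reusing `conn` from above (`times_two` is an illustrative name):

```python
def times_two(x: int) -> int:
    return x * 2

# Declare parameter and return types with kuzu.Type enums (or type strings).
conn.create_function("times_two", times_two, [kuzu.Type.INT64], kuzu.Type.INT64)

result = conn.execute("MATCH (u:User) RETURN times_two(u.age);")

# Unregister when no longer needed.
conn.remove_function("times_two")
```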
def remove_function(self, name: str) -> None:

Remove a User Defined Function (UDF).

Parameters
  • name (str): name of function to be removed.
class Database:
class Database:
    """Kuzu database instance."""

    def __init__(
        self,
        database_path: str | Path | None = None,
        *,
        buffer_pool_size: int = 0,
        max_num_threads: int = 0,
        compression: bool = True,
        lazy_init: bool = False,
        read_only: bool = False,
        max_db_size: int = (1 << 43),
        auto_checkpoint: bool = True,
        checkpoint_threshold: int = -1,
    ):
        """
        Parameters
        ----------
        database_path : str, Path
            The path to database files. If the path is not specified, or empty, or equal to `:memory:`, the database
            will be created in memory.

        buffer_pool_size : int
            The maximum size of the buffer pool in bytes. Defaults to ~80% of system memory.

        max_num_threads : int
            The maximum number of threads to use for executing queries.

        compression : bool
            Enable database compression.

        lazy_init : bool
            If True, the database will not be initialized until the first query.
            This is useful when the database is not used in the main thread or
            when the main process is forked.
            Defaults to False.

        read_only : bool
            If true, the database is opened read-only. No write transactions are
            allowed on the `Database` object. Multiple read-only `Database`
            objects can be created with the same database path; however, there
            cannot be multiple `Database` objects created with the same database
            path if any of them is opened read-write.
            Defaults to False.

        max_db_size : int
            The maximum size of the database in bytes. Note that this is a
            temporary workaround for the default 8TB mmap address-space limit in
            some environments; it will be removed once a better solution is
            implemented. Defaults to 1 << 43 (8TB) on 64-bit environments and
            1GB on 32-bit ones.

        auto_checkpoint: bool
            If true, the database will automatically checkpoint when the size of
            the WAL file exceeds the checkpoint threshold.

        checkpoint_threshold: int
            The threshold of the WAL file size in bytes. When the size of the
            WAL file exceeds this threshold, the database will checkpoint if
            `auto_checkpoint` is true.

        """
        if database_path is None:
            database_path = ":memory:"
        if isinstance(database_path, Path):
            database_path = str(database_path)

        self.database_path = database_path
        self.buffer_pool_size = buffer_pool_size
        self.max_num_threads = max_num_threads
        self.compression = compression
        self.read_only = read_only
        self.max_db_size = max_db_size
        self.auto_checkpoint = auto_checkpoint
        self.checkpoint_threshold = checkpoint_threshold
        self.is_closed = False

        self._database: Any = None  # (type: _kuzu.Database from pybind11)
        if not lazy_init:
            self.init_database()

    def __enter__(self) -> Self:
        return self

    def __exit__(
        self,
        exc_type: type[BaseException] | None,
        exc_value: BaseException | None,
        exc_traceback: TracebackType | None,
    ) -> None:
        self.close()

    @staticmethod
    def get_version() -> str:
        """
        Get the version of the database.

        Returns
        -------
        str
            The version of the database.
        """
        return _kuzu.Database.get_version()  # type: ignore[union-attr]

    @staticmethod
    def get_storage_version() -> int:
        """
        Get the storage version of the database.

        Returns
        -------
        int
            The storage version of the database.
        """
        return _kuzu.Database.get_storage_version()  # type: ignore[union-attr]

    def __getstate__(self) -> dict[str, Any]:
        state = {
            "database_path": self.database_path,
            "buffer_pool_size": self.buffer_pool_size,
            "compression": self.compression,
            "read_only": self.read_only,
            "_database": None,
        }
        return state

    def init_database(self) -> None:
        """Initialize the database."""
        self.check_for_database_close()
        if self._database is None:
            self._database = _kuzu.Database(  # type: ignore[union-attr]
                self.database_path,
                self.buffer_pool_size,
                self.max_num_threads,
                self.compression,
                self.read_only,
                self.max_db_size,
                self.auto_checkpoint,
                self.checkpoint_threshold,
            )

    def get_torch_geometric_remote_backend(
        self, num_threads: int | None = None
    ) -> tuple[KuzuFeatureStore, KuzuGraphStore]:
        """
        Use the database as the remote backend for torch_geometric.

        For the interface of the remote backend, please refer to
        https://pytorch-geometric.readthedocs.io/en/latest/advanced/remote.html.
        The current implementation is read-only and does not support edge
        features. The IDs of the nodes are based on the internal IDs (i.e., node
        offsets). For the remote node IDs to be consistent with the positions in
        the output tensors, please ensure that no deletion has been performed
        on the node tables.

        The remote backend can also be plugged into the data loader of
        torch_geometric, which is useful for mini-batch training. For example:

        ```python
            loader_kuzu = NeighborLoader(
                data=(feature_store, graph_store),
                num_neighbors={('paper', 'cites', 'paper'): [12, 12, 12]},
                batch_size=LOADER_BATCH_SIZE,
                input_nodes=('paper', input_nodes),
                num_workers=4,
                filter_per_worker=False,
            )
        ```

        Please note that the database instance is not fork-safe, so if more than
        one worker is used, `filter_per_worker` must be set to False.

        Parameters
        ----------
        num_threads : int
            Number of threads to use for data loading. Defaults to None, which
            means using the number of CPU cores.

        Returns
        -------
        feature_store : KuzuFeatureStore
            Feature store compatible with torch_geometric.
        graph_store : KuzuGraphStore
            Graph store compatible with torch_geometric.
        """
        self.check_for_database_close()
        from .torch_geometric_feature_store import KuzuFeatureStore
        from .torch_geometric_graph_store import KuzuGraphStore

        return (
            KuzuFeatureStore(self, num_threads),
            KuzuGraphStore(self, num_threads),
        )

    def _scan_node_table(
        self,
        table_name: str,
        prop_name: str,
        prop_type: str,
        dim: int,
        indices: IndexType,
        num_threads: int,
    ) -> NDArray[Any]:
        """
        Scan a node table from storage directly, bypassing the query engine.
        Used internally by the torch_geometric remote backend only.
        """
        self.check_for_database_close()
        import numpy as np

        self.init_database()
        indices_cast = np.array(indices, dtype=np.uint64)
        result = None

        # Dispatch to the native scan routine matching the property's type.
        if prop_type == Type.INT64.value:
            result = np.empty(len(indices) * dim, dtype=np.int64)
            self._database.scan_node_table_as_int64(table_name, prop_name, indices_cast, result, num_threads)
        elif prop_type == Type.INT32.value:
            result = np.empty(len(indices) * dim, dtype=np.int32)
            self._database.scan_node_table_as_int32(table_name, prop_name, indices_cast, result, num_threads)
        elif prop_type == Type.INT16.value:
            result = np.empty(len(indices) * dim, dtype=np.int16)
            self._database.scan_node_table_as_int16(table_name, prop_name, indices_cast, result, num_threads)
        elif prop_type == Type.DOUBLE.value:
            result = np.empty(len(indices) * dim, dtype=np.float64)
            self._database.scan_node_table_as_double(table_name, prop_name, indices_cast, result, num_threads)
        elif prop_type == Type.FLOAT.value:
            result = np.empty(len(indices) * dim, dtype=np.float32)
            self._database.scan_node_table_as_float(table_name, prop_name, indices_cast, result, num_threads)

        if result is not None:
            return result

        msg = f"Unsupported property type: {prop_type}"
        raise ValueError(msg)

    def close(self) -> None:
        """
        Close the database. Once the database is closed, the lock on the database
        files is released and the database can be opened in another process.

        Note: Calling this method is not required. The Python garbage collector
        will automatically close the database when no references to the database
        object exist. It is recommended not to call this method explicitly. If you
        decide to manually close the database, make sure that all the QueryResult
        and Connection objects are closed before calling this method.
        """
        if self.is_closed:
            return
        self.is_closed = True
        if self._database is not None:
            self._database.close()
            self._database: Any = None  # (type: _kuzu.Database from pybind11)

    def check_for_database_close(self) -> None:
        """
        Check if the database is closed and raise an exception if it is.

        Raises
        ------
        Exception
            If the database is closed.

        """
        if not self.is_closed:
            return
        msg = "Database is closed"
        raise RuntimeError(msg)

Kuzu database instance.

Database( database_path: str | pathlib.Path | None = None, *, buffer_pool_size: int = 0, max_num_threads: int = 0, compression: bool = True, lazy_init: bool = False, read_only: bool = False, max_db_size: int = 8796093022208, auto_checkpoint: bool = True, checkpoint_threshold: int = -1)
Parameters
  • database_path (str, Path): The path to database files. If the path is not specified, or empty, or equal to :memory:, the database will be created in memory.
  • buffer_pool_size (int): The maximum size of the buffer pool in bytes. Defaults to ~80% of system memory.
  • max_num_threads (int): The maximum number of threads to use for executing queries.
  • compression (bool): Enable database compression.
  • lazy_init (bool): If True, the database will not be initialized until the first query. This is useful when the database is not used in the main thread or when the main process is forked. Defaults to False.
  • read_only (bool): If true, the database is opened read-only. No write transactions are allowed on the Database object. Multiple read-only Database objects can be created with the same database path; however, there cannot be multiple Database objects created with the same database path if any of them is opened read-write. Defaults to False.
  • max_db_size (int): The maximum size of the database in bytes. Note that this is a temporary workaround for the default 8TB mmap address-space limit in some environments; it will be removed once a better solution is implemented. Defaults to 1 << 43 (8TB) on 64-bit environments and 1GB on 32-bit ones.
  • auto_checkpoint (bool): If true, the database will automatically checkpoint when the size of the WAL file exceeds the checkpoint threshold.
  • checkpoint_threshold (int): The threshold of the WAL file size in bytes. When the size of the WAL file exceeds this threshold, the database will checkpoint if auto_checkpoint is true.
database_path
buffer_pool_size
max_num_threads
compression
read_only
max_db_size
auto_checkpoint
checkpoint_threshold
is_closed
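A sketch of common configurations (paths are illustrative):

```python
import kuzu

# In-memory database: omit the path or pass ":memory:".
mem_db = kuzu.Database()

# On-disk database with an explicit 1GB buffer pool.
db = kuzu.Database("./demo_db", buffer_pool_size=1024 ** 3)
db.close()  # release the file lock before reopening

# Reopen the same files read-only; multiple read-only handles may share a path.
ro_db = kuzu.Database("./demo_db", read_only=True)
```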
@staticmethod
def get_version() -> str:

Get the version of the database.

Returns
  • str: The version of the database.
@staticmethod
def get_storage_version() -> int:

Get the storage version of the database.

Returns
  • int: The storage version of the database.
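Both methods are static, so they can be called without opening a database; for example:

```python
import kuzu

# No Database instance is required for either call.
print(kuzu.Database.get_version())          # version string
print(kuzu.Database.get_storage_version())  # integer storage version
```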
def init_database(self) -> None:
152    def init_database(self) -> None:
153        """Initialize the database."""
154        self.check_for_database_close()
155        if self._database is None:
156            self._database = _kuzu.Database(  # type: ignore[union-attr]
157                self.database_path,
158                self.buffer_pool_size,
159                self.max_num_threads,
160                self.compression,
161                self.read_only,
162                self.max_db_size,
163                self.auto_checkpoint,
164                self.checkpoint_threshold,
165            )

Initialize the database.
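This pairs with the lazy_init constructor flag: initialization can be deferred until after worker processes are forked, then triggered explicitly (a minimal sketch; the path is hypothetical):

```python
import kuzu

# Create the handle without touching the native database yet.
db = kuzu.Database("./demo_db", lazy_init=True)

# ... fork or spawn worker processes here ...

db.init_database()  # no-op if the database is already initialized
conn = kuzu.Connection(db)
```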

def get_torch_geometric_remote_backend(self, num_threads: int | None = None) -> tuple[kuzu.torch_geometric_feature_store.KuzuFeatureStore, kuzu.torch_geometric_graph_store.KuzuGraphStore]:
167    def get_torch_geometric_remote_backend(
168        self, num_threads: int | None = None
169    ) -> tuple[KuzuFeatureStore, KuzuGraphStore]:
170        """
171        Use the database as the remote backend for torch_geometric.
172
173        For the interface of the remote backend, please refer to
174        https://pytorch-geometric.readthedocs.io/en/latest/advanced/remote.html.
175        The current implementation is read-only and does not support edge
176        features. The IDs of the nodes are based on the internal IDs (i.e., node
177        offsets). For the remote node IDs to be consistent with the positions in
178        the output tensors, please ensure that no deletion has been performed
179        on the node tables.
180
181        The remote backend can also be plugged into the data loader of
182        torch_geometric, which is useful for mini-batch training. For example:
183
184        ```python
185            loader_kuzu = NeighborLoader(
186                data=(feature_store, graph_store),
187                num_neighbors={('paper', 'cites', 'paper'): [12, 12, 12]},
188                batch_size=LOADER_BATCH_SIZE,
189                input_nodes=('paper', input_nodes),
190                num_workers=4,
191                filter_per_worker=False,
192            )
193        ```
194
195        Please note that the database instance is not fork-safe, so if more than
196        one worker is used, `filter_per_worker` must be set to False.
197
198        Parameters
199        ----------
200        num_threads : int
201            Number of threads to use for data loading. Defaults to None, which
202            means using the number of CPU cores.
203
204        Returns
205        -------
206        feature_store : KuzuFeatureStore
207            Feature store compatible with torch_geometric.
208        graph_store : KuzuGraphStore
209            Graph store compatible with torch_geometric.
210        """
211        self.check_for_database_close()
212        from .torch_geometric_feature_store import KuzuFeatureStore
213        from .torch_geometric_graph_store import KuzuGraphStore
214
215        return (
216            KuzuFeatureStore(self, num_threads),
217            KuzuGraphStore(self, num_threads),
218        )

Use the database as the remote backend for torch_geometric.

For the interface of the remote backend, please refer to https://pytorch-geometric.readthedocs.io/en/latest/advanced/remote.html. The current implementation is read-only and does not support edge features. The IDs of the nodes are based on the internal IDs (i.e., node offsets). For the remote node IDs to be consistent with the positions in the output tensors, please ensure that no deletion has been performed on the node tables.

The remote backend can also be plugged into the data loader of torch_geometric, which is useful for mini-batch training. For example:

    loader_kuzu = NeighborLoader(
        data=(feature_store, graph_store),
        num_neighbors={('paper', 'cites', 'paper'): [12, 12, 12]},
        batch_size=LOADER_BATCH_SIZE,
        input_nodes=('paper', input_nodes),
        num_workers=4,
        filter_per_worker=False,
    )

Please note that the database instance is not fork-safe, so if more than one worker is used, filter_per_worker must be set to False.

Parameters
  • num_threads (int): Number of threads to use for data loading. Defaults to None, which means using the number of CPU cores.
Returns
  • feature_store (KuzuFeatureStore): Feature store compatible with torch_geometric.
  • graph_store (KuzuGraphStore): Graph store compatible with torch_geometric.
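A sketch of obtaining the stores (num_threads=2 is an illustrative value):

```python
# The returned pair plugs into torch_geometric wherever a remote backend
# (feature_store, graph_store) is accepted, e.g. the NeighborLoader above.
feature_store, graph_store = db.get_torch_geometric_remote_backend(num_threads=2)
```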
def close(self) -> None:
262    def close(self) -> None:
263        """
264        Close the database. Once the database is closed, the lock on the database
265        files is released and the database can be opened in another process.
266
267        Note: Calling this method is not required. The Python garbage collector
268        will automatically close the database when no references to the database
269        object exist. It is recommended not to call this method explicitly. If you
270        decide to manually close the database, make sure that all the QueryResult
271        and Connection objects are closed before calling this method.
272        """
273        if self.is_closed:
274            return
275        self.is_closed = True
276        if self._database is not None:
277            self._database.close()
278            self._database: Any = None  # (type: _kuzu.Database from pybind11)

Close the database. Once the database is closed, the lock on the database files is released and the database can be opened in another process.

Note: Calling this method is not required. The Python garbage collector will automatically close the database when no references to the database object exist. It is recommended not to call this method explicitly; if you decide to close the database manually, make sure that all QueryResult and Connection objects are closed first.
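If you do close manually, close the dependents first (a minimal sketch, assuming db, conn, and a result from an earlier query):

```python
results = conn.execute("MATCH (u:User) RETURN u.name")
results.close()  # close query results first
conn.close()     # then connections
db.close()       # finally the database; later use raises RuntimeError
```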

def check_for_database_close(self) -> None:
280    def check_for_database_close(self) -> None:
281        """
282        Check if the database is closed and raise an exception if it is.
283
284        Raises
285        ------
286        Exception
287            If the database is closed.
288
289        """
290        if not self.is_closed:
291            return
292        msg = "Database is closed"
293        raise RuntimeError(msg)

Check if the database is closed and raise an exception if it is.

Raises
  • Exception: If the database is closed.
class PreparedStatement:
10class PreparedStatement:
11    """
12    A prepared statement is a parameterized query which can avoid planning the
13    same query for repeated execution.
14    """
15
16    def __init__(self, connection: Connection, query: str, parameters: dict[str, Any] | None = None):
17        """
18        Parameters
19        ----------
20        connection : Connection
21            Connection to a database.
22        query : str
23            Query to prepare.
24        parameters : dict[str, Any]
25            Parameters for the query.
26        """
27        if parameters is None:
28            parameters = {}
29        self._prepared_statement = connection._connection.prepare(query, parameters)
30        self._connection = connection
31
32    def is_success(self) -> bool:
33        """
34        Check if the prepared statement is successfully prepared.
35
36        Returns
37        -------
38        bool
39            True if the prepared statement is successfully prepared.
40        """
41        return self._prepared_statement.is_success()
42
43    def get_error_message(self) -> str:
44        """
45        Get the error message if the query is not prepared successfully.
46
47        Returns
48        -------
49        str
50            Error message.
51        """
52        return self._prepared_statement.get_error_message()

A prepared statement is a parameterized query which can avoid planning the same query for repeated execution.

PreparedStatement(connection: Connection, query: str, parameters: dict[str, typing.Any] | None = None)
16    def __init__(self, connection: Connection, query: str, parameters: dict[str, Any] | None = None):
17        """
18        Parameters
19        ----------
20        connection : Connection
21            Connection to a database.
22        query : str
23            Query to prepare.
24        parameters : dict[str, Any]
25            Parameters for the query.
26        """
27        if parameters is None:
28            parameters = {}
29        self._prepared_statement = connection._connection.prepare(query, parameters)
30        self._connection = connection
Parameters
  • connection (Connection): Connection to a database.
  • query (str): Query to prepare.
  • parameters (dict[str, Any]): Parameters for the query.
def is_success(self) -> bool:
32    def is_success(self) -> bool:
33        """
34        Check if the prepared statement is successfully prepared.
35
36        Returns
37        -------
38        bool
39            True if the prepared statement is successfully prepared.
40        """
41        return self._prepared_statement.is_success()

Check if the prepared statement is successfully prepared.

Returns
  • bool: True if the prepared statement is successfully prepared.
def get_error_message(self) -> str:
43    def get_error_message(self) -> str:
44        """
45        Get the error message if the query is not prepared successfully.
46
47        Returns
48        -------
49        str
50            Error message.
51        """
52        return self._prepared_statement.get_error_message()

Get the error message if the query is not prepared successfully.

Returns
  • str: Error message.
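Putting the pieces together, a prepared statement might be used like this (a minimal sketch reusing the User table from the package example):

```python
import kuzu

db = kuzu.Database(":memory:")
conn = kuzu.Connection(db)
conn.execute("CREATE NODE TABLE User(name STRING, age INT64, PRIMARY KEY (name))")

# Prepare once, then execute repeatedly with different bindings.
stmt = kuzu.PreparedStatement(conn, "CREATE (u:User {name: $name, age: $age})")
if not stmt.is_success():
    raise RuntimeError(stmt.get_error_message())

for name, age in [("Adam", 30), ("Karissa", 40)]:
    conn.execute(stmt, {"name": name, "age": age})
```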
class QueryResult:
 29class QueryResult:
 30    """QueryResult stores the result of a query execution."""
 31
 32    def __init__(self, connection: _kuzu.Connection, query_result: _kuzu.QueryResult):  # type: ignore[name-defined]
 33        """
 34        Parameters
 35        ----------
 36        connection : _kuzu.Connection
 37            The underlying C++ connection object from pybind11.
 38
 39        query_result : _kuzu.QueryResult
 40            The underlying C++ query result object from pybind11.
 41
 42        """
 43        self.connection = connection
 44        self._query_result = query_result
 45        self.is_closed = False
 46        self.as_dict = False
 47
 48    def __enter__(self) -> Self:
 49        return self
 50
 51    def __exit__(
 52        self,
 53        exc_type: type[BaseException] | None,
 54        exc_value: BaseException | None,
 55        exc_traceback: TracebackType | None,
 56    ) -> None:
 57        self.close()
 58
 59    def __del__(self) -> None:
 60        self.close()
 61
 62    def __iter__(self) -> Iterator[list[Any] | dict[str, Any]]:
 63        return self
 64
 65    def __next__(self) -> list[Any] | dict[str, Any]:
 66        if self.has_next():
 67            return self.get_next()
 68
 69        raise StopIteration
 70
 71    def has_next(self) -> bool:
 72        """
 73        Check if there are more rows in the query result.
 74
 75        Returns
 76        -------
 77        bool
 78            True if there are more rows in the query result, False otherwise.
 79        """
 80        self.check_for_query_result_close()
 81        return self._query_result.hasNext()
 82
 83    def get_next(self) -> list[Any] | dict[str, Any]:
 84        """
 85        Get the next row in the query result.
 86
 87        Returns
 88        -------
 89        list
 90            Next row in the query result.
 91
 92        Raises
 93        ------
 94        Exception
 95            If there are no more rows.
 96        """
 97        self.check_for_query_result_close()
 98        row = self._query_result.getNext()
 99        return _row_to_dict(self.columns, row) if self.as_dict else row
100
101    def get_all(self) -> list[list[Any] | dict[str, Any]]:
102        """
103        Get all remaining rows in the query result.
104
105        Returns
106        -------
107        list
108            All remaining rows in the query result.
109        """
110        return list(self)
111
112    def get_n(self, count: int) -> list[list[Any] | dict[str, Any]]:
113        """
114        Get many rows in the query result.
115
116        Returns
117        -------
118        list
119            Up to `count` rows in the query result.
120        """
121        results = []
122        while self.has_next() and count > 0:
123            results.append(self.get_next())
124            count -= 1
125        return results
126
127    def close(self) -> None:
128        """Close the query result."""
129        if not self.is_closed:
130            # Allows the connection to be garbage collected if the query result
131            # is closed manually by the user.
132            self._query_result.close()
133            self.connection = None
134            self.is_closed = True
135
136    def check_for_query_result_close(self) -> None:
137        """
138        Check if the query result is closed and raise an exception if it is.
139
140        Raises
141        ------
142        Exception
143            If the query result is closed.
144
145        """
146        if self.is_closed:
147            msg = "Query result is closed"
148            raise RuntimeError(msg)
149
150    def get_as_df(self) -> pd.DataFrame:
151        """
152        Get the query result as a Pandas DataFrame.
153
154        See Also
155        --------
156        get_as_pl : Get the query result as a Polars DataFrame.
157        get_as_arrow : Get the query result as a PyArrow Table.
158
159        Returns
160        -------
161        pandas.DataFrame
162            Query result as a Pandas DataFrame.
163
164        """
165        self.check_for_query_result_close()
166
167        return self._query_result.getAsDF()
168
169    def get_as_pl(self) -> pl.DataFrame:
170        """
171        Get the query result as a Polars DataFrame.
172
173        See Also
174        --------
175        get_as_df : Get the query result as a Pandas DataFrame.
176        get_as_arrow : Get the query result as a PyArrow Table.
177
178        Returns
179        -------
180        polars.DataFrame
181            Query result as a Polars DataFrame.
182        """
183        import polars as pl
184
185        self.check_for_query_result_close()
186
187        # note: polars should always export just a single chunk,
188        # (eg: "-1") otherwise it will just need to rechunk anyway
189        return pl.from_arrow(  # type: ignore[return-value]
190            data=self.get_as_arrow(chunk_size=-1),
191        )
192
193    def get_as_arrow(self, chunk_size: int | None = None) -> pa.Table:
194        """
195        Get the query result as a PyArrow Table.
196
197        Parameters
198        ----------
199        chunk_size : int, optional
200            Number of rows to include in each chunk, where:
201            - None: adaptive chunk size, based on the number of columns
202              in the query result.
203            - -1 or 0: the entire result is returned as a single chunk.
204            - > 0: the chunk size is the number of rows specified.
205            Defaults to None.
206
207        See Also
208        --------
209        get_as_pl : Get the query result as a Polars DataFrame.
210        get_as_df : Get the query result as a Pandas DataFrame.
211
212        Returns
213        -------
214        pyarrow.Table
215            Query result as a PyArrow Table.
216        """
217        self.check_for_query_result_close()
218
219        if chunk_size is None:
220            # Adaptive; target 10m total elements in each chunk.
221            # (eg: if we had 10 cols, this would result in a 1m row chunk_size).
222            target_n_elems = 10_000_000
223            chunk_size = max(target_n_elems // len(self.get_column_names()), 10)
224        elif chunk_size <= 0:
225            # No chunking: return the entire result as a single chunk
226            chunk_size = self.get_num_tuples()
227
228        return self._query_result.getAsArrow(chunk_size)
229
230    def get_column_data_types(self) -> list[str]:
231        """
232        Get the data types of the columns in the query result.
233
234        Returns
235        -------
236        list
237            Data types of the columns in the query result.
238
239        """
240        self.check_for_query_result_close()
241        return self._query_result.getColumnDataTypes()
242
243    def get_column_names(self) -> list[str]:
244        """
245        Get the names of the columns in the query result.
246
247        Returns
248        -------
249        list
250            Names of the columns in the query result.
251
252        """
253        self.check_for_query_result_close()
254        return self._query_result.getColumnNames()
255
256    def get_schema(self) -> dict[str, str]:
257        """
258        Get the column schema of the query result.
259
260        Returns
261        -------
262        dict
263            Schema of the query result.
264
265        """
266        self.check_for_query_result_close()
267        return dict(
268            zip(
269                self._query_result.getColumnNames(),
270                self._query_result.getColumnDataTypes(),
271            )
272        )
273
274    def reset_iterator(self) -> None:
275        """Reset the iterator of the query result."""
276        self.check_for_query_result_close()
277        self._query_result.resetIterator()
278
279    def get_as_networkx(
280        self,
281        directed: bool = True,  # noqa: FBT001
282    ) -> nx.MultiGraph | nx.MultiDiGraph:
283        """
284        Convert the nodes and rels in the query result into a NetworkX directed or undirected graph
285        with the following rules:
286        - Columns with data types other than node or rel will be ignored.
287        - Duplicated nodes and rels will be converted only once.
288
289        Parameters
290        ----------
291        directed : bool
292            Whether the graph should be directed. Defaults to True.
293
294        Returns
295        -------
296        networkx.MultiDiGraph or networkx.MultiGraph
297            Query result as a NetworkX graph.
298
299        """
300        self.check_for_query_result_close()
301        import networkx as nx
302
303        nx_graph = nx.MultiDiGraph() if directed else nx.MultiGraph()
304        properties_to_extract = self._get_properties_to_extract()
305
306        self.reset_iterator()
307
308        nodes = {}
309        rels = {}
310        table_to_label_dict = {}
311        table_primary_key_dict = {}
312
313        def encode_node_id(node: dict[str, Any], table_primary_key_dict: dict[str, Any]) -> str:
314            node_label = node["_label"]
315            return f"{node_label}_{node[table_primary_key_dict[node_label]]!s}"
316
317        def encode_rel_id(rel: dict[str, Any]) -> tuple[int, int]:
318            return rel["_id"]["table"], rel["_id"]["offset"]
319
320        # De-duplicate nodes and rels
321        while self.has_next():
322            row = self.get_next()
323            for i in properties_to_extract:
324                # Skip empty nodes and rels, which may be returned by
325                # OPTIONAL MATCH
326                if row[i] is None or row[i] == {}:
327                    continue
328                column_type, _ = properties_to_extract[i]
329                if column_type == Type.NODE.value:
330                    nid = row[i]["_id"]
331                    nodes[nid["table"], nid["offset"]] = row[i]
332                    table_to_label_dict[nid["table"]] = row[i]["_label"]
333
334                elif column_type == Type.REL.value:
335                    rels[encode_rel_id(row[i])] = row[i]
336
337                elif column_type == Type.RECURSIVE_REL.value:
338                    for node in row[i]["_nodes"]:
339                        nid = node["_id"]
340                        nodes[nid["table"], nid["offset"]] = node
341                        table_to_label_dict[nid["table"]] = node["_label"]
342                    for rel in row[i]["_rels"]:
343                        for key in list(rel.keys()):
344                            if rel[key] is None:
345                                del rel[key]
346                        rels[encode_rel_id(rel)] = rel
347
348        # Add nodes
349        for node in nodes.values():
350            nid = node["_id"]
351            node_id = node["_label"] + "_" + str(nid["offset"])
352            if node["_label"] not in table_primary_key_dict:
353                props = self.connection._get_node_property_names(node["_label"])
354                for prop_name in props:
355                    if props[prop_name]["is_primary_key"]:
356                        table_primary_key_dict[node["_label"]] = prop_name
357                        break
358            node_id = encode_node_id(node, table_primary_key_dict)
359            node[node["_label"]] = True
360            nx_graph.add_node(node_id, **node)
361
362        # Add rels
363        for rel in rels.values():
364            src = rel["_src"]
365            dst = rel["_dst"]
366            src_node = nodes[src["table"], src["offset"]]
367            dst_node = nodes[dst["table"], dst["offset"]]
368            src_id = encode_node_id(src_node, table_primary_key_dict)
369            dst_id = encode_node_id(dst_node, table_primary_key_dict)
370            nx_graph.add_edge(src_id, dst_id, **rel)
371        return nx_graph
372
373    def _get_properties_to_extract(self) -> dict[int, tuple[str, str]]:
374        column_names = self.get_column_names()
375        column_types = self.get_column_data_types()
376        properties_to_extract = {}
377
378        # Iterate over columns and extract nodes and rels, ignoring other columns
379        for i in range(len(column_names)):
380            column_name = column_names[i]
381            column_type = column_types[i]
382            if column_type in [
383                Type.NODE.value,
384                Type.REL.value,
385                Type.RECURSIVE_REL.value,
386            ]:
387                properties_to_extract[i] = (column_type, column_name)
388        return properties_to_extract
389
390    def get_as_torch_geometric(self) -> tuple[geo.Data | geo.HeteroData, dict, dict, dict]:  # type: ignore[type-arg]
391        """
392        Convert the nodes and rels in the query result into a PyTorch Geometric graph representation,
393        torch_geometric.data.Data or torch_geometric.data.HeteroData.
394
395        For node conversion, numerical and boolean properties are converted directly into tensors and
396        stored in Data/HeteroData. Properties that cannot be converted into tensors automatically
397        (please refer to the notes below for more detail) are returned as unconverted_properties.
398
399        For rel conversion, each rel is converted directly into an edge_index tensor. Edge properties
400        are returned as edge_properties.
401
402        Node properties that cannot be converted into tensor automatically:
403        - If the type of a node property is not one of INT64, DOUBLE, or BOOL, it cannot be converted
404          automatically.
405        - If a node property contains a null value, it cannot be converted automatically.
406        - If a node property contains a nested list of variable length (e.g. [[1,2],[3]]), it cannot be
407          converted automatically.
408        - If a node property is a list or nested list, but the shape is inconsistent (e.g. the list length
409          is 6 for one node but 5 for another node), it cannot be converted automatically.
410
411        Additional conversion rules:
412        - Columns with data type other than node or rel will be ignored.
413        - Duplicated nodes and rels will be converted only once.
414
415        Returns
416        -------
417        torch_geometric.data.Data or torch_geometric.data.HeteroData
418            Query result as a PyTorch Geometric graph. Containing numeric or boolean node properties
419            and edge_index tensor.
420
421        dict
422            A dictionary that maps the positional offset of each node in Data/HeteroData to its primary
423            key in the database.
424
425        dict
426            A dictionary containing node properties that cannot be converted into tensors automatically. The
427            order of values for each property is aligned with nodes in Data/HeteroData.
428
429        dict
430            A dictionary containing edge properties. The order of values for each property is aligned with
431            edge_index in Data/HeteroData.
432        """
433        self.check_for_query_result_close()
434        # Although we are not using torch_geometric in this file, we need to
435        # import it here to throw an error early if the user does not have
436        # torch_geometric or torch installed.
437
438        converter = TorchGeometricResultConverter(self)
439        return converter.get_as_torch_geometric()
440
441    def get_execution_time(self) -> int:
442        """
443        Get the time in ms that was required to execute the query.
444
445        Returns
446        -------
447        double
448            Query execution time in ms.
449
450        """
451        self.check_for_query_result_close()
452        return self._query_result.getExecutionTime()
453
454    def get_compiling_time(self) -> int:
455        """
456        Get the time in ms that was required to compile the query.
457
458        Returns
459        -------
460        double
461            Query compile time in ms.
462
463        """
464        self.check_for_query_result_close()
465        return self._query_result.getCompilingTime()
466
467    def get_num_tuples(self) -> int:
468        """
469        Get the number of tuples which the query returned.
470
471        Returns
472        -------
473        int
474            Number of tuples.
475
476        """
477        self.check_for_query_result_close()
478        return self._query_result.getNumTuples()
479
480    def rows_as_dict(self, state=True) -> Self:
481        """
482        Change the format of the results, such that each row is a dict with the
483        column name as a key.
484
485        Parameters
486        ----------
487        state
488            Whether to turn dict formatting on or off. Turns it on by default.
489
490        Returns
491        -------
492        self
493            The object itself.
494
495        """
496        self.as_dict = state
497        if state:
498            self.columns = self.get_column_names()
499        return self

QueryResult stores the result of a query execution.

QueryResult(connection: Connection, query_result: '_kuzu.QueryResult')
32    def __init__(self, connection: _kuzu.Connection, query_result: _kuzu.QueryResult):  # type: ignore[name-defined]
33        """
34        Parameters
35        ----------
36        connection : _kuzu.Connection
37            The underlying C++ connection object from pybind11.
38
39        query_result : _kuzu.QueryResult
40            The underlying C++ query result object from pybind11.
41
42        """
43        self.connection = connection
44        self._query_result = query_result
45        self.is_closed = False
46        self.as_dict = False
Parameters
  • connection (_kuzu.Connection): The underlying C++ connection object from pybind11.
  • query_result (_kuzu.QueryResult): The underlying C++ query result object from pybind11.
connection
is_closed
as_dict
def has_next(self) -> bool:
71    def has_next(self) -> bool:
72        """
73        Check if there are more rows in the query result.
74
75        Returns
76        -------
77        bool
78            True if there are more rows in the query result, False otherwise.
79        """
80        self.check_for_query_result_close()
81        return self._query_result.hasNext()

Check if there are more rows in the query result.

Returns
  • bool: True if there are more rows in the query result, False otherwise.
def get_next(self) -> list[typing.Any] | dict[str, typing.Any]:
83    def get_next(self) -> list[Any] | dict[str, Any]:
84        """
85        Get the next row in the query result.
86
87        Returns
88        -------
89        list
90            Next row in the query result.
91
92        Raises
93        ------
94        Exception
95            If there are no more rows.
96        """
97        self.check_for_query_result_close()
98        row = self._query_result.getNext()
99        return _row_to_dict(self.columns, row) if self.as_dict else row

Get the next row in the query result.

Returns
  • list: Next row in the query result.
Raises
  • Exception: If there are no more rows.
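In practice, QueryResult is also an iterator and a context manager, so rows are usually consumed like this:

```python
# The context manager closes the result when the block exits.
with conn.execute("MATCH (u:User) RETURN u.name, u.age") as result:
    for row in result:
        print(row)  # e.g. ['Adam', 30]
```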
def get_all(self) -> list[list[typing.Any] | dict[str, typing.Any]]:
101    def get_all(self) -> list[list[Any] | dict[str, Any]]:
102        """
103        Get all remaining rows in the query result.
104
105        Returns
106        -------
107        list
108            All remaining rows in the query result.
109        """
110        return list(self)

Get all remaining rows in the query result.

Returns
  • list: All remaining rows in the query result.
def get_n(self, count: int) -> list[list[typing.Any] | dict[str, typing.Any]]:
112    def get_n(self, count: int) -> list[list[Any] | dict[str, Any]]:
113        """
114        Get many rows in the query result.
115
116        Returns
117        -------
118        list
119            Up to `count` rows in the query result.
120        """
121        results = []
122        while self.has_next() and count > 0:
123            results.append(self.get_next())
124            count -= 1
125        return results

Get many rows in the query result.

Returns
  • list: Up to count rows in the query result.
def close(self) -> None:
127    def close(self) -> None:
128        """Close the query result."""
129        if not self.is_closed:
130            # Allows the connection to be garbage collected if the query result
131            # is closed manually by the user.
132            self._query_result.close()
133            self.connection = None
134            self.is_closed = True

Close the query result.

def check_for_query_result_close(self) -> None:
136    def check_for_query_result_close(self) -> None:
137        """
138        Check if the query result is closed and raise an exception if it is.
139
140        Raises
141        ------
142        Exception
143            If the query result is closed.
144
145        """
146        if self.is_closed:
147            msg = "Query result is closed"
148            raise RuntimeError(msg)

Check if the query result is closed and raise an exception if it is.

Raises
  • Exception: If the query result is closed.
def get_as_df(self) -> pandas.core.frame.DataFrame:
150    def get_as_df(self) -> pd.DataFrame:
151        """
152        Get the query result as a Pandas DataFrame.
153
154        See Also
155        --------
156        get_as_pl : Get the query result as a Polars DataFrame.
157        get_as_arrow : Get the query result as a PyArrow Table.
158
159        Returns
160        -------
161        pandas.DataFrame
162            Query result as a Pandas DataFrame.
163
164        """
165        self.check_for_query_result_close()
166
167        return self._query_result.getAsDF()

Get the query result as a Pandas DataFrame.

See Also

get_as_pl: Get the query result as a Polars DataFrame.
get_as_arrow: Get the query result as a PyArrow Table.

Returns
  • pandas.DataFrame: Query result as a Pandas DataFrame.
def get_as_pl(self) -> polars.dataframe.frame.DataFrame:
169    def get_as_pl(self) -> pl.DataFrame:
170        """
171        Get the query result as a Polars DataFrame.
172
173        See Also
174        --------
175        get_as_df : Get the query result as a Pandas DataFrame.
176        get_as_arrow : Get the query result as a PyArrow Table.
177
178        Returns
179        -------
180        polars.DataFrame
181            Query result as a Polars DataFrame.
182        """
183        import polars as pl
184
185        self.check_for_query_result_close()
186
187        # note: polars should always export just a single chunk,
188        # (eg: "-1") otherwise it will just need to rechunk anyway
189        return pl.from_arrow(  # type: ignore[return-value]
190            data=self.get_as_arrow(chunk_size=-1),
191        )

Get the query result as a Polars DataFrame.

See Also

get_as_df: Get the query result as a Pandas DataFrame.
get_as_arrow: Get the query result as a PyArrow Table.

Returns
  • polars.DataFrame: Query result as a Polars DataFrame.
def get_as_arrow(self, chunk_size: int | None = None) -> pyarrow.lib.Table:
193    def get_as_arrow(self, chunk_size: int | None = None) -> pa.Table:
194        """
195        Get the query result as a PyArrow Table.
196
197        Parameters
198        ----------
199        chunk_size : int, optional
200            Number of rows to include in each chunk, where:
201            - None: adaptive chunk size, based on the number of columns
202              in the query result.
203            - -1 or 0: the entire result is returned as a single chunk.
204            - > 0: the chunk size is the number of rows specified.
205            Defaults to None.
206
207        See Also
208        --------
209        get_as_pl : Get the query result as a Polars DataFrame.
210        get_as_df : Get the query result as a Pandas DataFrame.
211
212        Returns
213        -------
214        pyarrow.Table
215            Query result as a PyArrow Table.
216        """
217        self.check_for_query_result_close()
218
219        if chunk_size is None:
220            # Adaptive; target 10m total elements in each chunk.
221            # (eg: if we had 10 cols, this would result in a 1m row chunk_size).
222            target_n_elems = 10_000_000
223            chunk_size = max(target_n_elems // len(self.get_column_names()), 10)
224        elif chunk_size <= 0:
225            # No chunking: return the entire result as a single chunk
226            chunk_size = self.get_num_tuples()
227
228        return self._query_result.getAsArrow(chunk_size)

Get the query result as a PyArrow Table.

Parameters
  • chunk_size (int, optional): Number of rows to include in each chunk. If None, the chunk size is adaptive and depends on the number of columns in the query result; if -1 or 0, the entire result is returned as a single chunk; if > 0, the chunk size is the number of rows specified. Defaults to None.
See Also

get_as_pl: Get the query result as a Polars DataFrame.
get_as_df: Get the query result as a Pandas DataFrame.

Returns
  • pyarrow.Table: Query result as a PyArrow Table.
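For example (a sketch; pandas, polars, or pyarrow must be installed for the respective call):

```python
df = conn.execute("MATCH (u:User) RETURN u.name, u.age").get_as_df()     # pandas.DataFrame
pf = conn.execute("MATCH (u:User) RETURN u.name, u.age").get_as_pl()     # polars.DataFrame
tb = conn.execute("MATCH (u:User) RETURN u.name, u.age").get_as_arrow(0) # single-chunk pyarrow.Table
```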
def get_column_data_types(self) -> list[str]:
230    def get_column_data_types(self) -> list[str]:
231        """
232        Get the data types of the columns in the query result.
233
234        Returns
235        -------
236        list
237            Data types of the columns in the query result.
238
239        """
240        self.check_for_query_result_close()
241        return self._query_result.getColumnDataTypes()

Get the data types of the columns in the query result.

Returns
  • list: Data types of the columns in the query result.
def get_column_names(self) -> list[str]:
243    def get_column_names(self) -> list[str]:
244        """
245        Get the names of the columns in the query result.
246
247        Returns
248        -------
249        list
250            Names of the columns in the query result.
251
252        """
253        self.check_for_query_result_close()
254        return self._query_result.getColumnNames()

Get the names of the columns in the query result.

Returns
  • list: Names of the columns in the query result.
def get_schema(self) -> dict[str, str]:
256    def get_schema(self) -> dict[str, str]:
257        """
258        Get the column schema of the query result.
259
260        Returns
261        -------
262        dict
263            Schema of the query result.
264
265        """
266        self.check_for_query_result_close()
267        return dict(
268            zip(
269                self._query_result.getColumnNames(),
270                self._query_result.getColumnDataTypes(),
271            )
272        )

Get the column schema of the query result.

Returns
  • dict: Schema of the query result.
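For example:

```python
result = conn.execute("MATCH (u:User) RETURN u.name, u.age")
print(result.get_schema())  # e.g. {'u.name': 'STRING', 'u.age': 'INT64'}
```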
def reset_iterator(self) -> None:
274    def reset_iterator(self) -> None:
275        """Reset the iterator of the query result."""
276        self.check_for_query_result_close()
277        self._query_result.resetIterator()

Reset the iterator of the query result.

def get_as_networkx(self, directed: bool = True) -> networkx.classes.multigraph.MultiGraph | networkx.classes.multidigraph.MultiDiGraph:
279    def get_as_networkx(
280        self,
281        directed: bool = True,  # noqa: FBT001
282    ) -> nx.MultiGraph | nx.MultiDiGraph:
283        """
284        Convert the nodes and rels in the query result into a NetworkX directed or undirected graph
285        with the following rules:
286        - Columns with data types other than node or rel will be ignored.
287        - Duplicated nodes and rels will be converted only once.
288
289        Parameters
290        ----------
291        directed : bool
292            Whether the graph should be directed. Defaults to True.
293
294        Returns
295        -------
296        networkx.MultiDiGraph or networkx.MultiGraph
297            Query result as a NetworkX graph.
298
299        """
300        self.check_for_query_result_close()
301        import networkx as nx
302
303        nx_graph = nx.MultiDiGraph() if directed else nx.MultiGraph()
304        properties_to_extract = self._get_properties_to_extract()
305
306        self.reset_iterator()
307
308        nodes = {}
309        rels = {}
310        table_to_label_dict = {}
311        table_primary_key_dict = {}
312
313        def encode_node_id(node: dict[str, Any], table_primary_key_dict: dict[str, Any]) -> str:
314            node_label = node["_label"]
315            return f"{node_label}_{node[table_primary_key_dict[node_label]]!s}"
316
317        def encode_rel_id(rel: dict[str, Any]) -> tuple[int, int]:
318            return rel["_id"]["table"], rel["_id"]["offset"]
319
320        # De-duplicate nodes and rels
321        while self.has_next():
322            row = self.get_next()
323            for i in properties_to_extract:
324                # Skip empty nodes and rels, which may be returned by
325                # OPTIONAL MATCH
326                if row[i] is None or row[i] == {}:
327                    continue
328                column_type, _ = properties_to_extract[i]
329                if column_type == Type.NODE.value:
330                    nid = row[i]["_id"]
331                    nodes[nid["table"], nid["offset"]] = row[i]
332                    table_to_label_dict[nid["table"]] = row[i]["_label"]
333
334                elif column_type == Type.REL.value:
335                    rels[encode_rel_id(row[i])] = row[i]
336
337                elif column_type == Type.RECURSIVE_REL.value:
338                    for node in row[i]["_nodes"]:
339                        nid = node["_id"]
340                        nodes[nid["table"], nid["offset"]] = node
341                        table_to_label_dict[nid["table"]] = node["_label"]
342                    for rel in row[i]["_rels"]:
343                        for key in list(rel.keys()):
344                            if rel[key] is None:
345                                del rel[key]
346                        rels[encode_rel_id(rel)] = rel
347
348        # Add nodes
349        for node in nodes.values():
350            nid = node["_id"]
351            node_id = node["_label"] + "_" + str(nid["offset"])
352            if node["_label"] not in table_primary_key_dict:
353                props = self.connection._get_node_property_names(node["_label"])
354                for prop_name in props:
355                    if props[prop_name]["is_primary_key"]:
356                        table_primary_key_dict[node["_label"]] = prop_name
357                        break
358            node_id = encode_node_id(node, table_primary_key_dict)
359            node[node["_label"]] = True
360            nx_graph.add_node(node_id, **node)
361
362        # Add rels
363        for rel in rels.values():
364            src = rel["_src"]
365            dst = rel["_dst"]
366            src_node = nodes[src["table"], src["offset"]]
367            dst_node = nodes[dst["table"], dst["offset"]]
368            src_id = encode_node_id(src_node, table_primary_key_dict)
369            dst_id = encode_node_id(dst_node, table_primary_key_dict)
370            nx_graph.add_edge(src_id, dst_id, **rel)
371        return nx_graph

Convert the nodes and rels in the query result into a NetworkX directed or undirected graph with the following rules: columns with data types other than node or rel are ignored, and duplicated nodes and rels are converted only once.

Parameters
  • directed (bool): Whether the graph should be directed. Defaults to True.
Returns
  • networkx.MultiDiGraph or networkx.MultiGraph: Query result as a NetworkX graph.
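A sketch using the example schema (networkx must be installed):

```python
result = conn.execute("MATCH (u:User)-[f:Follows]->(v:User) RETURN u, f, v")
g = result.get_as_networkx(directed=True)  # networkx.MultiDiGraph
print(g.number_of_nodes(), g.number_of_edges())
```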
def get_as_torch_geometric(self) -> tuple[torch_geometric.data.data.Data | torch_geometric.data.hetero_data.HeteroData, dict, dict, dict]:
390    def get_as_torch_geometric(self) -> tuple[geo.Data | geo.HeteroData, dict, dict, dict]:  # type: ignore[type-arg]
391        """
392        Convert the nodes and rels in the query result into a PyTorch Geometric graph representation,
393        torch_geometric.data.Data or torch_geometric.data.HeteroData.
394
395        For node conversion, numerical and boolean properties are converted directly into tensors and
396        stored in Data/HeteroData. Properties that cannot be converted into tensors automatically
397        (please refer to the notes below for more detail) are returned as unconverted_properties.
398
399        For rel conversion, each rel is converted directly into an edge_index tensor. Edge properties
400        are returned as edge_properties.
401
402        Node properties that cannot be converted into tensor automatically:
403        - If the type of a node property is not one of INT64, DOUBLE, or BOOL, it cannot be converted
404          automatically.
405        - If a node property contains a null value, it cannot be converted automatically.
406        - If a node property contains a nested list of variable length (e.g. [[1,2],[3]]), it cannot be
407          converted automatically.
408        - If a node property is a list or nested list, but the shape is inconsistent (e.g. the list length
409          is 6 for one node but 5 for another node), it cannot be converted automatically.
410
411        Additional conversion rules:
412        - Columns with data type other than node or rel will be ignored.
413        - Duplicated nodes and rels will be converted only once.
414
415        Returns
416        -------
417        torch_geometric.data.Data or torch_geometric.data.HeteroData
418            Query result as a PyTorch Geometric graph. Containing numeric or boolean node properties
419            and edge_index tensor.
420
421        dict
422            A dictionary that maps the positional offset of each node in Data/HeteroData to its primary
423            key in the database.
424
425        dict
426            A dictionary containing node properties that cannot be converted into tensors automatically. The
427            order of values for each property is aligned with nodes in Data/HeteroData.
428
429        dict
430            A dictionary containing edge properties. The order of values for each property is aligned with
431            edge_index in Data/HeteroData.
432        """
433        self.check_for_query_result_close()
434        # Although we are not using torch_geometric in this file, we need to
435        # import it here to throw an error early if the user does not have
436        # torch_geometric or torch installed.
437
438        converter = TorchGeometricResultConverter(self)
439        return converter.get_as_torch_geometric()

Convert the nodes and rels in the query result into a PyTorch Geometric graph representation, torch_geometric.data.Data or torch_geometric.data.HeteroData.

For node conversion, numerical and boolean properties are converted directly into tensors and stored in Data/HeteroData. Properties that cannot be converted into tensors automatically (please refer to the notes below for more detail) are returned as unconverted_properties.

For rel conversion, each rel is converted directly into an edge_index tensor. Edge properties are returned as edge_properties.

Node properties that cannot be converted into tensor automatically:

  • If the type of a node property is not one of INT64, DOUBLE, or BOOL, it cannot be converted automatically.
  • If a node property contains a null value, it cannot be converted automatically.
  • If a node property contains a nested list of variable length (e.g. [[1,2],[3]]), it cannot be converted automatically.
  • If a node property is a list or nested list, but the shape is inconsistent (e.g. the list length is 6 for one node but 5 for another node), it cannot be converted automatically.

Additional conversion rules:

  • Columns with data type other than node or rel will be ignored.
  • Duplicated nodes and rels will be converted only once.
Returns
  • torch_geometric.data.Data or torch_geometric.data.HeteroData: Query result as a PyTorch Geometric graph. Containing numeric or boolean node properties and edge_index tensor.
  • dict: A dictionary that maps the positional offset of each node in Data/HeteroData to its primary key in the database.
  • dict: A dictionary containing node properties that cannot be converted into tensors automatically. The order of values for each property is aligned with nodes in Data/HeteroData.
  • dict: A dictionary containing edge properties. The order of values for each property is aligned with edge_index in Data/HeteroData.
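A sketch of unpacking the four return values (torch and torch_geometric must be installed):

```python
result = conn.execute("MATCH (u:User)-[f:Follows]->(v:User) RETURN u, f, v")
data, pos_to_pk, unconverted, edge_props = result.get_as_torch_geometric()
print(data)  # torch_geometric Data/HeteroData with edge_index populated
```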
def get_execution_time(self) -> int:
441    def get_execution_time(self) -> int:
442        """
443        Get the time in ms that was required to execute the query.
444
445        Returns
446        -------
447        double
448            Query execution time in ms.
449
450        """
451        self.check_for_query_result_close()
452        return self._query_result.getExecutionTime()

Get the time in ms that was required to execute the query.

Returns
  • double: Query execution time in ms.
def get_compiling_time(self) -> int:
454    def get_compiling_time(self) -> int:
455        """
456        Get the time in ms that was required to compile the query.
457
458        Returns
459        -------
460        double
461            Query compile time in ms.
462
463        """
464        self.check_for_query_result_close()
465        return self._query_result.getCompilingTime()

Get the time in ms that was required to compile the query.

Returns
  • double: Query compile time in ms.
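For example:

```python
result = conn.execute("MATCH (u:User) RETURN u.name")
print(f"compile: {result.get_compiling_time()} ms, execute: {result.get_execution_time()} ms")
```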
def get_num_tuples(self) -> int:
467    def get_num_tuples(self) -> int:
468        """
469        Get the number of tuples which the query returned.
470
471        Returns
472        -------
473        int
474            Number of tuples.
475
476        """
477        self.check_for_query_result_close()
478        return self._query_result.getNumTuples()

Get the number of tuples which the query returned.

Returns
  • int: Number of tuples.
def rows_as_dict(self, state=True) -> typing_extensions.Self:
480    def rows_as_dict(self, state=True) -> Self:
481        """
482        Change the format of the results, such that each row is a dict with the
483        column name as a key.
484
485        Parameters
486        ----------
487        state
488            Whether to turn dict formatting on or off. Turns it on by default.
489
490        Returns
491        -------
492        self
493            The object itself.
494
495        """
496        self.as_dict = state
497        if state:
498            self.columns = self.get_column_names()
499        return self

Change the format of the results, such that each row is a dict with the column name as a key.

Parameters
  • state: Whether to turn dict formatting on or off. Turns it on by default.
Returns
  • self: The object itself.
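For example:

```python
result = conn.execute("MATCH (u:User) RETURN u.name, u.age").rows_as_dict()
for row in result:
    print(row)  # e.g. {'u.name': 'Adam', 'u.age': 30}
```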
class Type(enum.Enum):
 5class Type(Enum):
 6    """The type of a value in the database."""
 7
 8    ANY = "ANY"
 9    NODE = "NODE"
10    REL = "REL"
11    RECURSIVE_REL = "RECURSIVE_REL"
12    SERIAL = "SERIAL"
13    BOOL = "BOOL"
14    INT64 = "INT64"
15    INT32 = "INT32"
16    INT16 = "INT16"
17    INT8 = "INT8"
18    UINT64 = "UINT64"
19    UINT32 = "UINT32"
20    UINT16 = "UINT16"
21    UINT8 = "UINT8"
22    INT128 = "INT128"
23    DOUBLE = "DOUBLE"
24    FLOAT = "FLOAT"
25    DATE = "DATE"
26    TIMESTAMP = "TIMESTAMP"
27    TIMSTAMP_TZ = "TIMESTAMP_TZ"
28    TIMESTAMP_NS = "TIMESTAMP_NS"
29    TIMESTAMP_MS = "TIMESTAMP_MS"
30    TIMESTAMP_SEC = "TIMESTAMP_SEC"
31    INTERVAL = "INTERVAL"
32    INTERNAL_ID = "INTERNAL_ID"
33    STRING = "STRING"
34    BLOB = "BLOB"
35    UUID = "UUID"
36    LIST = "LIST"
37    ARRAY = "ARRAY"
38    STRUCT = "STRUCT"
39    MAP = "MAP"
40    UNION = "UNION"

The type of a value in the database.

ANY = <Type.ANY: 'ANY'>
NODE = <Type.NODE: 'NODE'>
REL = <Type.REL: 'REL'>
RECURSIVE_REL = <Type.RECURSIVE_REL: 'RECURSIVE_REL'>
SERIAL = <Type.SERIAL: 'SERIAL'>
BOOL = <Type.BOOL: 'BOOL'>
INT64 = <Type.INT64: 'INT64'>
INT32 = <Type.INT32: 'INT32'>
INT16 = <Type.INT16: 'INT16'>
INT8 = <Type.INT8: 'INT8'>
UINT64 = <Type.UINT64: 'UINT64'>
UINT32 = <Type.UINT32: 'UINT32'>
UINT16 = <Type.UINT16: 'UINT16'>
UINT8 = <Type.UINT8: 'UINT8'>
INT128 = <Type.INT128: 'INT128'>
DOUBLE = <Type.DOUBLE: 'DOUBLE'>
FLOAT = <Type.FLOAT: 'FLOAT'>
DATE = <Type.DATE: 'DATE'>
TIMESTAMP = <Type.TIMESTAMP: 'TIMESTAMP'>
TIMSTAMP_TZ = <Type.TIMSTAMP_TZ: 'TIMESTAMP_TZ'>
TIMESTAMP_NS = <Type.TIMESTAMP_NS: 'TIMESTAMP_NS'>
TIMESTAMP_MS = <Type.TIMESTAMP_MS: 'TIMESTAMP_MS'>
TIMESTAMP_SEC = <Type.TIMESTAMP_SEC: 'TIMESTAMP_SEC'>
INTERVAL = <Type.INTERVAL: 'INTERVAL'>
INTERNAL_ID = <Type.INTERNAL_ID: 'INTERNAL_ID'>
STRING = <Type.STRING: 'STRING'>
BLOB = <Type.BLOB: 'BLOB'>
UUID = <Type.UUID: 'UUID'>
LIST = <Type.LIST: 'LIST'>
ARRAY = <Type.ARRAY: 'ARRAY'>
STRUCT = <Type.STRUCT: 'STRUCT'>
MAP = <Type.MAP: 'MAP'>
UNION = <Type.UNION: 'UNION'>
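The enum values are the same type-name strings returned by QueryResult.get_column_data_types(), so they can be used to filter result columns; for example:

```python
result = conn.execute("MATCH (u:User) RETURN u, u.name")
for name, dtype in result.get_schema().items():
    if dtype == kuzu.Type.NODE.value:
        print(f"{name} is a NODE column")
```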
__version__
storage_version
version