From 2a1ba3061c8f2dc02f337cd82cfcabd9d2cdba25 Mon Sep 17 00:00:00 2001 From: ColinLee Date: Wed, 14 Jan 2026 22:37:25 +0800 Subject: [PATCH 01/13] support save dataframe to tsfile. --- python/tests/test_dataframe.py | 251 +++++++++++++++++++ python/tests/test_to_tsfile.py | 345 +++++++++++++++++++++++++++ python/tests/test_write_and_read.py | 238 +----------------- python/tsfile/__init__.py | 2 +- python/tsfile/constants.py | 80 ++++++- python/tsfile/tsfile_cpp.pxd | 3 +- python/tsfile/tsfile_py_cpp.pxd | 1 + python/tsfile/tsfile_py_cpp.pyx | 184 +++++++++++++- python/tsfile/tsfile_table_writer.py | 75 +++++- python/tsfile/tsfile_writer.pyx | 30 ++- python/tsfile/utils.py | 106 ++++++++ 11 files changed, 1052 insertions(+), 263 deletions(-) create mode 100644 python/tests/test_dataframe.py create mode 100644 python/tests/test_to_tsfile.py diff --git a/python/tests/test_dataframe.py b/python/tests/test_dataframe.py new file mode 100644 index 000000000..5138968a2 --- /dev/null +++ b/python/tests/test_dataframe.py @@ -0,0 +1,251 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +# +import os + +import numpy as np +import pandas as pd +import pytest +from pandas.core.dtypes.common import is_integer_dtype + +from tsfile import ColumnSchema, TableSchema, TSDataType +from tsfile import TsFileTableWriter, ColumnCategory +from tsfile import to_dataframe +from tsfile.exceptions import ColumnNotExistError, TypeMismatchError + + +def convert_to_nullable_types(df): + """ + Convert DataFrame columns to nullable types to match returned DataFrame from to_dataframe. + This handles the fact that returned DataFrames use nullable types (Int64, Float64, etc.) + to support Null values. + """ + df = df.copy() + for col in df.columns: + dtype = df[col].dtype + if dtype == 'int64': + df[col] = df[col].astype('Int64') + elif dtype == 'int32': + df[col] = df[col].astype('Int32') + elif dtype == 'float64': + df[col] = df[col].astype('Float64') + elif dtype == 'float32': + df[col] = df[col].astype('Float32') + elif dtype == 'bool': + df[col] = df[col].astype('boolean') + return df + + +def test_write_dataframe_basic(): + table = TableSchema("test_table", + [ColumnSchema("device", TSDataType.STRING, ColumnCategory.TAG), + ColumnSchema("value", TSDataType.DOUBLE, ColumnCategory.FIELD), + ColumnSchema("value2", TSDataType.INT64, ColumnCategory.FIELD)]) + tsfile_path = "test_write_dataframe_basic.tsfile" + try: + if os.path.exists(tsfile_path): + os.remove(tsfile_path) + + with TsFileTableWriter(tsfile_path, table) as writer: + df = pd.DataFrame({ + 'time': [i for i in range(100)], + 'device': [f"device{i}" for i in range(100)], + 'value': [i * 1.5 for i in range(100)], + 'value2': [i * 10 for i in range(100)] + }) + writer.write_dataframe(df) + + df_read = to_dataframe(tsfile_path, table_name="test_table") + df_read = df_read.sort_values('time').reset_index(drop=True) + df_sorted = convert_to_nullable_types(df.sort_values('time').reset_index(drop=True)) + assert df_read.shape == (100, 4) + assert df_read["time"].equals(df_sorted["time"]) + assert df_read["device"].equals(df_sorted["device"]) + assert df_read["value"].equals(df_sorted["value"]) + assert df_read["value2"].equals(df_sorted["value2"]) + finally: + if os.path.exists(tsfile_path): + os.remove(tsfile_path) + + +def test_write_dataframe_with_index(): + table = TableSchema("test_table", + [ColumnSchema("device", TSDataType.STRING, ColumnCategory.TAG), + ColumnSchema("value", TSDataType.DOUBLE, ColumnCategory.FIELD)]) + tsfile_path = "test_write_dataframe_index.tsfile" + try: + if os.path.exists(tsfile_path): + os.remove(tsfile_path) + + with TsFileTableWriter(tsfile_path, table) as writer: + df = pd.DataFrame({ + 'device': [f"device{i}" for i in range(50)], + 'value': [i * 2.5 for i in range(50)] + }) + df.index = [i * 10 for i in range(50)] # Set index as timestamps + writer.write_dataframe(df) + df_read = to_dataframe(tsfile_path, table_name="test_table") + df_read = df_read.sort_values('time').reset_index(drop=True) + df_sorted = df.sort_index() + df_sorted = convert_to_nullable_types(df_sorted.reset_index(drop=True)) + time_series = pd.Series(df.sort_index().index.values, dtype='Int64') + assert df_read.shape == (50, 3) + assert df_read["time"].equals(time_series) + assert df_read["device"].equals(df_sorted["device"]) + assert df_read["value"].equals(df_sorted["value"]) + finally: + if os.path.exists(tsfile_path): + os.remove(tsfile_path) + + +def test_write_dataframe_case_insensitive(): + table = TableSchema("test_table", + [ColumnSchema("device", TSDataType.STRING, ColumnCategory.TAG), + ColumnSchema("value", TSDataType.DOUBLE, ColumnCategory.FIELD)]) + tsfile_path = "test_write_dataframe_case.tsfile" + try: + if os.path.exists(tsfile_path): + os.remove(tsfile_path) + + with TsFileTableWriter(tsfile_path, table) as writer: + df = pd.DataFrame({ + 'Time': [i for i in range(30)], # Capital T + 'Device': [f"device{i}" for i in range(30)], # Capital D + 'VALUE': [i * 3.0 for i in range(30)] # All caps + }) + writer.write_dataframe(df) + + df_read = to_dataframe(tsfile_path, table_name="test_table") + df_read = df_read.sort_values('time').reset_index(drop=True) + df_sorted = convert_to_nullable_types(df.sort_values('Time').reset_index(drop=True)) + assert df_read.shape == (30, 3) + assert df_read["time"].equals(df_sorted["Time"]) + assert df_read["device"].equals(df_sorted["Device"]) + assert df_read["value"].equals(df_sorted["VALUE"]) + finally: + if os.path.exists(tsfile_path): + os.remove(tsfile_path) + + +def test_write_dataframe_column_not_in_schema(): + table = TableSchema("test_table", + [ColumnSchema("device", TSDataType.STRING, ColumnCategory.TAG), + ColumnSchema("value", TSDataType.DOUBLE, ColumnCategory.FIELD)]) + tsfile_path = "test_write_dataframe_extra_col.tsfile" + try: + if os.path.exists(tsfile_path): + os.remove(tsfile_path) + + with TsFileTableWriter(tsfile_path, table) as writer: + df = pd.DataFrame({ + 'time': [i for i in range(10)], + 'device': [f"device{i}" for i in range(10)], + 'value': [i * 1.0 for i in range(10)], + 'extra_column': [i for i in range(10)] # Not in schema + }) + with pytest.raises(ColumnNotExistError) as exc_info: + writer.write_dataframe(df) + assert "extra_column" in str(exc_info.value) + finally: + if os.path.exists(tsfile_path): + os.remove(tsfile_path) + + +def test_write_dataframe_type_mismatch(): + table = TableSchema("test_table", + [ColumnSchema("value", TSDataType.STRING, ColumnCategory.FIELD)]) + tsfile_path = "test_write_dataframe_type_mismatch.tsfile" + try: + if os.path.exists(tsfile_path): + os.remove(tsfile_path) + + with TsFileTableWriter(tsfile_path, table) as writer: + df = pd.DataFrame({ + 'time': [i for i in range(10)], + 'value': [i for i in range(10)] # INT64, but schema expects STRING + }) + with pytest.raises(TypeMismatchError) as exc_info: + writer.write_dataframe(df) + assert "Type mismatches" in str(exc_info.value) + finally: + if os.path.exists(tsfile_path): + os.remove(tsfile_path) + + +def test_write_dataframe_all_datatypes(): + table = TableSchema("test_table", + [ColumnSchema("bool_col", TSDataType.BOOLEAN, ColumnCategory.FIELD), + ColumnSchema("int32_col", TSDataType.INT32, ColumnCategory.FIELD), + ColumnSchema("int64_col", TSDataType.INT64, ColumnCategory.FIELD), + ColumnSchema("float_col", TSDataType.FLOAT, ColumnCategory.FIELD), + ColumnSchema("double_col", TSDataType.DOUBLE, ColumnCategory.FIELD), + ColumnSchema("string_col", TSDataType.STRING, ColumnCategory.FIELD), + ColumnSchema("blob_col", TSDataType.BLOB, ColumnCategory.FIELD)]) + tsfile_path = "test_write_dataframe_all_types.tsfile" + try: + if os.path.exists(tsfile_path): + os.remove(tsfile_path) + + with TsFileTableWriter(tsfile_path, table) as writer: + df = pd.DataFrame({ + 'time': [i for i in range(50)], + 'bool_col': [i % 2 == 0 for i in range(50)], + 'int32_col': pd.Series([i for i in range(50)], dtype='int32'), + 'int64_col': [i * 10 for i in range(50)], + 'float_col': pd.Series([i * 1.5 for i in range(50)], dtype='float32'), + 'double_col': [i * 2.5 for i in range(50)], + 'string_col': [f"str{i}" for i in range(50)], + 'blob_col': [f"blob{i}".encode('utf-8') for i in range(50)] + }) + writer.write_dataframe(df) + + df_read = to_dataframe(tsfile_path, table_name="test_table") + df_read = df_read.sort_values('time').reset_index(drop=True) + df_sorted = convert_to_nullable_types(df.sort_values('time').reset_index(drop=True)) + assert df_read.shape == (50, 8) + assert df_read["bool_col"].equals(df_sorted["bool_col"]) + assert df_read["int32_col"].equals(df_sorted["int32_col"]) + assert df_read["int64_col"].equals(df_sorted["int64_col"]) + assert np.allclose(df_read["float_col"], df_sorted["float_col"]) + assert np.allclose(df_read["double_col"], df_sorted["double_col"]) + assert df_read["string_col"].equals(df_sorted["string_col"]) + for i in range(50): + assert df_read["blob_col"].iloc[i] == df_sorted["blob_col"].iloc[i] + finally: + if os.path.exists(tsfile_path): + os.remove(tsfile_path) + + +def test_write_dataframe_empty(): + table = TableSchema("test_table", + [ColumnSchema("value", TSDataType.DOUBLE, ColumnCategory.FIELD)]) + tsfile_path = "test_write_dataframe_empty.tsfile" + try: + if os.path.exists(tsfile_path): + os.remove(tsfile_path) + + with TsFileTableWriter(tsfile_path, table) as writer: + df = pd.DataFrame({ + 'time': [], + 'value': [] + }) + with pytest.raises(ValueError) as err: + writer.write_dataframe(df) + + finally: + if os.path.exists(tsfile_path): + os.remove(tsfile_path) diff --git a/python/tests/test_to_tsfile.py b/python/tests/test_to_tsfile.py new file mode 100644 index 000000000..0928f1a94 --- /dev/null +++ b/python/tests/test_to_tsfile.py @@ -0,0 +1,345 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +# +import os + +import numpy as np +import pandas as pd +import pytest + +from tsfile import to_dataframe +from tsfile.utils import dataframe_to_tsfile + + +def convert_to_nullable_types(df): + df = df.copy() + for col in df.columns: + dtype = df[col].dtype + if dtype == 'int64': + df[col] = df[col].astype('Int64') + elif dtype == 'int32': + df[col] = df[col].astype('Int32') + elif dtype == 'float64': + df[col] = df[col].astype('Float64') + elif dtype == 'float32': + df[col] = df[col].astype('Float32') + elif dtype == 'bool': + df[col] = df[col].astype('boolean') + return df + + +def test_dataframe_to_tsfile_basic(): + tsfile_path = "test_dataframe_to_tsfile_basic.tsfile" + try: + if os.path.exists(tsfile_path): + os.remove(tsfile_path) + + df = pd.DataFrame({ + 'time': [i for i in range(100)], + 'device': [f"device{i}" for i in range(100)], + 'value': [i * 1.5 for i in range(100)], + 'value2': [i * 10 for i in range(100)] + }) + + dataframe_to_tsfile(df, tsfile_path, table_name="test_table") + + df_read = to_dataframe(tsfile_path, table_name="test_table") + df_read = df_read.sort_values('time').reset_index(drop=True) + df_sorted = convert_to_nullable_types(df.sort_values('time').reset_index(drop=True)) + + assert df_read.shape == (100, 4) + assert df_read["time"].equals(df_sorted["time"]) + assert df_read["device"].equals(df_sorted["device"]) + assert df_read["value"].equals(df_sorted["value"]) + assert df_read["value2"].equals(df_sorted["value2"]) + finally: + if os.path.exists(tsfile_path): + os.remove(tsfile_path) + + +def test_dataframe_to_tsfile_with_index(): + tsfile_path = "test_dataframe_to_tsfile_index.tsfile" + try: + if os.path.exists(tsfile_path): + os.remove(tsfile_path) + + df = pd.DataFrame({ + 'device': [f"device{i}" for i in range(50)], + 'value': [i * 2.5 for i in range(50)] + }) + df.index = [i * 10 for i in range(50)] + + dataframe_to_tsfile(df, tsfile_path, table_name="test_table") + + df_read = to_dataframe(tsfile_path, table_name="test_table") + df_read = df_read.sort_values('time').reset_index(drop=True) + df_sorted = df.sort_index() + df_sorted = convert_to_nullable_types(df_sorted.reset_index(drop=True)) + time_series = pd.Series(df.sort_index().index.values, dtype='Int64') + + assert df_read.shape == (50, 3) + assert df_read["time"].equals(time_series) + assert df_read["device"].equals(df_sorted["device"]) + assert df_read["value"].equals(df_sorted["value"]) + finally: + if os.path.exists(tsfile_path): + os.remove(tsfile_path) + + +def test_dataframe_to_tsfile_custom_time_column(): + tsfile_path = "test_dataframe_to_tsfile_custom_time.tsfile" + try: + if os.path.exists(tsfile_path): + os.remove(tsfile_path) + + df = pd.DataFrame({ + 'timestamp': [i for i in range(30)], + 'device': [f"device{i}" for i in range(30)], + 'value': [i * 3.0 for i in range(30)] + }) + + dataframe_to_tsfile(df, tsfile_path, table_name="test_table", time_column="timestamp") + + df_read = to_dataframe(tsfile_path, table_name="test_table") + df_read = df_read.sort_values('time').reset_index(drop=True) + df_sorted = convert_to_nullable_types(df.sort_values('timestamp').reset_index(drop=True)) + + assert df_read.shape == (30, 3) + assert df_read["time"].equals(df_sorted["timestamp"]) + assert df_read["device"].equals(df_sorted["device"]) + assert df_read["value"].equals(df_sorted["value"]) + finally: + if os.path.exists(tsfile_path): + os.remove(tsfile_path) + + +def test_dataframe_to_tsfile_with_tag_columns(): + tsfile_path = "test_dataframe_to_tsfile_tags.tsfile" + try: + if os.path.exists(tsfile_path): + os.remove(tsfile_path) + + df = pd.DataFrame({ + 'time': [i for i in range(20)], + 'device': [f"device{i}" for i in range(20)], + 'location': [f"loc{i % 5}" for i in range(20)], + 'value': [i * 1.5 for i in range(20)] + }) + + dataframe_to_tsfile(df, tsfile_path, table_name="test_table", tag_column=["device", "location"]) + + df_read = to_dataframe(tsfile_path, table_name="test_table") + df_read = df_read.sort_values('time').reset_index(drop=True) + df_sorted = convert_to_nullable_types(df.sort_values('time').reset_index(drop=True)) + + assert df_read.shape == (20, 4) + assert df_read["device"].equals(df_sorted["device"]) + assert df_read["location"].equals(df_sorted["location"]) + assert df_read["value"].equals(df_sorted["value"]) + finally: + if os.path.exists(tsfile_path): + os.remove(tsfile_path) + + +def test_dataframe_to_tsfile_all_datatypes(): + tsfile_path = "test_dataframe_to_tsfile_all_types.tsfile" + try: + if os.path.exists(tsfile_path): + os.remove(tsfile_path) + + df = pd.DataFrame({ + 'time': [i for i in range(50)], + 'bool_col': [i % 2 == 0 for i in range(50)], + 'int32_col': pd.Series([i for i in range(50)], dtype='int32'), + 'int64_col': [i * 10 for i in range(50)], + 'float_col': pd.Series([i * 1.5 for i in range(50)], dtype='float32'), + 'double_col': [i * 2.5 for i in range(50)], + 'string_col': [f"str{i}" for i in range(50)], + 'blob_col': [f"blob{i}".encode('utf-8') for i in range(50)] + }) + + dataframe_to_tsfile(df, tsfile_path, table_name="test_table") + + df_read = to_dataframe(tsfile_path, table_name="test_table") + df_read = df_read.sort_values('time').reset_index(drop=True) + df_sorted = convert_to_nullable_types(df.sort_values('time').reset_index(drop=True)) + + assert df_read.shape == (50, 8) + assert df_read["bool_col"].equals(df_sorted["bool_col"]) + assert df_read["int32_col"].equals(df_sorted["int32_col"]) + assert df_read["int64_col"].equals(df_sorted["int64_col"]) + assert np.allclose(df_read["float_col"], df_sorted["float_col"]) + assert np.allclose(df_read["double_col"], df_sorted["double_col"]) + assert df_read["string_col"].equals(df_sorted["string_col"]) + for i in range(50): + assert df_read["blob_col"].iloc[i] == df_sorted["blob_col"].iloc[i] + finally: + if os.path.exists(tsfile_path): + os.remove(tsfile_path) + + +def test_dataframe_to_tsfile_default_table_name(): + tsfile_path = "test_dataframe_to_tsfile_default_name.tsfile" + try: + if os.path.exists(tsfile_path): + os.remove(tsfile_path) + + df = pd.DataFrame({ + 'time': [i for i in range(10)], + 'value': [i * 1.0 for i in range(10)] + }) + + dataframe_to_tsfile(df, tsfile_path) + + df_read = to_dataframe(tsfile_path, table_name="table") + assert df_read.shape == (10, 2) + finally: + if os.path.exists(tsfile_path): + os.remove(tsfile_path) + + +def test_dataframe_to_tsfile_case_insensitive_time(): + tsfile_path = "test_dataframe_to_tsfile_case_time.tsfile" + try: + if os.path.exists(tsfile_path): + os.remove(tsfile_path) + + df = pd.DataFrame({ + 'Time': [i for i in range(20)], + 'value': [i * 2.0 for i in range(20)] + }) + + dataframe_to_tsfile(df, tsfile_path, table_name="test_table") + + df_read = to_dataframe(tsfile_path, table_name="test_table") + assert df_read.shape == (20, 2) + assert df_read["time"].equals(pd.Series([i for i in range(20)], dtype='Int64')) + finally: + if os.path.exists(tsfile_path): + os.remove(tsfile_path) + + +def test_dataframe_to_tsfile_empty_dataframe(): + tsfile_path = "test_dataframe_to_tsfile_empty.tsfile" + try: + if os.path.exists(tsfile_path): + os.remove(tsfile_path) + + df = pd.DataFrame() + + with pytest.raises(ValueError, match="DataFrame cannot be None or empty"): + dataframe_to_tsfile(df, tsfile_path) + finally: + if os.path.exists(tsfile_path): + os.remove(tsfile_path) + + +def test_dataframe_to_tsfile_no_data_columns(): + tsfile_path = "test_dataframe_to_tsfile_no_data.tsfile" + try: + if os.path.exists(tsfile_path): + os.remove(tsfile_path) + + df = pd.DataFrame({ + 'time': [i for i in range(10)] + }) + + with pytest.raises(ValueError, match="DataFrame must have at least one data column"): + dataframe_to_tsfile(df, tsfile_path) + finally: + if os.path.exists(tsfile_path): + os.remove(tsfile_path) + + +def test_dataframe_to_tsfile_invalid_time_column(): + tsfile_path = "test_dataframe_to_tsfile_invalid_time.tsfile" + try: + if os.path.exists(tsfile_path): + os.remove(tsfile_path) + + df = pd.DataFrame({ + 'timestamp': [i for i in range(10)], + 'value': [i * 1.0 for i in range(10)] + }) + + with pytest.raises(ValueError, match="Time column 'time' not found"): + dataframe_to_tsfile(df, tsfile_path, time_column="time") + finally: + if os.path.exists(tsfile_path): + os.remove(tsfile_path) + + +def test_dataframe_to_tsfile_non_integer_time_column(): + tsfile_path = "test_dataframe_to_tsfile_non_int_time.tsfile" + try: + if os.path.exists(tsfile_path): + os.remove(tsfile_path) + + df = pd.DataFrame({ + 'time': [f"time{i}" for i in range(10)], + 'value': [i * 1.0 for i in range(10)] + }) + + with pytest.raises(TypeError, match="must be integer type"): + dataframe_to_tsfile(df, tsfile_path) + finally: + if os.path.exists(tsfile_path): + os.remove(tsfile_path) + + +def test_dataframe_to_tsfile_invalid_tag_column(): + tsfile_path = "test_dataframe_to_tsfile_invalid_tag.tsfile" + try: + if os.path.exists(tsfile_path): + os.remove(tsfile_path) + + df = pd.DataFrame({ + 'time': [i for i in range(10)], + 'value': [i * 1.0 for i in range(10)] + }) + + with pytest.raises(ValueError, match="Tag column 'invalid' not found"): + dataframe_to_tsfile(df, tsfile_path, tag_column=["invalid"]) + finally: + if os.path.exists(tsfile_path): + os.remove(tsfile_path) + + +def test_dataframe_to_tsfile_string_vs_blob(): + tsfile_path = "test_dataframe_to_tsfile_string_blob.tsfile" + try: + if os.path.exists(tsfile_path): + os.remove(tsfile_path) + + df = pd.DataFrame({ + 'time': [i for i in range(20)], + 'string_col': [f"str{i}" for i in range(20)], + 'blob_col': [f"blob{i}".encode('utf-8') for i in range(20)] + }) + + dataframe_to_tsfile(df, tsfile_path, table_name="test_table") + + df_read = to_dataframe(tsfile_path, table_name="test_table") + df_read = df_read.sort_values('time').reset_index(drop=True) + df_sorted = convert_to_nullable_types(df.sort_values('time').reset_index(drop=True)) + + assert df_read["string_col"].equals(df_sorted["string_col"]) + for i in range(20): + assert df_read["blob_col"].iloc[i] == df_sorted["blob_col"].iloc[i] + finally: + if os.path.exists(tsfile_path): + os.remove(tsfile_path) diff --git a/python/tests/test_write_and_read.py b/python/tests/test_write_and_read.py index b327e2d3d..1ffc22b99 100644 --- a/python/tests/test_write_and_read.py +++ b/python/tests/test_write_and_read.py @@ -16,11 +16,13 @@ # under the License. # +import os from datetime import date import numpy as np import pandas as pd import pytest +from pandas import Float64Dtype from pandas.core.dtypes.common import is_integer_dtype from tsfile import ColumnSchema, TableSchema, TSEncoding @@ -31,7 +33,7 @@ from tsfile import TsFileTableWriter from tsfile import TsFileWriter, TsFileReader, ColumnCategory from tsfile import to_dataframe -from tsfile.exceptions import TableNotExistError, ColumnNotExistError, NotSupportedError +from tsfile.exceptions import TableNotExistError, ColumnNotExistError, NotSupportedError, TypeMismatchError def test_row_record_write_and_read(): @@ -544,7 +546,7 @@ def test_tsfile_to_df(): assert df1.shape == (4097, 4) assert df1["value2"].sum() == 100 * (1 + 4096) / 2 * 4096 assert is_integer_dtype(df1["time"]) - assert df1["value"].dtype == np.float64 + assert df1["value"].dtype == Float64Dtype() assert is_integer_dtype(df1["value2"]) df2 = to_dataframe("table_write_to_df.tsfile", column_names=["device", "value2"]) assert df2.shape == (4097, 3) @@ -755,237 +757,9 @@ def test_tree_all_datatype_query_to_dataframe_variants(): pass finally: - if os.path.exists(tsfile_path): - os.remove(tsfile_path) - - -def test_table_all_datatype_query_to_dataframe_variants(): - tsfile_path = "test_table.tsfile" - table = TableSchema( - "test_table", - [ - ColumnSchema("Device1", TSDataType.STRING, ColumnCategory.TAG), - ColumnSchema("Device2", TSDataType.STRING, ColumnCategory.TAG), - ColumnSchema("Value1", TSDataType.BOOLEAN, ColumnCategory.FIELD), - ColumnSchema("Value2", TSDataType.INT32, ColumnCategory.FIELD), - ColumnSchema("Value3", TSDataType.INT64, ColumnCategory.FIELD), - ColumnSchema("Value4", TSDataType.FLOAT, ColumnCategory.FIELD), - ColumnSchema("Value5", TSDataType.DOUBLE, ColumnCategory.FIELD), - ColumnSchema("Value6", TSDataType.TEXT, ColumnCategory.FIELD), - ColumnSchema("Value7", TSDataType.STRING, ColumnCategory.FIELD), - ColumnSchema("Value8", TSDataType.BLOB, ColumnCategory.FIELD), - ColumnSchema("Value9", TSDataType.TIMESTAMP, ColumnCategory.FIELD), - ColumnSchema("Value10", TSDataType.DATE, ColumnCategory.FIELD), - ], - ) - dateSet = set() - try: - if os.path.exists(tsfile_path): - os.remove(tsfile_path) - max_row_num = 100 - with TsFileTableWriter(tsfile_path, table) as writer: - tablet = Tablet( - [ - "Device1", - "Device2", - "Value1", - "Value2", - "Value3", - "Value4", - "Value5", - "Value6", - "Value7", - "Value8", - "Value9", - "Value10", - ], - [ - TSDataType.STRING, - TSDataType.STRING, - TSDataType.BOOLEAN, - TSDataType.INT32, - TSDataType.INT64, - TSDataType.FLOAT, - TSDataType.DOUBLE, - TSDataType.TEXT, - TSDataType.STRING, - TSDataType.BLOB, - TSDataType.TIMESTAMP, - TSDataType.DATE, - ], - max_row_num, - ) - for i in range(max_row_num): - tablet.add_timestamp(i, i) - tablet.add_value_by_name("Device1", i, "d1_" + str(i)) - tablet.add_value_by_name("Device2", i, "d2_" + str(i)) - tablet.add_value_by_name("Value1", i, i % 2 == 0) - tablet.add_value_by_name("Value2", i, i * 3) - tablet.add_value_by_name("Value3", i, i * 4) - tablet.add_value_by_name("Value4", i, i * 5.5) - tablet.add_value_by_name("Value5", i, i * 6.6) - tablet.add_value_by_name("Value6", i, f"string_value_{i}") - tablet.add_value_by_name("Value7", i, f"text_value_{i}") - tablet.add_value_by_name("Value8", i, f"blob_data_{i}".encode('utf-8')) - tablet.add_value_by_name("Value9", i, i * 9) - tablet.add_value_by_name("Value10", i, date(2025, 1, i % 20 + 1)) - dateSet.add(date(2025, 1, i % 20 + 1)) - writer.write_table(tablet) - - df1_1 = to_dataframe(tsfile_path) - assert df1_1.shape[0] == max_row_num - for i in range(max_row_num): - assert df1_1.iloc[i, 1] == "d1_" + str(df1_1.iloc[i, 0]) - assert df1_1.iloc[i, 2] == "d2_" + str(df1_1.iloc[i, 0]) - - df2_1 = to_dataframe(tsfile_path, column_names=["Value1"]) - for i in range(max_row_num): - assert df2_1.iloc[i, 1] == np.bool_(df2_1.iloc[i, 0] % 2 == 0) - df2_2 = to_dataframe(tsfile_path, column_names=["Value2"]) - for i in range(max_row_num): - assert df2_2.iloc[i, 1] == np.int32(df2_2.iloc[i, 0] * 3) - df2_3 = to_dataframe(tsfile_path, column_names=["Value3"]) - for i in range(max_row_num): - assert df2_3.iloc[i, 1] == np.int64(df2_3.iloc[i, 0] * 4) - df2_4 = to_dataframe(tsfile_path, column_names=["Value4"]) - for i in range(max_row_num): - assert df2_4.iloc[i, 1] == np.float32(df2_4.iloc[i, 0] * 5.5) - df2_5 = to_dataframe(tsfile_path, column_names=["Value5"]) - for i in range(max_row_num): - assert df2_5.iloc[i, 1] == np.float64(df2_5.iloc[i, 0] * 6.6) - df2_6 = to_dataframe(tsfile_path, column_names=["Value6"]) - for i in range(max_row_num): - assert df2_6.iloc[i, 1] == f"string_value_{df2_6.iloc[i, 0]}" - df2_7 = to_dataframe(tsfile_path, column_names=["Value7"]) - for i in range(max_row_num): - assert df2_7.iloc[i, 1] == f"text_value_{df2_7.iloc[i, 0]}" - df2_8 = to_dataframe(tsfile_path, column_names=["Value8"]) - for i in range(max_row_num): - assert df2_8.iloc[i, 1] == f"blob_data_{df2_8.iloc[i, 0]}".encode('utf-8') - df2_9 = to_dataframe(tsfile_path, column_names=["Value9"]) - for i in range(max_row_num): - assert df2_9.iloc[i, 1] == np.int64(df2_9.iloc[i, 0] * 9) - df2_10 = to_dataframe(tsfile_path, column_names=["Value10"]) - for i in range(max_row_num): - assert df2_10.iloc[i, 1] in dateSet - df2_11 = to_dataframe(tsfile_path, column_names=["Device1", "Value1"]) - for i in range(max_row_num): - assert df2_11.iloc[i, 1] == "d1_" + str(df2_11.iloc[i, 0]) - assert df2_11.iloc[i, 2] == np.bool_(df2_11.iloc[i, 0] % 2 == 0) - df2_12 = to_dataframe( - tsfile_path, - column_names=[ - "Device1", - "Device2", - "Value1", - "Value2", - "Value3", - "Value4", - "Value5", - "Value6", - "Value7", - "Value8", - "Value9", - "Value10", - ], - ) - for i in range(max_row_num): - assert df2_12.iloc[i, 1] == "d1_" + str(df2_12.iloc[i, 0]) - assert df2_12.iloc[i, 2] == "d2_" + str(df2_12.iloc[i, 0]) - assert df2_12.iloc[i, 3] == np.bool_(df2_12.iloc[i, 0] % 2 == 0) - assert df2_12.iloc[i, 4] == np.int32(df2_12.iloc[i, 0] * 3) - assert df2_12.iloc[i, 5] == np.int64(df2_12.iloc[i, 0] * 4) - assert df2_12.iloc[i, 6] == np.float32(df2_12.iloc[i, 0] * 5.5) - assert df2_12.iloc[i, 7] == np.float64(df2_12.iloc[i, 0] * 6.6) - assert df2_12.iloc[i, 8] == f"string_value_{df2_12.iloc[i, 0]}" - assert df2_12.iloc[i, 9] == f"text_value_{df2_12.iloc[i, 0]}" - assert df2_12.iloc[i, 10] == f"blob_data_{df2_12.iloc[i, 0]}".encode( - "utf-8" - ) - assert df2_12.iloc[i, 11] == np.int64(df2_12.iloc[i, 0] * 9) - assert df2_12.iloc[i, 12] == date(2025, 1, df2_12.iloc[i, 0] % 20 + 1) - df2_13 = to_dataframe( - tsfile_path, column_names=["Device1", "Device2", "Value1"] - ) - for i in range(max_row_num): - assert df2_13.iloc[i, 1] == "d1_" + str(df2_13.iloc[i, 0]) - assert df2_13.iloc[i, 2] == "d2_" + str(df2_13.iloc[i, 0]) - assert df2_13.iloc[i, 3] == np.bool_(df2_13.iloc[i, 0] % 2 == 0) - - df3_1 = to_dataframe(tsfile_path, table_name="test_table") - assert df3_1.shape[0] == max_row_num - assert df3_1.iloc[0, 0] == 0 - df3_2 = to_dataframe(tsfile_path, table_name="TEST_TABLE") - assert df3_2.shape[0] == max_row_num - assert df3_2.iloc[0, 0] == 0 - - df4_1 = to_dataframe(tsfile_path, start_time=10) - assert df4_1.shape[0] == 90 - df4_2 = to_dataframe(tsfile_path, start_time=-10) - assert df4_2.shape[0] == max_row_num - df4_3 = to_dataframe(tsfile_path, end_time=5) - assert df4_3.shape[0] == 6 - df4_4 = to_dataframe(tsfile_path, end_time=-5) - assert df4_4.shape[0] == 0 - df4_5 = to_dataframe(tsfile_path, start_time=5, end_time=5) - assert df4_5.shape[0] == 1 - df4_6 = to_dataframe(tsfile_path, start_time=-5, end_time=-5) - assert df4_6.shape[0] == 0 - df4_7 = to_dataframe(tsfile_path, start_time=10, end_time=-10) - assert df4_7.shape[0] == 0 - df4_8 = to_dataframe(tsfile_path, start_time=-10, end_time=10) - assert df4_8.shape[0] == 11 - df4_8 = to_dataframe(tsfile_path, start_time=-50, end_time=50) - assert df4_8.shape[0] == 51 - - df5_1 = to_dataframe(tsfile_path, max_row_num=1) - assert df5_1.shape[0] == 1 - df5_2 = to_dataframe(tsfile_path, max_row_num=50) - assert df5_2.shape[0] == 50 - df5_3 = to_dataframe(tsfile_path, max_row_num=100) - assert df5_3.shape[0] == 100 - df5_4 = to_dataframe(tsfile_path, max_row_num=1000) - assert df5_4.shape[0] == 100 - df5_5 = to_dataframe(tsfile_path, max_row_num=0) - assert df5_5.shape[0] == 0 - df5_6 = to_dataframe(tsfile_path, max_row_num=-10) - assert df5_6.shape[0] == 0 - - for df6_1 in to_dataframe(tsfile_path, max_row_num=20, as_iterator=True): - assert df6_1.shape[0] == 20 - for df6_2 in to_dataframe(tsfile_path, max_row_num=1000, as_iterator=True): - assert df6_2.shape[0] == 100 - - for df7_1 in to_dataframe( - tsfile_path, - table_name="test_table", - column_names=["Device1", "Value1"], - start_time=21, - end_time=50, - max_row_num=10, - as_iterator=True, - ): - assert df7_1.shape[0] == 10 - for i in range(30): - assert df2_11.iloc[i, 1] == "d1_" + str(df2_11.iloc[i, 0]) - assert df2_11.iloc[i, 2] == np.bool_(df2_11.iloc[i, 0] % 2 == 0) - - try: - to_dataframe(tsfile_path, table_name="non_existent_table") - except TableNotExistError as e: - assert e.args[0] == "[non_existent_table] Requested table does not exist" - - try: - to_dataframe(tsfile_path, column_names=["non_existent_column"]) - except ColumnNotExistError as e: - assert e.args[0] == "[non_existent_column] Column does not exist" - - finally: - if os.path.exists(tsfile_path): - os.remove(tsfile_path) - + if os.path.exists("tablet_write_and_read.tsfile"): + os.remove("tablet_write_and_read.tsfile") -import os if __name__ == "__main__": os.chdir(os.path.dirname(os.path.abspath(__file__))) diff --git a/python/tsfile/__init__.py b/python/tsfile/__init__.py index bf755fcef..a9237257b 100644 --- a/python/tsfile/__init__.py +++ b/python/tsfile/__init__.py @@ -34,4 +34,4 @@ from .tsfile_writer import TsFileWriterPy as TsFileWriter from .tsfile_py_cpp import get_tsfile_config, set_tsfile_config from .tsfile_table_writer import TsFileTableWriter -from .utils import to_dataframe \ No newline at end of file +from .utils import to_dataframe, dataframe_to_tsfile \ No newline at end of file diff --git a/python/tsfile/constants.py b/python/tsfile/constants.py index 7d1f5ff5c..d4f87200d 100644 --- a/python/tsfile/constants.py +++ b/python/tsfile/constants.py @@ -16,6 +16,7 @@ # under the License. # from enum import unique, IntEnum +import numpy as np @unique @@ -62,13 +63,13 @@ def to_pandas_dtype(self): elif self == TSDataType.INT64: return "Int64" elif self == TSDataType.FLOAT: - return "float32" + return "Float32" elif self == TSDataType.DOUBLE: - return "float64" + return "Float64" elif self == TSDataType.TEXT or self == TSDataType.STRING: return "object" elif self == TSDataType.TIMESTAMP: - return "int64" + return "Int64" elif self == TSDataType.DATE: return "object" elif self == TSDataType.BLOB: @@ -76,6 +77,79 @@ def to_pandas_dtype(self): else: raise ValueError(f"Unknown data type: {self}") + @classmethod + def from_pandas_datatype(cls, dtype): + if dtype is np.bool_: + return cls.BOOLEAN + elif dtype is np.int32: + return cls.INT32 + elif dtype is np.int64: + return cls.INT64 + elif dtype is np.float32: + return cls.FLOAT + elif dtype is np.float64: + return cls.DOUBLE + elif dtype is np.object_: + return cls.STRING + + try: + import pandas as pd + if hasattr(pd, 'StringDtype') and isinstance(dtype, pd.StringDtype): + return cls.STRING + except (ImportError, AttributeError): + pass + + if hasattr(dtype, 'type'): + dtype = dtype.type + if dtype is np.bool_: + return cls.BOOLEAN + elif dtype is np.int32: + return cls.INT32 + elif dtype is np.int64: + return cls.INT64 + elif dtype is np.float32: + return cls.FLOAT + elif dtype is np.float64: + return cls.DOUBLE + elif dtype is np.object_: + return cls.STRING + + dtype_str = str(dtype) + + if 'stringdtype' in dtype_str.lower() or dtype_str.startswith('string'): + return cls.STRING + + dtype_map = { + 'bool': cls.BOOLEAN, + 'boolean': cls.BOOLEAN, + 'int32': cls.INT32, + 'Int32': cls.INT32, + 'int64': cls.INT64, + 'Int64': cls.INT64, + 'float32': cls.FLOAT, + 'float64': cls.DOUBLE, + 'bytes': cls.BLOB, + 'object': cls.STRING, + 'string': cls.STRING, + } + + if dtype_str in dtype_map: + return dtype_map[dtype_str] + + dtype_lower = dtype_str.lower() + if dtype_lower in dtype_map: + return dtype_map[dtype_lower] + + if 'object_' in dtype_lower or dtype_str == "": + return cls.STRING + + if dtype_str.startswith('datetime64'): + return cls.TIMESTAMP + + return cls.STRING + + + @unique class TSEncoding(IntEnum): diff --git a/python/tsfile/tsfile_cpp.pxd b/python/tsfile/tsfile_cpp.pxd index 40bff4eba..ab915fefe 100644 --- a/python/tsfile/tsfile_cpp.pxd +++ b/python/tsfile/tsfile_cpp.pxd @@ -137,7 +137,8 @@ cdef extern from "./tsfile_cwrapper.h": TSDataType * data_types, int column_num, int max_rows); - Tablet tablet_new(const char** column_names, TSDataType * data_types, int column_num); + Tablet tablet_new(char** column_name_list, TSDataType* data_types, + uint32_t column_num, uint32_t max_rows); ErrorCode tablet_add_timestamp(Tablet tablet, uint32_t row_index, int64_t timestamp); ErrorCode tablet_add_value_by_index_int64_t(Tablet tablet, uint32_t row_index, uint32_t column_index, diff --git a/python/tsfile/tsfile_py_cpp.pxd b/python/tsfile/tsfile_py_cpp.pxd index e44bb588d..9ce2f90da 100644 --- a/python/tsfile/tsfile_py_cpp.pxd +++ b/python/tsfile/tsfile_py_cpp.pxd @@ -33,6 +33,7 @@ cdef public api DeviceSchema* to_c_device_schema(object py_schema) cdef public api ColumnSchema* to_c_column_schema(object py_schema) cdef public api TableSchema* to_c_table_schema(object py_schema) cdef public api Tablet to_c_tablet(object tablet) +cdef public api Tablet dataframe_to_c_tablet(object target_name, object dataframe) cdef public api TsRecord to_c_record(object row_record) cdef public api void free_c_table_schema(TableSchema* c_schema) cdef public api void free_c_column_schema(ColumnSchema* c_schema) diff --git a/python/tsfile/tsfile_py_cpp.pyx b/python/tsfile/tsfile_py_cpp.pyx index d9924d7ad..851346853 100644 --- a/python/tsfile/tsfile_py_cpp.pyx +++ b/python/tsfile/tsfile_py_cpp.pyx @@ -19,6 +19,9 @@ from .date_utils import parse_date_to_int from .tsfile_cpp cimport * +import pandas as pd +import numpy as np + from libc.stdlib cimport free from libc.stdlib cimport malloc from libc.string cimport strdup @@ -220,7 +223,7 @@ cdef Tablet to_c_tablet(object tablet): cdef TSDataType * column_types cdef bytes row_bytes cdef char *raw_str - cdef const char* str_ptr + cdef const char * str_ptr cdef Py_ssize_t raw_len if tablet.get_target_name() is not None: @@ -293,7 +296,7 @@ cdef Tablet to_c_tablet(object tablet): for row in range(max_row_num): if value[row] is not None: py_value = value[row] - str_ptr = PyUnicode_AsUTF8AndSize(py_value, &raw_len) + str_ptr = PyUnicode_AsUTF8AndSize(py_value, &raw_len) tablet_add_value_by_index_string_with_len(ctablet, row, col, str_ptr, raw_len) elif data_type == TS_DATATYPE_BLOB: @@ -304,13 +307,168 @@ cdef Tablet to_c_tablet(object tablet): return ctablet +cdef TSDataType pandas_dtype_to_ts_data_type(object dtype): + return to_c_data_type(TSDataTypePy.from_pandas_datatype(dtype)) + +cdef TSDataType check_string_or_blob(TSDataType ts_data_type, object dtype, object column_series): + if ts_data_type == TS_DATATYPE_STRING: + dtype_str = str(dtype) + if dtype == 'object' or dtype_str == "": + first_valid_idx = column_series.first_valid_index() + if first_valid_idx is not None: + first_value = column_series[first_valid_idx] + if isinstance(first_value, bytes): + return TS_DATATYPE_BLOB + return ts_data_type + +cdef Tablet dataframe_to_c_tablet(object target_name, object dataframe): + cdef Tablet ctablet + cdef int max_row_num + cdef TSDataType data_type + cdef int64_t timestamp + cdef const char * device_id_c = NULL + cdef char** columns_names + cdef TSDataType * columns_types + cdef char *raw_str + cdef const char * str_ptr + cdef Py_ssize_t raw_len + cdef int column_num + cdef int i, row + cdef object value + cdef object py_value + cdef object value_bytes + + device_id_bytes = PyUnicode_AsUTF8String(target_name.lower()) + device_id_c = device_id_bytes + df_columns = list(dataframe.columns) + use_id_as_time = False + time_column_name = None + + for col in df_columns: + if col.lower() == 'time': + time_column_name = col + break + + if time_column_name is None: + use_id_as_time = True + + data_columns = [col for col in df_columns if col.lower() != 'time'] + column_num = len(data_columns) + + if column_num == 0: + raise ValueError("DataFrame must have at least one data column besides 'time'") + + max_row_num = len(dataframe) + + column_types_list = [] + for col_name in data_columns: + pandas_dtype = dataframe[col_name].dtype + ds_type = pandas_dtype_to_ts_data_type(pandas_dtype) + ds_type = check_string_or_blob(ds_type, pandas_dtype, dataframe[col_name]) + column_types_list.append(ds_type) + + columns_names = malloc(sizeof(char *) * column_num) + columns_types = malloc(sizeof(TSDataType) * column_num) + + for i in range(column_num): + columns_names[i] = strdup(data_columns[i].lower().encode('utf-8')) + columns_types[i] = column_types_list[i] + + ctablet = _tablet_new_with_target_name(device_id_c, columns_names, columns_types, column_num, + max_row_num) + + free(columns_types) + for i in range(column_num): + free(columns_names[i]) + free(columns_names) + + if use_id_as_time: + for row in range(max_row_num): + timestamp_py = dataframe.index[row] + if pd.isna(timestamp_py): + continue + timestamp = timestamp_py + tablet_add_timestamp(ctablet, row, timestamp) + else: + time_values = dataframe[time_column_name].values + for row in range(max_row_num): + timestamp_py = time_values[row] + if pd.isna(timestamp_py): + continue + timestamp = timestamp_py + tablet_add_timestamp(ctablet, row, timestamp) + + for col in range(column_num): + col_name = data_columns[col] + data_type = column_types_list[col] + column_values = dataframe[col_name].values + + # BOOLEAN + if data_type == TS_DATATYPE_BOOLEAN: + for row in range(max_row_num): + value = column_values[row] + if not pd.isna(value): + tablet_add_value_by_index_bool(ctablet, row, col, value) + # INT32 + elif data_type == TS_DATATYPE_INT32: + for row in range(max_row_num): + value = column_values[row] + if not pd.isna(value): + tablet_add_value_by_index_int32_t(ctablet, row, col, value) + # INT64 + elif data_type == TS_DATATYPE_INT64 or data_type == TS_DATATYPE_TIMESTAMP: + for row in range(max_row_num): + value = column_values[row] + if not pd.isna(value): + tablet_add_value_by_index_int64_t(ctablet, row, col, value) + # FLOAT + elif data_type == TS_DATATYPE_FLOAT: + for row in range(max_row_num): + value = column_values[row] + if not pd.isna(value): + tablet_add_value_by_index_float(ctablet, row, col, value) + # DOUBLE + elif data_type == TS_DATATYPE_DOUBLE: + for row in range(max_row_num): + value = column_values[row] + if not pd.isna(value): + tablet_add_value_by_index_double(ctablet, row, col, value) + # DATE + elif data_type == TS_DATATYPE_DATE: + for row in range(max_row_num): + value = column_values[row] + if not pd.isna(value): + tablet_add_value_by_index_int32_t(ctablet, row, col, parse_date_to_int(value)) + # STRING or TEXT + elif data_type == TS_DATATYPE_STRING or data_type == TS_DATATYPE_TEXT: + for row in range(max_row_num): + value = column_values[row] + if not pd.isna(value): + py_value = str(value) + str_ptr = PyUnicode_AsUTF8AndSize(py_value, &raw_len) + tablet_add_value_by_index_string_with_len(ctablet, row, col, str_ptr, raw_len) + # BLOB + elif data_type == TS_DATATYPE_BLOB: + for row in range(max_row_num): + value = column_values[row] + if not pd.isna(value): + if isinstance(value, bytes): + PyBytes_AsStringAndSize(value, &raw_str, &raw_len) + tablet_add_value_by_index_string_with_len(ctablet, row, col, raw_str, raw_len) + else: + value_bytes = bytes(value) + PyBytes_AsStringAndSize(value_bytes, &raw_str, &raw_len) + tablet_add_value_by_index_string_with_len(ctablet, row, col, raw_str, raw_len) + + return ctablet + cdef TsRecord to_c_record(object row_record): cdef int field_num = row_record.get_fields_num() cdef int64_t timestamp = row_record.get_timestamp() cdef bytes device_id_bytes = PyUnicode_AsUTF8String(row_record.get_device_id()) - cdef const char* device_id = device_id_bytes - cdef const char* str_ptr - cdef char* blob_ptr + cdef const char * device_id = device_id_bytes + cdef const char * str_ptr + cdef char * blob_ptr cdef Py_ssize_t str_len cdef TsRecord record cdef int i @@ -320,9 +478,11 @@ cdef TsRecord to_c_record(object row_record): field = row_record.get_fields()[i] data_type = to_c_data_type(field.get_data_type()) if data_type == TS_DATATYPE_BOOLEAN: - _insert_data_into_ts_record_by_name_bool(record, PyUnicode_AsUTF8(field.get_field_name()), field.get_bool_value()) + _insert_data_into_ts_record_by_name_bool(record, PyUnicode_AsUTF8(field.get_field_name()), + field.get_bool_value()) elif data_type == TS_DATATYPE_INT32 or data_type == TS_DATATYPE_DATE: - _insert_data_into_ts_record_by_name_int32_t(record, PyUnicode_AsUTF8(field.get_field_name()), field.get_int_value()) + _insert_data_into_ts_record_by_name_int32_t(record, PyUnicode_AsUTF8(field.get_field_name()), + field.get_int_value()) elif data_type == TS_DATATYPE_INT64: _insert_data_into_ts_record_by_name_int64_t(record, PyUnicode_AsUTF8(field.get_field_name()), field.get_long_value()) @@ -333,15 +493,17 @@ cdef TsRecord to_c_record(object row_record): _insert_data_into_ts_record_by_name_double(record, PyUnicode_AsUTF8(field.get_field_name()), field.get_double_value()) elif data_type == TS_DATATYPE_FLOAT: - _insert_data_into_ts_record_by_name_float(record, PyUnicode_AsUTF8(field.get_field_name()), field.get_float_value()) + _insert_data_into_ts_record_by_name_float(record, PyUnicode_AsUTF8(field.get_field_name()), + field.get_float_value()) elif data_type == TS_DATATYPE_TEXT or data_type == TS_DATATYPE_STRING: - str_ptr = PyUnicode_AsUTF8AndSize(field.get_string_value(), &str_len) - _insert_data_into_ts_record_by_name_string_with_len(record, PyUnicode_AsUTF8(field.get_field_name()), str_ptr, str_len) + str_ptr = PyUnicode_AsUTF8AndSize(field.get_string_value(), &str_len) + _insert_data_into_ts_record_by_name_string_with_len(record, PyUnicode_AsUTF8(field.get_field_name()), + str_ptr, str_len) elif data_type == TS_DATATYPE_BLOB: if PyBytes_AsStringAndSize(field.get_string_value(), &blob_ptr, &str_len) < 0: raise ValueError("blob not legal") _insert_data_into_ts_record_by_name_string_with_len(record, PyUnicode_AsUTF8(field.get_field_name()), - blob_ptr, str_len) + blob_ptr, str_len) return record # Free c structs' space diff --git a/python/tsfile/tsfile_table_writer.py b/python/tsfile/tsfile_table_writer.py index 281933606..56f9c3417 100644 --- a/python/tsfile/tsfile_table_writer.py +++ b/python/tsfile/tsfile_table_writer.py @@ -15,9 +15,21 @@ # specific language governing permissions and limitations # under the License. # +import pandas as pd from tsfile import TableSchema, Tablet, TableNotExistError from tsfile import TsFileWriter +from tsfile.constants import TSDataType +from tsfile.exceptions import ColumnNotExistError, TypeMismatchError + +def check_string_or_blob(ts_data_type: TSDataType, dtype, column_series: pd.Series) -> TSDataType: + if ts_data_type == TSDataType.STRING and (dtype == 'object' or str(dtype) == ""): + first_valid_idx = column_series.first_valid_index() + if first_valid_idx is not None: + first_value = column_series[first_valid_idx] + if isinstance(first_value, bytes): + return TSDataType.BLOB + return ts_data_type class TsFileTableWriter: @@ -31,7 +43,7 @@ class TsFileTableWriter: according to that schema, and serialize this data into a TsFile. """ - def __init__(self, path: str, table_schema: TableSchema, memory_threshold = 128 * 1024 * 1024): + def __init__(self, path: str, table_schema: TableSchema, memory_threshold=128 * 1024 * 1024): """ :param path: The path of tsfile, will create if it doesn't exist. :param table_schema: describes the schema of the tables they want to write. @@ -39,7 +51,7 @@ def __init__(self, path: str, table_schema: TableSchema, memory_threshold = 128 """ self.writer = TsFileWriter(path, memory_threshold) self.writer.register_table(table_schema) - self.exclusive_table_name_ = table_schema.get_table_name() + self.tableSchema = table_schema def write_table(self, tablet: Tablet): """ @@ -49,11 +61,66 @@ def write_table(self, tablet: Tablet): :raise: TableNotExistError if table does not exist or tablet's table_name does not match tableschema. """ if tablet.get_target_name() is None: - tablet.set_table_name(self.exclusive_table_name_) - elif self.exclusive_table_name_ is not None and tablet.get_target_name() != self.exclusive_table_name_: + tablet.set_table_name(self.tableSchema.get_table_name()) + elif (self.tableSchema.get_table_name() is not None + and tablet.get_target_name() != self.tableSchema.get_table_name()): raise TableNotExistError self.writer.write_table(tablet) + def write_dataframe(self, dataframe: pd.DataFrame): + """ + Write a pandas DataFrame into table in tsfile. + :param dataframe: pandas DataFrame with 'time' column and data columns matching schema. + :return: no return value. + :raise: ValueError if dataframe is None or is empty. + :raise: ColumnNotExistError if DataFrame columns don't match schema. + :raise: TypeMismatchError if DataFrame column types are incompatible with schema. + """ + if dataframe is None or dataframe.empty: + raise ValueError("DataFrame cannot be None or empty") + + # Create mapping from lowercase column name to original column name + df_column_name_map = {col.lower(): col for col in dataframe.columns if col.lower() != 'time'} + df_columns = list(df_column_name_map.keys()) + + schema_column_names = set(self.tableSchema.get_column_names()) + df_columns_set = set(df_columns) + + extra_columns = df_columns_set - schema_column_names + if extra_columns: + raise ColumnNotExistError( + code=50, + context=f"DataFrame has columns not in schema: {', '.join(sorted(extra_columns))}" + ) + + schema_column_map = { + col.get_column_name(): col for col in self.tableSchema.get_columns() + } + + type_mismatches = [] + for col_name in df_columns: + df_col_name_original = df_column_name_map[col_name] + + df_dtype = dataframe[df_col_name_original].dtype + df_ts_type = TSDataType.from_pandas_datatype(df_dtype) + df_ts_type = check_string_or_blob(df_ts_type, df_dtype, dataframe[df_col_name_original]) + + schema_col = schema_column_map[col_name] + expected_ts_type = schema_col.get_data_type() + + if df_ts_type != expected_ts_type: + type_mismatches.append( + f"Column '{col_name}': expected {expected_ts_type.name}, got {df_ts_type.name}" + ) + + if type_mismatches: + raise TypeMismatchError( + code=27, + context=f"Type mismatches: {'; '.join(type_mismatches)}" + ) + + self.writer.write_dataframe(self.tableSchema.get_table_name(), dataframe) + def close(self): """ Close TsFileTableWriter and will flush data automatically. diff --git a/python/tsfile/tsfile_writer.pyx b/python/tsfile/tsfile_writer.pyx index 201991952..c558984e1 100644 --- a/python/tsfile/tsfile_writer.pyx +++ b/python/tsfile/tsfile_writer.pyx @@ -15,21 +15,21 @@ # specific language governing permissions and limitations # under the License. # - -#cython: language_level=3 - -from .tsfile_cpp cimport * -from .tsfile_py_cpp cimport * +import pandas from tsfile.row_record import RowRecord -from tsfile.schema import TimeseriesSchema as TimeseriesSchemaPy, DeviceSchema as DeviceSchemaPy from tsfile.schema import TableSchema as TableSchemaPy +from tsfile.schema import TimeseriesSchema as TimeseriesSchemaPy, DeviceSchema as DeviceSchemaPy from tsfile.tablet import Tablet as TabletPy +from .tsfile_cpp cimport * +from .tsfile_py_cpp cimport * + +#cython: language_level=3 cdef class TsFileWriterPy: cdef TsFileWriter writer - def __init__(self, pathname:str, memory_threshold:int = 128 * 1024 * 1024): + def __init__(self, pathname: str, memory_threshold: int = 128 * 1024 * 1024): self.writer = tsfile_writer_new_c(pathname, memory_threshold) def register_timeseries(self, device_name : str, timeseries_schema : TimeseriesSchemaPy): @@ -38,7 +38,7 @@ cdef class TsFileWriterPy: device_name: device name of the timeseries timeseries_schema: measurement's name/datatype/encoding/compressor """ - cdef TimeseriesSchema* c_schema = to_c_timeseries_schema(timeseries_schema) + cdef TimeseriesSchema * c_schema = to_c_timeseries_schema(timeseries_schema) cdef ErrorCode errno try: errno = tsfile_writer_register_timeseries_py_cpp(self.writer, device_name, c_schema) @@ -51,7 +51,7 @@ cdef class TsFileWriterPy: Register a device with tsfile writer. device_schema: the device definition, including device_name, some measurements' schema. """ - cdef DeviceSchema* device_schema_c = to_c_device_schema(device_schema) + cdef DeviceSchema * device_schema_c = to_c_device_schema(device_schema) cdef ErrorCode errno try: errno = tsfile_writer_register_device_py_cpp(self.writer, device_schema_c) @@ -64,7 +64,7 @@ cdef class TsFileWriterPy: Register a table with tsfile writer. table_schema: the table definition, include table_name, columns' schema. """ - cdef TableSchema* c_schema = to_c_table_schema(table_schema) + cdef TableSchema * c_schema = to_c_table_schema(table_schema) cdef ErrorCode errno try: errno = tsfile_writer_register_table_py_cpp(self.writer, c_schema) @@ -86,6 +86,15 @@ cdef class TsFileWriterPy: finally: free_c_tablet(ctablet) + def write_dataframe(self, target_table: str, dataframe: pandas.DataFrame): + cdef Tablet ctablet = dataframe_to_c_tablet(target_table, dataframe) + cdef ErrorCode errno + try: + errno = _tsfile_writer_write_table(self.writer, ctablet) + check_error(errno) + finally: + free_c_tablet(ctablet) + def write_row_record(self, record : RowRecord): """ Write a record into tsfile with tsfile writer. @@ -143,4 +152,3 @@ cdef class TsFileWriterPy: def __exit__(self, exc_type, exc_val, exc_tb): self.close() - diff --git a/python/tsfile/utils.py b/python/tsfile/utils.py index d27a0fae3..f3c2adc57 100644 --- a/python/tsfile/utils.py +++ b/python/tsfile/utils.py @@ -20,9 +20,12 @@ import numpy as np import pandas as pd +from pandas.core.dtypes.common import is_integer_dtype +from tsfile import ColumnSchema, TableSchema, ColumnCategory, TSDataType from tsfile.exceptions import TableNotExistError, ColumnNotExistError from tsfile.tsfile_reader import TsFileReaderPy +from tsfile.tsfile_table_writer import TsFileTableWriter, check_string_or_blob def to_dataframe(file_path: str, @@ -159,3 +162,106 @@ def _gen(is_iterator: bool) -> Iterator[pd.DataFrame]: return df else: return pd.DataFrame() + + +def dataframe_to_tsfile(dataframe: pd.DataFrame, + file_path: str, + table_name: Optional[str] = None, + time_column: Optional[str] = None, + tag_column: Optional[list[str]] = None, + ): + """ + Write a pandas DataFrame to a TsFile by inferring the table schema from the DataFrame. + + This function automatically infers the table schema based on the DataFrame's column + names and data types, then writes the data to a TsFile. + + Parameters + ---------- + dataframe : pd.DataFrame + The pandas DataFrame to write to TsFile. + - If a 'time' column (case-insensitive) exists, it will be used as the time column. + - Otherwise, the DataFrame index will be used as timestamps. + - All other columns will be treated as data columns. + + file_path : str + Path to the TsFile to write. Will be created if it doesn't exist. + + table_name : Optional[str], default None + Name of the table. If None, defaults to "table". + + time_column : Optional[str], default None + Name of the time column. If None, will look for a column named 'time' (case-insensitive), + or use the DataFrame index if no 'time' column is found. + + tag_column : Optional[list[str]], default None + List of column names to be treated as TAG columns. All other columns will be FIELD columns. + If None, all columns are treated as FIELD columns. + + Returns + ------- + None + + Raises + ------ + ValueError + If the DataFrame is empty or has no data columns. + """ + if dataframe is None or dataframe.empty: + raise ValueError("DataFrame cannot be None or empty") + + if table_name is None: + table_name = "table" + + time_col_name = None + if time_column is not None: + if time_column not in dataframe.columns: + raise ValueError(f"Time column '{time_column}' not found in DataFrame") + if not is_integer_dtype(dataframe[time_column].dtype): + raise TypeError( + f"Time column '{time_column}' must be integer type (int64 or int), got {dataframe[time_column].dtype}") + time_col_name = time_column + else: + for col in dataframe.columns: + if col.lower() == 'time': + if is_integer_dtype(dataframe[col].dtype): + time_col_name = col + break + else: + raise TypeError( + f"Time column '{col}' must be integer type (int64 or int), got {dataframe[col].dtype}") + + data_columns = [col for col in dataframe.columns if col != time_col_name] + + if len(data_columns) == 0: + raise ValueError("DataFrame must have at least one data column besides the time column") + + tag_columns_lower = [] + if tag_column is not None: + for tag_col in tag_column: + if tag_col not in dataframe.columns: + raise ValueError(f"Tag column '{tag_col}' not found in DataFrame") + tag_columns_lower.append(tag_col.lower()) + + column_schemas = [] + for col_name in data_columns: + col_dtype = dataframe[col_name].dtype + ts_data_type = TSDataType.from_pandas_datatype(col_dtype) + ts_data_type = check_string_or_blob(ts_data_type, col_dtype, dataframe[col_name]) + + if col_name.lower() in tag_columns_lower: + category = ColumnCategory.TAG + else: + category = ColumnCategory.FIELD + + column_schemas.append(ColumnSchema(col_name, ts_data_type, category)) + + table_schema = TableSchema(table_name, column_schemas) + + if time_col_name is not None and time_col_name != 'time': + df_to_write = dataframe.rename(columns={time_col_name: 'time'}) + else: + df_to_write = dataframe + + with TsFileTableWriter(file_path, table_schema) as writer: + writer.write_dataframe(df_to_write) From 64e465a361990226c78fcfe5a188fd8c39c93063 Mon Sep 17 00:00:00 2001 From: ColinLee Date: Fri, 16 Jan 2026 11:26:31 +0800 Subject: [PATCH 02/13] fix sort data. --- python/tests/test_to_tsfile.py | 30 +++++++++++++++++++++++++++- python/tsfile/tsfile_table_writer.py | 20 ++++++++++++++++++- python/tsfile/utils.py | 6 ++++-- 3 files changed, 52 insertions(+), 4 deletions(-) diff --git a/python/tests/test_to_tsfile.py b/python/tests/test_to_tsfile.py index 0928f1a94..7c1fb84ca 100644 --- a/python/tests/test_to_tsfile.py +++ b/python/tests/test_to_tsfile.py @@ -205,7 +205,7 @@ def test_dataframe_to_tsfile_default_table_name(): dataframe_to_tsfile(df, tsfile_path) - df_read = to_dataframe(tsfile_path, table_name="table") + df_read = to_dataframe(tsfile_path, table_name="test_dataframe_to_tsfile_default_name") assert df_read.shape == (10, 2) finally: if os.path.exists(tsfile_path): @@ -343,3 +343,31 @@ def test_dataframe_to_tsfile_string_vs_blob(): finally: if os.path.exists(tsfile_path): os.remove(tsfile_path) + + +def test_dataframe_to_tsfile_tag_time_unsorted(): + tsfile_path = "test_dataframe_to_tsfile_tag_time_unsorted.tsfile" + try: + if os.path.exists(tsfile_path): + os.remove(tsfile_path) + + df = pd.DataFrame({ + 'time': [30, 10, 20, 50, 40, 15, 25, 35, 5, 45], + 'device': ['device1', 'device1', 'device1', 'device2', 'device2', 'device1', 'device1', 'device2', + 'device1', 'device2'], + 'value': [i * 1.5 for i in range(10)] + }) + + dataframe_to_tsfile(df, tsfile_path, table_name="test_table", tag_column=["device"]) + + df_read = to_dataframe(tsfile_path, table_name="test_table") + df_expected = df.sort_values(by=['device', 'time']).reset_index(drop=True) + df_expected = convert_to_nullable_types(df_expected) + + assert df_read.shape == (10, 3) + assert df_read["device"].equals(df_expected["device"]) + assert df_read["time"].equals(df_expected["time"]) + assert df_read["value"].equals(df_expected["value"]) + finally: + if os.path.exists(tsfile_path): + os.remove(tsfile_path) diff --git a/python/tsfile/tsfile_table_writer.py b/python/tsfile/tsfile_table_writer.py index 56f9c3417..5b33f9b2f 100644 --- a/python/tsfile/tsfile_table_writer.py +++ b/python/tsfile/tsfile_table_writer.py @@ -18,7 +18,7 @@ import pandas as pd from tsfile import TableSchema, Tablet, TableNotExistError -from tsfile import TsFileWriter +from tsfile import TsFileWriter, ColumnCategory from tsfile.constants import TSDataType from tsfile.exceptions import ColumnNotExistError, TypeMismatchError @@ -119,6 +119,24 @@ def write_dataframe(self, dataframe: pd.DataFrame): context=f"Type mismatches: {'; '.join(type_mismatches)}" ) + tag_columns = [] + for col in self.tableSchema.get_columns(): + if col.get_category() == ColumnCategory.TAG: + tag_col_name = col.get_column_name() + if tag_col_name in df_column_name_map: + tag_columns.append(df_column_name_map[tag_col_name]) + + time_column = None + for col in dataframe.columns: + if col.lower() == 'time': + time_column = col + break + + if time_column: + sort_by = tag_columns.copy() + sort_by.append(time_column) + dataframe = dataframe.sort_values(by=sort_by) + self.writer.write_dataframe(self.tableSchema.get_table_name(), dataframe) def close(self): diff --git a/python/tsfile/utils.py b/python/tsfile/utils.py index f3c2adc57..567c4fe19 100644 --- a/python/tsfile/utils.py +++ b/python/tsfile/utils.py @@ -15,6 +15,7 @@ # specific language governing permissions and limitations # under the License. # +from pathlib import Path from typing import Iterator, Union from typing import Optional @@ -188,7 +189,7 @@ def dataframe_to_tsfile(dataframe: pd.DataFrame, Path to the TsFile to write. Will be created if it doesn't exist. table_name : Optional[str], default None - Name of the table. If None, defaults to "table". + Name of the table. If None, defaults to tsfile file name. time_column : Optional[str], default None Name of the time column. If None, will look for a column named 'time' (case-insensitive), @@ -211,7 +212,8 @@ def dataframe_to_tsfile(dataframe: pd.DataFrame, raise ValueError("DataFrame cannot be None or empty") if table_name is None: - table_name = "table" + filename = Path(file_path).stem + table_name = filename time_col_name = None if time_column is not None: From e405cb35a2d8b99c19e9e1bc127f4712cc3aa52e Mon Sep 17 00:00:00 2001 From: ColinLee Date: Mon, 9 Feb 2026 02:31:13 +0800 Subject: [PATCH 03/13] tmp code. --- cpp/src/cwrapper/tsfile_cwrapper.h | 7 +- python/tests/test_dataframe.py | 101 +++++++++++++++++--- python/tests/test_write_and_read.py | 2 +- python/tsfile/constants.py | 22 ++++- python/tsfile/exceptions.py | 2 +- python/tsfile/schema.py | 44 ++++++++- python/tsfile/tsfile_cpp.pxd | 5 +- python/tsfile/tsfile_py_cpp.pxd | 2 +- python/tsfile/tsfile_py_cpp.pyx | 63 ++++++++---- python/tsfile/tsfile_reader.pyx | 12 ++- python/tsfile/tsfile_table_writer.py | 137 ++++++++++++++++----------- python/tsfile/tsfile_writer.pyx | 4 +- 12 files changed, 300 insertions(+), 101 deletions(-) diff --git a/cpp/src/cwrapper/tsfile_cwrapper.h b/cpp/src/cwrapper/tsfile_cwrapper.h index d9fe6bb85..643b4e52b 100644 --- a/cpp/src/cwrapper/tsfile_cwrapper.h +++ b/cpp/src/cwrapper/tsfile_cwrapper.h @@ -71,7 +71,12 @@ typedef enum { TS_COMPRESSION_INVALID = 255 } CompressionType; -typedef enum column_category { TAG = 0, FIELD = 1 } ColumnCategory; +typedef enum column_category { + TAG = 0, + FIELD = 1, + ATTRIBUTE = 2, + TIME = 3 +} ColumnCategory; typedef struct column_schema { char* column_name; diff --git a/python/tests/test_dataframe.py b/python/tests/test_dataframe.py index 5138968a2..e3c923a3c 100644 --- a/python/tests/test_dataframe.py +++ b/python/tests/test_dataframe.py @@ -16,11 +16,11 @@ # under the License. # import os +from datetime import date import numpy as np import pandas as pd import pytest -from pandas.core.dtypes.common import is_integer_dtype from tsfile import ColumnSchema, TableSchema, TSDataType from tsfile import TsFileTableWriter, ColumnCategory @@ -59,7 +59,7 @@ def test_write_dataframe_basic(): try: if os.path.exists(tsfile_path): os.remove(tsfile_path) - + with TsFileTableWriter(tsfile_path, table) as writer: df = pd.DataFrame({ 'time': [i for i in range(100)], @@ -90,7 +90,7 @@ def test_write_dataframe_with_index(): try: if os.path.exists(tsfile_path): os.remove(tsfile_path) - + with TsFileTableWriter(tsfile_path, table) as writer: df = pd.DataFrame({ 'device': [f"device{i}" for i in range(50)], @@ -120,7 +120,7 @@ def test_write_dataframe_case_insensitive(): try: if os.path.exists(tsfile_path): os.remove(tsfile_path) - + with TsFileTableWriter(tsfile_path, table) as writer: df = pd.DataFrame({ 'Time': [i for i in range(30)], # Capital T @@ -149,7 +149,7 @@ def test_write_dataframe_column_not_in_schema(): try: if os.path.exists(tsfile_path): os.remove(tsfile_path) - + with TsFileTableWriter(tsfile_path, table) as writer: df = pd.DataFrame({ 'time': [i for i in range(10)], @@ -157,9 +157,8 @@ def test_write_dataframe_column_not_in_schema(): 'value': [i * 1.0 for i in range(10)], 'extra_column': [i for i in range(10)] # Not in schema }) - with pytest.raises(ColumnNotExistError) as exc_info: + with pytest.raises(ColumnNotExistError): writer.write_dataframe(df) - assert "extra_column" in str(exc_info.value) finally: if os.path.exists(tsfile_path): os.remove(tsfile_path) @@ -172,15 +171,14 @@ def test_write_dataframe_type_mismatch(): try: if os.path.exists(tsfile_path): os.remove(tsfile_path) - + with TsFileTableWriter(tsfile_path, table) as writer: df = pd.DataFrame({ 'time': [i for i in range(10)], - 'value': [i for i in range(10)] # INT64, but schema expects STRING + 'value': [i for i in range(10)] }) with pytest.raises(TypeMismatchError) as exc_info: writer.write_dataframe(df) - assert "Type mismatches" in str(exc_info.value) finally: if os.path.exists(tsfile_path): os.remove(tsfile_path) @@ -194,12 +192,15 @@ def test_write_dataframe_all_datatypes(): ColumnSchema("float_col", TSDataType.FLOAT, ColumnCategory.FIELD), ColumnSchema("double_col", TSDataType.DOUBLE, ColumnCategory.FIELD), ColumnSchema("string_col", TSDataType.STRING, ColumnCategory.FIELD), - ColumnSchema("blob_col", TSDataType.BLOB, ColumnCategory.FIELD)]) + ColumnSchema("blob_col", TSDataType.BLOB, ColumnCategory.FIELD), + ColumnSchema("text_col", TSDataType.TEXT, ColumnCategory.FIELD), + ColumnSchema("date_col", TSDataType.DATE, ColumnCategory.FIELD), + ColumnSchema("timestamp_col", TSDataType.TIMESTAMP, ColumnCategory.FIELD)]) tsfile_path = "test_write_dataframe_all_types.tsfile" try: if os.path.exists(tsfile_path): os.remove(tsfile_path) - + with TsFileTableWriter(tsfile_path, table) as writer: df = pd.DataFrame({ 'time': [i for i in range(50)], @@ -209,20 +210,27 @@ def test_write_dataframe_all_datatypes(): 'float_col': pd.Series([i * 1.5 for i in range(50)], dtype='float32'), 'double_col': [i * 2.5 for i in range(50)], 'string_col': [f"str{i}" for i in range(50)], - 'blob_col': [f"blob{i}".encode('utf-8') for i in range(50)] + 'blob_col': [f"blob{i}".encode('utf-8') for i in range(50)], + 'text_col': [f"text{i}" for i in range(50)], + 'date_col': [date(2025, i % 11 + 1, i % 20 + 1) for i in range(50)], + 'timestamp_col': [i for i in range(50)] }) writer.write_dataframe(df) df_read = to_dataframe(tsfile_path, table_name="test_table") df_read = df_read.sort_values('time').reset_index(drop=True) df_sorted = convert_to_nullable_types(df.sort_values('time').reset_index(drop=True)) - assert df_read.shape == (50, 8) + assert df_read.shape == (50, 11) assert df_read["bool_col"].equals(df_sorted["bool_col"]) assert df_read["int32_col"].equals(df_sorted["int32_col"]) assert df_read["int64_col"].equals(df_sorted["int64_col"]) assert np.allclose(df_read["float_col"], df_sorted["float_col"]) assert np.allclose(df_read["double_col"], df_sorted["double_col"]) assert df_read["string_col"].equals(df_sorted["string_col"]) + assert df_read["blob_col"].equals(df_sorted["blob_col"]) + assert df_read["text_col"].equals(df_sorted["text_col"]) + assert df_read["date_col"].equals(df_sorted["date_col"]) + assert df_read["timestamp_col"].equals(df_sorted["timestamp_col"]) for i in range(50): assert df_read["blob_col"].iloc[i] == df_sorted["blob_col"].iloc[i] finally: @@ -230,6 +238,67 @@ def test_write_dataframe_all_datatypes(): os.remove(tsfile_path) +def test_write_dataframe_schema_time_column(): + table = TableSchema("test_table", + [ColumnSchema("time", TSDataType.TIMESTAMP, ColumnCategory.TIME), + ColumnSchema("device", TSDataType.STRING, ColumnCategory.TAG), + ColumnSchema("value", TSDataType.DOUBLE, ColumnCategory.FIELD)]) + tsfile_path = "test_write_dataframe_schema_time.tsfile" + try: + if os.path.exists(tsfile_path): + os.remove(tsfile_path) + + with TsFileTableWriter(tsfile_path, table) as writer: + df = pd.DataFrame({ + 'time': [i * 100 for i in range(50)], + 'device': [f"device{i}" for i in range(50)], + 'value': [i * 1.5 for i in range(50)] + }) + writer.write_dataframe(df) + + df_read = to_dataframe(tsfile_path, table_name="test_table") + df_read = df_read.sort_values('time').reset_index(drop=True) + df_sorted = convert_to_nullable_types(df.sort_values('time').reset_index(drop=True)) + assert df_read.shape == (50, 3) + assert df_read["time"].equals(df_sorted["time"]) + assert df_read["device"].equals(df_sorted["device"]) + assert df_read["value"].equals(df_sorted["value"]) + finally: + if os.path.exists(tsfile_path): + os.remove(tsfile_path) + + +def test_write_dataframe_schema_time_and_dataframe_time(): + table = TableSchema("test_table", + [ColumnSchema("device", TSDataType.STRING, ColumnCategory.TAG), + ColumnSchema("value", TSDataType.DOUBLE, ColumnCategory.FIELD)]) + tsfile_path = "test_write_dataframe_schema_and_df_time.tsfile" + try: + if os.path.exists(tsfile_path): + os.remove(tsfile_path) + + with TsFileTableWriter(tsfile_path, table) as writer: + df = pd.DataFrame({ + 'Time': [i for i in range(30)], + 'device': [f"dev{i}" for i in range(30)], + 'value': [float(i) for i in range(30)] + }) + writer.write_dataframe(df) + + df_read = to_dataframe(tsfile_path, table_name="test_table") + df_read = df_read.sort_values('time').reset_index(drop=True) + df_sorted = convert_to_nullable_types( + df.sort_values('Time').rename(columns=str.lower).reset_index(drop=True) + ) + assert df_read.shape == (30, 3) + assert df_read["time"].equals(df_sorted["time"]) + assert df_read["device"].equals(df_sorted["device"]) + assert df_read["value"].equals(df_sorted["value"]) + finally: + if os.path.exists(tsfile_path): + os.remove(tsfile_path) + + def test_write_dataframe_empty(): table = TableSchema("test_table", [ColumnSchema("value", TSDataType.DOUBLE, ColumnCategory.FIELD)]) @@ -237,7 +306,7 @@ def test_write_dataframe_empty(): try: if os.path.exists(tsfile_path): os.remove(tsfile_path) - + with TsFileTableWriter(tsfile_path, table) as writer: df = pd.DataFrame({ 'time': [], @@ -249,3 +318,5 @@ def test_write_dataframe_empty(): finally: if os.path.exists(tsfile_path): os.remove(tsfile_path) + + diff --git a/python/tests/test_write_and_read.py b/python/tests/test_write_and_read.py index 1ffc22b99..3cef99c4a 100644 --- a/python/tests/test_write_and_read.py +++ b/python/tests/test_write_and_read.py @@ -84,7 +84,7 @@ def test_row_record_write_and_read(): assert result.get_value_by_index(4) == row_num * 2 assert result.get_value_by_index(5) == f"string_value_{row_num}" assert result.get_value_by_index(6) == f"text_value_{row_num}" - assert result.get_value_by_index(7) == f"blob_data_{row_num}" + assert result.get_value_by_index(7) == f"blob_data_{row_num}".encode('utf-8') assert result.get_value_by_index(8) == date(2025, 1, row_num % 20 + 1) assert result.get_value_by_index(9) == row_num diff --git a/python/tsfile/constants.py b/python/tsfile/constants.py index d4f87200d..6f233e271 100644 --- a/python/tsfile/constants.py +++ b/python/tsfile/constants.py @@ -15,10 +15,10 @@ # specific language governing permissions and limitations # under the License. # +from datetime import datetime from enum import unique, IntEnum import numpy as np - @unique class TSDataType(IntEnum): BOOLEAN = 0 @@ -32,6 +32,11 @@ class TSDataType(IntEnum): BLOB = 10 STRING = 11 + def is_compatible_with(self, other: 'TSDataType') -> bool: + if self == other: + return True + return other in _TSDATATYPE_COMPATIBLE_SOURCES.get(self, ()) + def to_py_type(self): if self == TSDataType.BOOLEAN: return bool @@ -73,7 +78,7 @@ def to_pandas_dtype(self): elif self == TSDataType.DATE: return "object" elif self == TSDataType.BLOB: - return "bytes" + return "object" else: raise ValueError(f"Unknown data type: {self}") @@ -145,10 +150,19 @@ def from_pandas_datatype(cls, dtype): if dtype_str.startswith('datetime64'): return cls.TIMESTAMP - + return cls.STRING +_TSDATATYPE_COMPATIBLE_SOURCES = { + TSDataType.INT64: (TSDataType.INT32, TSDataType.TIMESTAMP), + TSDataType.STRING: (TSDataType.TEXT,), + TSDataType.TEXT: (TSDataType.STRING,), + TSDataType.DOUBLE: (TSDataType.FLOAT,), + TSDataType.TIMESTAMP: (TSDataType.INT64, TSDataType.INT32) +} + + @unique @@ -186,3 +200,5 @@ class Compressor(IntEnum): class ColumnCategory(IntEnum): TAG = 0 FIELD = 1 + ATTRIBUTE = 2 + TIME = 3 diff --git a/python/tsfile/exceptions.py b/python/tsfile/exceptions.py index 2a3df283a..a02f392ce 100644 --- a/python/tsfile/exceptions.py +++ b/python/tsfile/exceptions.py @@ -23,7 +23,7 @@ class LibraryError(Exception): def __init__(self, code=None, context=None): self.code = code if code is not None else self._default_code self.message = context if context is not None else self._default_message - super().__init__(f"[{code}] {self.message}") + super().__init__(f"[{self.code}] {self.message}") def __str__(self): return f"{self.code}: {self.message}" diff --git a/python/tsfile/schema.py b/python/tsfile/schema.py index 3aa1313cd..379307da5 100644 --- a/python/tsfile/schema.py +++ b/python/tsfile/schema.py @@ -53,7 +53,6 @@ def __repr__(self): return f"TimeseriesSchema({self.timeseries_name}, {self.data_type.name}, {self.encoding_type.name}, {self.compression_type.name})" - class DeviceSchema: """Represents a device entity containing multiple time series.""" @@ -73,6 +72,7 @@ def get_timeseries_list(self): def __repr__(self): return f"DeviceSchema({self.device_name}, {self.timeseries_list})" + class ColumnSchema: """Defines schema for a table column (name, datatype, category).""" @@ -85,6 +85,9 @@ def __init__(self, column_name: str, data_type: TSDataType, category: ColumnCate self.column_name = column_name.lower() if data_type is None: raise ValueError("Data type cannot be None") + if category == ColumnCategory.TIME and data_type not in [TSDataType.INT64, TSDataType.TIMESTAMP]: + raise TypeError(f"Time Column should have type : INT64/Timestamp," + f" but got {data_type}") self.data_type = data_type self.category = category @@ -105,6 +108,7 @@ class TableSchema: """Schema definition for a table structure.""" table_name = None columns = None + time_column = None def __init__(self, table_name: str, columns: List[ColumnSchema]): if table_name is None or len(table_name) == 0: @@ -113,6 +117,14 @@ def __init__(self, table_name: str, columns: List[ColumnSchema]): if len(columns) == 0: raise ValueError("Columns cannot be empty") self.columns = columns + for column in self.columns: + if column.get_category() == ColumnCategory.TIME: + if self.time_column is not None: + raise ValueError( + f"Table '{self.table_name}' cannot have multiple time columns: " + f"'{self.time_column.name}' and '{column.name}'" + ) + self.time_column = column def get_table_name(self): return self.table_name @@ -120,9 +132,39 @@ def get_table_name(self): def get_columns(self): return self.columns + def get_column(self, column_name: str): + name_lower = column_name.lower() + for col in self.columns: + if col.get_column_name() == name_lower: + return col + return None + + def get_time_column(self): + return self.time_column + def get_column_names(self): return [name.get_column_name() for name in self.columns] + def get_field_columns(self): + return [ + column + for column in self.columns + if column.get_category() == ColumnCategory.FIELD + ] + + def get_tag_columns(self): + return [ + column + for column in self.columns + if column.get_category() == ColumnCategory.TAG + ] + + + def add_column(self, column: ColumnSchema): + if column.get_category() == ColumnCategory.TIME: + self.time_column = column + self.columns.append(column) + def __repr__(self) -> str: return f"TableSchema({self.table_name}, {self.columns})" diff --git a/python/tsfile/tsfile_cpp.pxd b/python/tsfile/tsfile_cpp.pxd index ab915fefe..9c65fb26f 100644 --- a/python/tsfile/tsfile_cpp.pxd +++ b/python/tsfile/tsfile_cpp.pxd @@ -76,7 +76,10 @@ cdef extern from "./tsfile_cwrapper.h": ctypedef enum ColumnCategory: TAG = 0, - FIELD = 1 + FIELD = 1, + ATTRIBUTE = 2, + TIME = 3 + # struct types ctypedef struct ColumnSchema: diff --git a/python/tsfile/tsfile_py_cpp.pxd b/python/tsfile/tsfile_py_cpp.pxd index 9ce2f90da..2389aa9a6 100644 --- a/python/tsfile/tsfile_py_cpp.pxd +++ b/python/tsfile/tsfile_py_cpp.pxd @@ -33,7 +33,7 @@ cdef public api DeviceSchema* to_c_device_schema(object py_schema) cdef public api ColumnSchema* to_c_column_schema(object py_schema) cdef public api TableSchema* to_c_table_schema(object py_schema) cdef public api Tablet to_c_tablet(object tablet) -cdef public api Tablet dataframe_to_c_tablet(object target_name, object dataframe) +cdef public api Tablet dataframe_to_c_tablet(object target_name, object dataframe, object table_schema) cdef public api TsRecord to_c_record(object row_record) cdef public api void free_c_table_schema(TableSchema* c_schema) cdef public api void free_c_column_schema(ColumnSchema* c_schema) diff --git a/python/tsfile/tsfile_py_cpp.pyx b/python/tsfile/tsfile_py_cpp.pyx index 851346853..b8bd73d0b 100644 --- a/python/tsfile/tsfile_py_cpp.pyx +++ b/python/tsfile/tsfile_py_cpp.pyx @@ -16,6 +16,7 @@ # under the License. # #cython: language_level=3 +from datetime import date as date_type from .date_utils import parse_date_to_int from .tsfile_cpp cimport * @@ -29,7 +30,7 @@ from cpython.exc cimport PyErr_SetObject from cpython.unicode cimport PyUnicode_AsUTF8String, PyUnicode_AsUTF8, PyUnicode_AsUTF8AndSize from cpython.bytes cimport PyBytes_AsString, PyBytes_AsStringAndSize -from tsfile.exceptions import ERROR_MAPPING +from tsfile.exceptions import ERROR_MAPPING, TypeMismatchError from tsfile.schema import ResultSetMetaData as ResultSetMetaDataPy from tsfile.schema import TSDataType as TSDataTypePy, TSEncoding as TSEncodingPy from tsfile.schema import Compressor as CompressorPy, ColumnCategory as CategoryPy @@ -133,7 +134,9 @@ cdef dict COMPRESSION_TYPE_MAP = { cdef dict CATEGORY_MAP = { CategoryPy.TAG: ColumnCategory.TAG, - CategoryPy.FIELD: ColumnCategory.FIELD + CategoryPy.FIELD: ColumnCategory.FIELD, + CategoryPy.ATTRIBUTE: ColumnCategory.ATTRIBUTE, + CategoryPy.TIME: ColumnCategory.TIME } cdef TSDataType to_c_data_type(object data_type): @@ -321,7 +324,7 @@ cdef TSDataType check_string_or_blob(TSDataType ts_data_type, object dtype, obje return TS_DATATYPE_BLOB return ts_data_type -cdef Tablet dataframe_to_c_tablet(object target_name, object dataframe): +cdef Tablet dataframe_to_c_tablet(object target_name, object dataframe, object table_schema): cdef Tablet ctablet cdef int max_row_num cdef TSDataType data_type @@ -342,17 +345,12 @@ cdef Tablet dataframe_to_c_tablet(object target_name, object dataframe): device_id_c = device_id_bytes df_columns = list(dataframe.columns) use_id_as_time = False - time_column_name = None - for col in df_columns: - if col.lower() == 'time': - time_column_name = col - break + time_column = table_schema.get_time_column() + use_id_as_time = time_column is None + time_column_name = None if time_column is None else time_column.get_column_name() - if time_column_name is None: - use_id_as_time = True - - data_columns = [col for col in df_columns if col.lower() != 'time'] + data_columns = [col for col in df_columns if col != time_column_name] column_num = len(data_columns) if column_num == 0: @@ -361,11 +359,9 @@ cdef Tablet dataframe_to_c_tablet(object target_name, object dataframe): max_row_num = len(dataframe) column_types_list = [] - for col_name in data_columns: - pandas_dtype = dataframe[col_name].dtype - ds_type = pandas_dtype_to_ts_data_type(pandas_dtype) - ds_type = check_string_or_blob(ds_type, pandas_dtype, dataframe[col_name]) - column_types_list.append(ds_type) + for column in data_columns: + data_type = table_schema.get_column(column).get_data_type() + column_types_list.append(data_type) columns_names = malloc(sizeof(char *) * column_num) columns_types = malloc(sizeof(TSDataType) * column_num) @@ -390,7 +386,7 @@ cdef Tablet dataframe_to_c_tablet(object target_name, object dataframe): timestamp = timestamp_py tablet_add_timestamp(ctablet, row, timestamp) else: - time_values = dataframe[time_column_name].values + time_values = dataframe[time_column.get_column_name()].values for row in range(max_row_num): timestamp_py = time_values[row] if pd.isna(timestamp_py): @@ -403,6 +399,31 @@ cdef Tablet dataframe_to_c_tablet(object target_name, object dataframe): data_type = column_types_list[col] column_values = dataframe[col_name].values + # Per-column validation for object types (check first non-null value only) + if data_type in (TS_DATATYPE_DATE, TS_DATATYPE_STRING, TS_DATATYPE_TEXT, TS_DATATYPE_BLOB): + col_series = dataframe[col_name] + first_valid_idx = col_series.first_valid_index() + if first_valid_idx is not None: + value = col_series[first_valid_idx] + if data_type == TS_DATATYPE_DATE: + if not isinstance(value, date_type): + raise TypeMismatchError(context= + f"Column '{col_name}': expected DATE (datetime.date), " + f"got {type(value).__name__}: {value!r}" + ) + elif data_type in (TS_DATATYPE_STRING, TS_DATATYPE_TEXT): + if not isinstance(value, str): + raise TypeMismatchError(context= + f"Column '{col_name}': expected STRING/TEXT, " + f"got {type(value).__name__}: {value!r}" + ) + elif data_type == TS_DATATYPE_BLOB: + if not isinstance(value, bytes): + raise TypeMismatchError(context= + f"Column '{col_name}': expected BLOB (bytes or bytearray), " + f"got {type(value).__name__}: {value!r}" + ) + # BOOLEAN if data_type == TS_DATATYPE_BOOLEAN: for row in range(max_row_num): @@ -433,13 +454,13 @@ cdef Tablet dataframe_to_c_tablet(object target_name, object dataframe): value = column_values[row] if not pd.isna(value): tablet_add_value_by_index_double(ctablet, row, col, value) - # DATE + # DATE (validated per-column above) elif data_type == TS_DATATYPE_DATE: for row in range(max_row_num): value = column_values[row] if not pd.isna(value): tablet_add_value_by_index_int32_t(ctablet, row, col, parse_date_to_int(value)) - # STRING or TEXT + # STRING or TEXT (validated per-column above) elif data_type == TS_DATATYPE_STRING or data_type == TS_DATATYPE_TEXT: for row in range(max_row_num): value = column_values[row] @@ -447,7 +468,7 @@ cdef Tablet dataframe_to_c_tablet(object target_name, object dataframe): py_value = str(value) str_ptr = PyUnicode_AsUTF8AndSize(py_value, &raw_len) tablet_add_value_by_index_string_with_len(ctablet, row, col, str_ptr, raw_len) - # BLOB + # BLOB (validated per-column above) elif data_type == TS_DATATYPE_BLOB: for row in range(max_row_num): value = column_values[row] diff --git a/python/tsfile/tsfile_reader.pyx b/python/tsfile/tsfile_reader.pyx index 359492d6f..041764f91 100644 --- a/python/tsfile/tsfile_reader.pyx +++ b/python/tsfile/tsfile_reader.pyx @@ -24,6 +24,8 @@ from typing import List import pandas as pd from libc.stdint cimport INT64_MIN, INT64_MAX +from libc.string cimport strlen +from cpython.bytes cimport PyBytes_FromStringAndSize from tsfile.schema import TSDataType as TSDataTypePy from .date_utils import parse_int_to_date @@ -166,7 +168,7 @@ cdef class ResultSetPy: return tsfile_result_set_get_value_by_index_double(self.result, index) elif data_type == TSDataTypePy.BOOLEAN: return tsfile_result_set_get_value_by_index_bool(self.result, index) - elif data_type == TSDataTypePy.STRING or data_type == TSDataTypePy.TEXT or data_type == TSDataTypePy.BLOB: + elif data_type == TSDataTypePy.STRING or data_type == TSDataTypePy.TEXT: try: string = tsfile_result_set_get_value_by_index_string(self.result, index) if string == NULL: @@ -174,6 +176,14 @@ cdef class ResultSetPy: return string.decode('utf-8') finally: pass + elif data_type == TSDataTypePy.BLOB: + try: + string = tsfile_result_set_get_value_by_index_string(self.result, index) + if string == NULL: + return None + return PyBytes_FromStringAndSize(string, strlen(string)) + finally: + pass def get_value_by_name(self, column_name : str): """ diff --git a/python/tsfile/tsfile_table_writer.py b/python/tsfile/tsfile_table_writer.py index 5b33f9b2f..c11b78594 100644 --- a/python/tsfile/tsfile_table_writer.py +++ b/python/tsfile/tsfile_table_writer.py @@ -17,10 +17,44 @@ # import pandas as pd -from tsfile import TableSchema, Tablet, TableNotExistError -from tsfile import TsFileWriter, ColumnCategory +from tsfile import TableSchema, Tablet, TableNotExistError, ColumnCategory +from tsfile import TsFileWriter, ColumnSchema from tsfile.constants import TSDataType -from tsfile.exceptions import ColumnNotExistError, TypeMismatchError +from tsfile.exceptions import TypeMismatchError, ColumnNotExistError + + +def validate_dataframe_for_tsfile(df: pd.DataFrame) -> None: + if df is None or df.empty: + raise ValueError("DataFrame cannot be None or empty") + + columns = list(df.columns) + + seen = set() + duplicates = [] + for c in columns: + lower = c.lower() + if lower in seen: + duplicates.append(c) + seen.add(lower) + if duplicates: + raise ValueError( + f"Column names must be unique (case-insensitive). Duplicate columns: {duplicates}" + ) + + unsupported = [] + for col in columns: + dtype = df[col].dtype + try: + TSDataType.from_pandas_datatype(dtype) + except (ValueError, TypeError) as e: + unsupported.append((col, str(dtype), str(e))) + + if unsupported: + msg_parts = [f" - {col}: dtype={dtype}" for col, dtype in unsupported] + raise ValueError( + "Data types not supported by tsfile:\n" + "\n".join(msg_parts) + ) + def check_string_or_blob(ts_data_type: TSDataType, dtype, column_series: pd.Series) -> TSDataType: if ts_data_type == TSDataType.STRING and (dtype == 'object' or str(dtype) == ""): @@ -76,68 +110,65 @@ def write_dataframe(self, dataframe: pd.DataFrame): :raise: ColumnNotExistError if DataFrame columns don't match schema. :raise: TypeMismatchError if DataFrame column types are incompatible with schema. """ - if dataframe is None or dataframe.empty: - raise ValueError("DataFrame cannot be None or empty") - - # Create mapping from lowercase column name to original column name - df_column_name_map = {col.lower(): col for col in dataframe.columns if col.lower() != 'time'} - df_columns = list(df_column_name_map.keys()) - schema_column_names = set(self.tableSchema.get_column_names()) - df_columns_set = set(df_columns) + validate_dataframe_for_tsfile(dataframe) + + # rename columns to lowercase + dataframe = dataframe.rename(columns=str.lower) + time_column = self.tableSchema.get_time_column() + # tag columns used for sorting + tag_columns = self.tableSchema.get_tag_columns() + if time_column is None: + if 'time' in dataframe.columns: + dtype = TSDataType.from_pandas_datatype(dataframe['time'].dtype) + if not TSDataType.TIMESTAMP.is_compatible_with(dtype): + raise TypeMismatchError( + code=27, + context=f"time column require INT/Timestamp" + ) + + self.tableSchema.add_column(ColumnSchema("time", + TSDataType.TIMESTAMP, + ColumnCategory.TIME)) + time_column = self.tableSchema.get_time_column() - extra_columns = df_columns_set - schema_column_names - if extra_columns: - raise ColumnNotExistError( - code=50, - context=f"DataFrame has columns not in schema: {', '.join(sorted(extra_columns))}" - ) - - schema_column_map = { - col.get_column_name(): col for col in self.tableSchema.get_columns() - } - type_mismatches = [] - for col_name in df_columns: - df_col_name_original = df_column_name_map[col_name] - - df_dtype = dataframe[df_col_name_original].dtype - df_ts_type = TSDataType.from_pandas_datatype(df_dtype) - df_ts_type = check_string_or_blob(df_ts_type, df_dtype, dataframe[df_col_name_original]) - - schema_col = schema_column_map[col_name] - expected_ts_type = schema_col.get_data_type() - - if df_ts_type != expected_ts_type: - type_mismatches.append( - f"Column '{col_name}': expected {expected_ts_type.name}, got {df_ts_type.name}" - ) - + for col_name in dataframe.columns: + if time_column is not None and col_name == time_column.get_column_name(): + continue + schema_col = self.tableSchema.get_column(col_name) + if schema_col is None: + raise ColumnNotExistError(context=f"{col_name} is not define in table schema") + # Object dtype can represent STRING, DATE, TEXT, BLOB; validation will be performed during insert, skip here + if schema_col.get_data_type() in [TSDataType.INT64, TSDataType.INT32, TSDataType.DOUBLE, TSDataType.FLOAT, + TSDataType.BOOLEAN, TSDataType.TIMESTAMP]: + df_dtype = dataframe[col_name].dtype + df_ts_type = TSDataType.from_pandas_datatype(df_dtype) + expected_ts_type = schema_col.get_data_type() + + if not expected_ts_type.is_compatible_with(df_ts_type): + type_mismatches.append( + f"Column '{col_name}': expected {expected_ts_type.name}, got {df_ts_type.name}" + ) + if type_mismatches: raise TypeMismatchError( code=27, context=f"Type mismatches: {'; '.join(type_mismatches)}" ) - tag_columns = [] - for col in self.tableSchema.get_columns(): - if col.get_category() == ColumnCategory.TAG: - tag_col_name = col.get_column_name() - if tag_col_name in df_column_name_map: - tag_columns.append(df_column_name_map[tag_col_name]) - - time_column = None - for col in dataframe.columns: - if col.lower() == 'time': - time_column = col - break - if time_column: - sort_by = tag_columns.copy() - sort_by.append(time_column) + time_column_name = time_column.get_column_name() + time_series = dataframe[time_column_name] + if time_series.isna().any(): + raise ValueError( + f"Time column '{time_column}' must not contain null/NaN values" + ) + sort_by = [column.get_column_name() for column in tag_columns] + sort_by.append(time_column_name) dataframe = dataframe.sort_values(by=sort_by) - self.writer.write_dataframe(self.tableSchema.get_table_name(), dataframe) + self.writer.write_dataframe(self.tableSchema.get_table_name(), dataframe, self.tableSchema) def close(self): """ diff --git a/python/tsfile/tsfile_writer.pyx b/python/tsfile/tsfile_writer.pyx index c558984e1..4826ef72d 100644 --- a/python/tsfile/tsfile_writer.pyx +++ b/python/tsfile/tsfile_writer.pyx @@ -86,8 +86,8 @@ cdef class TsFileWriterPy: finally: free_c_tablet(ctablet) - def write_dataframe(self, target_table: str, dataframe: pandas.DataFrame): - cdef Tablet ctablet = dataframe_to_c_tablet(target_table, dataframe) + def write_dataframe(self, target_table: str, dataframe: pandas.DataFrame, tableschema: TableSchemaPy): + cdef Tablet ctablet = dataframe_to_c_tablet(target_table, dataframe, tableschema) cdef ErrorCode errno try: errno = _tsfile_writer_write_table(self.writer, ctablet) From 18971c85b460f888bda793be649c4e9479328ddd Mon Sep 17 00:00:00 2001 From: ColinLee Date: Mon, 9 Feb 2026 03:03:10 +0800 Subject: [PATCH 04/13] tmp code. --- cpp/src/cwrapper/tsfile_cwrapper.cc | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/cpp/src/cwrapper/tsfile_cwrapper.cc b/cpp/src/cwrapper/tsfile_cwrapper.cc index 7c22ccd5c..5bba87b9a 100644 --- a/cpp/src/cwrapper/tsfile_cwrapper.cc +++ b/cpp/src/cwrapper/tsfile_cwrapper.cc @@ -116,6 +116,10 @@ TsFileWriter tsfile_writer_new(WriteFile file, TableSchema* schema, *err_code = common::E_INVALID_SCHEMA; return nullptr; } + // Ignore time column definition. + if (cur_schema.column_category == TIME) { + continue; + } column_schemas.emplace_back( cur_schema.column_name, From 1b3c27552605016f989c4292381221059131769a Mon Sep 17 00:00:00 2001 From: ColinLee Date: Mon, 9 Feb 2026 04:01:16 +0800 Subject: [PATCH 05/13] tmp code. --- cpp/src/cwrapper/tsfile_cwrapper.cc | 3 + python/tests/test_dataframe.py | 2 - python/tests/test_to_tsfile.py | 213 ++++++++++++++------------- python/tsfile/tsfile_py_cpp.pyx | 11 -- python/tsfile/tsfile_table_writer.py | 25 ++-- python/tsfile/utils.py | 86 +++++------ 6 files changed, 166 insertions(+), 174 deletions(-) diff --git a/cpp/src/cwrapper/tsfile_cwrapper.cc b/cpp/src/cwrapper/tsfile_cwrapper.cc index 5bba87b9a..f384698ba 100644 --- a/cpp/src/cwrapper/tsfile_cwrapper.cc +++ b/cpp/src/cwrapper/tsfile_cwrapper.cc @@ -691,6 +691,9 @@ ERRNO _tsfile_writer_register_table(TsFileWriter writer, TableSchema* schema) { measurement_schemas.resize(schema->column_num); for (int i = 0; i < schema->column_num; i++) { ColumnSchema* cur_schema = schema->column_schemas + i; + if (cur_schema->column_category == TIME) { + continue; + } measurement_schemas[i] = new storage::MeasurementSchema( cur_schema->column_name, static_cast(cur_schema->data_type)); diff --git a/python/tests/test_dataframe.py b/python/tests/test_dataframe.py index e3c923a3c..09d0001ba 100644 --- a/python/tests/test_dataframe.py +++ b/python/tests/test_dataframe.py @@ -318,5 +318,3 @@ def test_write_dataframe_empty(): finally: if os.path.exists(tsfile_path): os.remove(tsfile_path) - - diff --git a/python/tests/test_to_tsfile.py b/python/tests/test_to_tsfile.py index 7c1fb84ca..c3a970e3c 100644 --- a/python/tests/test_to_tsfile.py +++ b/python/tests/test_to_tsfile.py @@ -16,12 +16,13 @@ # under the License. # import os +from datetime import date import numpy as np import pandas as pd import pytest -from tsfile import to_dataframe +from tsfile import to_dataframe, TsFileReader, ColumnCategory from tsfile.utils import dataframe_to_tsfile @@ -71,6 +72,22 @@ def test_dataframe_to_tsfile_basic(): os.remove(tsfile_path) +def test_dataframe_to_tsfile_default_table_name(): + tsfile_path = "test_dataframe_to_tsfile_default.tsfile" + try: + if os.path.exists(tsfile_path): + os.remove(tsfile_path) + + df = pd.DataFrame({'time': [0, 1], 'value': [1.0, 2.0]}) + dataframe_to_tsfile(df, tsfile_path) + + df_read = to_dataframe(tsfile_path, table_name="default_table") + assert len(df_read) == 2 + finally: + if os.path.exists(tsfile_path): + os.remove(tsfile_path) + + def test_dataframe_to_tsfile_with_index(): tsfile_path = "test_dataframe_to_tsfile_index.tsfile" try: @@ -78,23 +95,23 @@ def test_dataframe_to_tsfile_with_index(): os.remove(tsfile_path) df = pd.DataFrame({ - 'device': [f"device{i}" for i in range(50)], - 'value': [i * 2.5 for i in range(50)] + 'device': [f"device{i}" for i in range(30)], + 'value': [i * 2.0 for i in range(30)] }) - df.index = [i * 10 for i in range(50)] - + df.index = [i * 100 for i in range(30)] dataframe_to_tsfile(df, tsfile_path, table_name="test_table") df_read = to_dataframe(tsfile_path, table_name="test_table") df_read = df_read.sort_values('time').reset_index(drop=True) - df_sorted = df.sort_index() - df_sorted = convert_to_nullable_types(df_sorted.reset_index(drop=True)) - time_series = pd.Series(df.sort_index().index.values, dtype='Int64') + time_expected = pd.Series(df.index.values, dtype='Int64') + assert df_read.shape == (30, 3) + assert df_read["time"].equals(time_expected) - assert df_read.shape == (50, 3) - assert df_read["time"].equals(time_series) - assert df_read["device"].equals(df_sorted["device"]) - assert df_read["value"].equals(df_sorted["value"]) + with TsFileReader(tsfile_path) as reader: + table_schema = reader.get_table_schema("test_table") + device_col = table_schema.get_column("device") + assert device_col is not None + assert device_col.get_category() == ColumnCategory.FIELD finally: if os.path.exists(tsfile_path): os.remove(tsfile_path) @@ -127,6 +144,27 @@ def test_dataframe_to_tsfile_custom_time_column(): os.remove(tsfile_path) +def test_dataframe_to_tsfile_case_insensitive_time(): + tsfile_path = "test_dataframe_to_tsfile_case_time.tsfile" + try: + if os.path.exists(tsfile_path): + os.remove(tsfile_path) + + df = pd.DataFrame({ + 'Time': [i for i in range(20)], + 'value': [i * 2.0 for i in range(20)] + }) + + dataframe_to_tsfile(df, tsfile_path, table_name="test_table") + + df_read = to_dataframe(tsfile_path, table_name="test_table") + assert df_read.shape == (20, 2) + assert df_read["time"].equals(pd.Series([i for i in range(20)], dtype='Int64')) + finally: + if os.path.exists(tsfile_path): + os.remove(tsfile_path) + + def test_dataframe_to_tsfile_with_tag_columns(): tsfile_path = "test_dataframe_to_tsfile_tags.tsfile" try: @@ -155,6 +193,34 @@ def test_dataframe_to_tsfile_with_tag_columns(): os.remove(tsfile_path) +def test_dataframe_to_tsfile_tag_time_unsorted(): + tsfile_path = "test_dataframe_to_tsfile_tag_time_unsorted.tsfile" + try: + if os.path.exists(tsfile_path): + os.remove(tsfile_path) + + df = pd.DataFrame({ + 'time': [30, 10, 20, 50, 40, 15, 25, 35, 5, 45], + 'device': ['device1', 'device1', 'device1', 'device2', 'device2', 'device1', 'device1', 'device2', + 'device1', 'device2'], + 'value': [i * 1.5 for i in range(10)] + }) + + dataframe_to_tsfile(df, tsfile_path, table_name="test_table", tag_column=["device"]) + + df_read = to_dataframe(tsfile_path, table_name="test_table") + df_expected = df.sort_values(by=['device', 'time']).reset_index(drop=True) + df_expected = convert_to_nullable_types(df_expected) + + assert df_read.shape == (10, 3) + assert df_read["device"].equals(df_expected["device"]) + assert df_read["time"].equals(df_expected["time"]) + assert df_read["value"].equals(df_expected["value"]) + finally: + if os.path.exists(tsfile_path): + os.remove(tsfile_path) + + def test_dataframe_to_tsfile_all_datatypes(): tsfile_path = "test_dataframe_to_tsfile_all_types.tsfile" try: @@ -169,7 +235,10 @@ def test_dataframe_to_tsfile_all_datatypes(): 'float_col': pd.Series([i * 1.5 for i in range(50)], dtype='float32'), 'double_col': [i * 2.5 for i in range(50)], 'string_col': [f"str{i}" for i in range(50)], - 'blob_col': [f"blob{i}".encode('utf-8') for i in range(50)] + 'blob_col': [f"blob{i}".encode('utf-8') for i in range(50)], + 'text_col': [f"text{i}" for i in range(50)], + 'date_col': [date(2025, i % 11 + 1, i % 20 + 1) for i in range(50)], + 'timestamp_col': [i for i in range(50)] }) dataframe_to_tsfile(df, tsfile_path, table_name="test_table") @@ -178,13 +247,16 @@ def test_dataframe_to_tsfile_all_datatypes(): df_read = df_read.sort_values('time').reset_index(drop=True) df_sorted = convert_to_nullable_types(df.sort_values('time').reset_index(drop=True)) - assert df_read.shape == (50, 8) + assert df_read.shape == (50, 11) assert df_read["bool_col"].equals(df_sorted["bool_col"]) assert df_read["int32_col"].equals(df_sorted["int32_col"]) assert df_read["int64_col"].equals(df_sorted["int64_col"]) assert np.allclose(df_read["float_col"], df_sorted["float_col"]) assert np.allclose(df_read["double_col"], df_sorted["double_col"]) assert df_read["string_col"].equals(df_sorted["string_col"]) + assert df_read["text_col"].equals(df_sorted["text_col"]) + assert df_read["date_col"].equals(df_sorted["date_col"]) + assert df_read["timestamp_col"].equals(df_sorted["timestamp_col"]) for i in range(50): assert df_read["blob_col"].iloc[i] == df_sorted["blob_col"].iloc[i] finally: @@ -192,47 +264,6 @@ def test_dataframe_to_tsfile_all_datatypes(): os.remove(tsfile_path) -def test_dataframe_to_tsfile_default_table_name(): - tsfile_path = "test_dataframe_to_tsfile_default_name.tsfile" - try: - if os.path.exists(tsfile_path): - os.remove(tsfile_path) - - df = pd.DataFrame({ - 'time': [i for i in range(10)], - 'value': [i * 1.0 for i in range(10)] - }) - - dataframe_to_tsfile(df, tsfile_path) - - df_read = to_dataframe(tsfile_path, table_name="test_dataframe_to_tsfile_default_name") - assert df_read.shape == (10, 2) - finally: - if os.path.exists(tsfile_path): - os.remove(tsfile_path) - - -def test_dataframe_to_tsfile_case_insensitive_time(): - tsfile_path = "test_dataframe_to_tsfile_case_time.tsfile" - try: - if os.path.exists(tsfile_path): - os.remove(tsfile_path) - - df = pd.DataFrame({ - 'Time': [i for i in range(20)], - 'value': [i * 2.0 for i in range(20)] - }) - - dataframe_to_tsfile(df, tsfile_path, table_name="test_table") - - df_read = to_dataframe(tsfile_path, table_name="test_table") - assert df_read.shape == (20, 2) - assert df_read["time"].equals(pd.Series([i for i in range(20)], dtype='Int64')) - finally: - if os.path.exists(tsfile_path): - os.remove(tsfile_path) - - def test_dataframe_to_tsfile_empty_dataframe(): tsfile_path = "test_dataframe_to_tsfile_empty.tsfile" try: @@ -265,6 +296,20 @@ def test_dataframe_to_tsfile_no_data_columns(): os.remove(tsfile_path) +def test_dataframe_to_tsfile_time_column_not_found(): + tsfile_path = "test_dataframe_to_tsfile_time_err.tsfile" + try: + if os.path.exists(tsfile_path): + os.remove(tsfile_path) + + df = pd.DataFrame({'time': [0, 1], 'value': [1.0, 2.0]}) + with pytest.raises(ValueError, match="Time column 'timestamp' not found"): + dataframe_to_tsfile(df, tsfile_path, time_column="timestamp") + finally: + if os.path.exists(tsfile_path): + os.remove(tsfile_path) + + def test_dataframe_to_tsfile_invalid_time_column(): tsfile_path = "test_dataframe_to_tsfile_invalid_time.tsfile" try: @@ -301,17 +346,13 @@ def test_dataframe_to_tsfile_non_integer_time_column(): os.remove(tsfile_path) -def test_dataframe_to_tsfile_invalid_tag_column(): - tsfile_path = "test_dataframe_to_tsfile_invalid_tag.tsfile" +def test_dataframe_to_tsfile_tag_column_not_found(): + tsfile_path = "test_dataframe_to_tsfile_tag_err.tsfile" try: if os.path.exists(tsfile_path): os.remove(tsfile_path) - df = pd.DataFrame({ - 'time': [i for i in range(10)], - 'value': [i * 1.0 for i in range(10)] - }) - + df = pd.DataFrame({'time': [0, 1], 'device': ['a', 'b'], 'value': [1.0, 2.0]}) with pytest.raises(ValueError, match="Tag column 'invalid' not found"): dataframe_to_tsfile(df, tsfile_path, tag_column=["invalid"]) finally: @@ -319,55 +360,19 @@ def test_dataframe_to_tsfile_invalid_tag_column(): os.remove(tsfile_path) -def test_dataframe_to_tsfile_string_vs_blob(): - tsfile_path = "test_dataframe_to_tsfile_string_blob.tsfile" - try: - if os.path.exists(tsfile_path): - os.remove(tsfile_path) - - df = pd.DataFrame({ - 'time': [i for i in range(20)], - 'string_col': [f"str{i}" for i in range(20)], - 'blob_col': [f"blob{i}".encode('utf-8') for i in range(20)] - }) - - dataframe_to_tsfile(df, tsfile_path, table_name="test_table") - - df_read = to_dataframe(tsfile_path, table_name="test_table") - df_read = df_read.sort_values('time').reset_index(drop=True) - df_sorted = convert_to_nullable_types(df.sort_values('time').reset_index(drop=True)) - - assert df_read["string_col"].equals(df_sorted["string_col"]) - for i in range(20): - assert df_read["blob_col"].iloc[i] == df_sorted["blob_col"].iloc[i] - finally: - if os.path.exists(tsfile_path): - os.remove(tsfile_path) - - -def test_dataframe_to_tsfile_tag_time_unsorted(): - tsfile_path = "test_dataframe_to_tsfile_tag_time_unsorted.tsfile" +def test_dataframe_to_tsfile_invalid_tag_column(): + tsfile_path = "test_dataframe_to_tsfile_invalid_tag.tsfile" try: if os.path.exists(tsfile_path): os.remove(tsfile_path) df = pd.DataFrame({ - 'time': [30, 10, 20, 50, 40, 15, 25, 35, 5, 45], - 'device': ['device1', 'device1', 'device1', 'device2', 'device2', 'device1', 'device1', 'device2', - 'device1', 'device2'], - 'value': [i * 1.5 for i in range(10)] + 'time': [i for i in range(10)], + 'value': [i * 1.0 for i in range(10)] }) - dataframe_to_tsfile(df, tsfile_path, table_name="test_table", tag_column=["device"]) - - df_read = to_dataframe(tsfile_path, table_name="test_table") - df_expected = df.sort_values(by=['device', 'time']).reset_index(drop=True) - df_expected = convert_to_nullable_types(df_expected) - - assert df_read.shape == (10, 3) - assert df_read["device"].equals(df_expected["device"]) - assert df_read["time"].equals(df_expected["time"]) - assert df_read["value"].equals(df_expected["value"]) + with pytest.raises(ValueError, match="Tag column 'invalid' not found"): + dataframe_to_tsfile(df, tsfile_path, tag_column=["invalid"]) finally: if os.path.exists(tsfile_path): os.remove(tsfile_path) diff --git a/python/tsfile/tsfile_py_cpp.pyx b/python/tsfile/tsfile_py_cpp.pyx index b8bd73d0b..98b28673c 100644 --- a/python/tsfile/tsfile_py_cpp.pyx +++ b/python/tsfile/tsfile_py_cpp.pyx @@ -313,17 +313,6 @@ cdef Tablet to_c_tablet(object tablet): cdef TSDataType pandas_dtype_to_ts_data_type(object dtype): return to_c_data_type(TSDataTypePy.from_pandas_datatype(dtype)) -cdef TSDataType check_string_or_blob(TSDataType ts_data_type, object dtype, object column_series): - if ts_data_type == TS_DATATYPE_STRING: - dtype_str = str(dtype) - if dtype == 'object' or dtype_str == "": - first_valid_idx = column_series.first_valid_index() - if first_valid_idx is not None: - first_value = column_series[first_valid_idx] - if isinstance(first_value, bytes): - return TS_DATATYPE_BLOB - return ts_data_type - cdef Tablet dataframe_to_c_tablet(object target_name, object dataframe, object table_schema): cdef Tablet ctablet cdef int max_row_num diff --git a/python/tsfile/tsfile_table_writer.py b/python/tsfile/tsfile_table_writer.py index c11b78594..1561d7c61 100644 --- a/python/tsfile/tsfile_table_writer.py +++ b/python/tsfile/tsfile_table_writer.py @@ -15,6 +15,8 @@ # specific language governing permissions and limitations # under the License. # +from datetime import date, datetime + import pandas as pd from tsfile import TableSchema, Tablet, TableNotExistError, ColumnCategory @@ -56,15 +58,20 @@ def validate_dataframe_for_tsfile(df: pd.DataFrame) -> None: ) -def check_string_or_blob(ts_data_type: TSDataType, dtype, column_series: pd.Series) -> TSDataType: - if ts_data_type == TSDataType.STRING and (dtype == 'object' or str(dtype) == ""): - first_valid_idx = column_series.first_valid_index() - if first_valid_idx is not None: - first_value = column_series[first_valid_idx] - if isinstance(first_value, bytes): - return TSDataType.BLOB - return ts_data_type - +def infer_object_column_type(column_series: pd.Series) -> TSDataType: + first_valid_idx = column_series.first_valid_index() + if first_valid_idx is None: + return TSDataType.STRING + value = column_series[first_valid_idx] + if isinstance(value, (bytes, bytearray)): + return TSDataType.BLOB + if isinstance(value, (date, datetime)): + return TSDataType.DATE + if isinstance(value, str): + return TSDataType.STRING + raise TypeError( + f"Cannot infer type from object column: expected str/bytes/date, got {type(value).__name__}: {value!r}" + ) class TsFileTableWriter: """ diff --git a/python/tsfile/utils.py b/python/tsfile/utils.py index 567c4fe19..1bb7aa1d3 100644 --- a/python/tsfile/utils.py +++ b/python/tsfile/utils.py @@ -15,18 +15,17 @@ # specific language governing permissions and limitations # under the License. # -from pathlib import Path from typing import Iterator, Union from typing import Optional import numpy as np import pandas as pd -from pandas.core.dtypes.common import is_integer_dtype +from pandas.core.dtypes.common import is_integer_dtype, is_object_dtype from tsfile import ColumnSchema, TableSchema, ColumnCategory, TSDataType from tsfile.exceptions import TableNotExistError, ColumnNotExistError from tsfile.tsfile_reader import TsFileReaderPy -from tsfile.tsfile_table_writer import TsFileTableWriter, check_string_or_blob +from tsfile.tsfile_table_writer import TsFileTableWriter, infer_object_column_type, validate_dataframe_for_tsfile def to_dataframe(file_path: str, @@ -189,7 +188,7 @@ def dataframe_to_tsfile(dataframe: pd.DataFrame, Path to the TsFile to write. Will be created if it doesn't exist. table_name : Optional[str], default None - Name of the table. If None, defaults to tsfile file name. + Name of the table. If None, defaults to tsfile file name (without extension). time_column : Optional[str], default None Name of the time column. If None, will look for a column named 'time' (case-insensitive), @@ -208,62 +207,53 @@ def dataframe_to_tsfile(dataframe: pd.DataFrame, ValueError If the DataFrame is empty or has no data columns. """ - if dataframe is None or dataframe.empty: - raise ValueError("DataFrame cannot be None or empty") + validate_dataframe_for_tsfile(dataframe) + df = dataframe.rename(columns=str.lower) - if table_name is None: - filename = Path(file_path).stem - table_name = filename + if not table_name: + table_name = "default_table" - time_col_name = None if time_column is not None: - if time_column not in dataframe.columns: + if time_column.lower() not in df.columns: raise ValueError(f"Time column '{time_column}' not found in DataFrame") - if not is_integer_dtype(dataframe[time_column].dtype): - raise TypeError( - f"Time column '{time_column}' must be integer type (int64 or int), got {dataframe[time_column].dtype}") - time_col_name = time_column - else: - for col in dataframe.columns: - if col.lower() == 'time': - if is_integer_dtype(dataframe[col].dtype): - time_col_name = col - break - else: - raise TypeError( - f"Time column '{col}' must be integer type (int64 or int), got {dataframe[col].dtype}") - - data_columns = [col for col in dataframe.columns if col != time_col_name] - - if len(data_columns) == 0: - raise ValueError("DataFrame must have at least one data column besides the time column") - - tag_columns_lower = [] if tag_column is not None: for tag_col in tag_column: - if tag_col not in dataframe.columns: + if tag_col.lower() not in df.columns: raise ValueError(f"Tag column '{tag_col}' not found in DataFrame") - tag_columns_lower.append(tag_col.lower()) + tag_columns_lower = {t.lower() for t in (tag_column or [])} - column_schemas = [] - for col_name in data_columns: - col_dtype = dataframe[col_name].dtype - ts_data_type = TSDataType.from_pandas_datatype(col_dtype) - ts_data_type = check_string_or_blob(ts_data_type, col_dtype, dataframe[col_name]) + if time_column is not None: + time_col_name = time_column.lower() + elif 'time' in df.columns: + time_col_name = 'time' + else: + time_col_name = None + + if time_col_name is not None: + if not is_integer_dtype(df[time_col_name].dtype): + raise TypeError( + f"Time column '{time_col_name}' must be integer type (int64 or int), got {df[time_col_name].dtype}") - if col_name.lower() in tag_columns_lower: - category = ColumnCategory.TAG + column_schemas = [] + if time_col_name is not None: + column_schemas.append(ColumnSchema(time_col_name, TSDataType.TIMESTAMP, ColumnCategory.TIME)) + + for col in df.columns: + if col == time_col_name: + continue + col_dtype = df[col].dtype + if is_object_dtype(col_dtype): + ts_data_type = infer_object_column_type(df[col]) else: - category = ColumnCategory.FIELD + ts_data_type = TSDataType.from_pandas_datatype(col_dtype) - column_schemas.append(ColumnSchema(col_name, ts_data_type, category)) + category = ColumnCategory.TAG if col in tag_columns_lower else ColumnCategory.FIELD + column_schemas.append(ColumnSchema(col, ts_data_type, category)) - table_schema = TableSchema(table_name, column_schemas) + if len(column_schemas) == 0: + raise ValueError("DataFrame must have at least one data column besides the time column") - if time_col_name is not None and time_col_name != 'time': - df_to_write = dataframe.rename(columns={time_col_name: 'time'}) - else: - df_to_write = dataframe + table_schema = TableSchema(table_name, column_schemas) with TsFileTableWriter(file_path, table_schema) as writer: - writer.write_dataframe(df_to_write) + writer.write_dataframe(df) From 0f91284c5c7491d5b3f13bace3fd81ed40b1e3ea Mon Sep 17 00:00:00 2001 From: ColinLee Date: Mon, 9 Feb 2026 09:07:05 +0800 Subject: [PATCH 06/13] tmp code. --- python/tsfile/schema.py | 17 +++++++++++++++-- python/tsfile/tsfile_py_cpp.pyx | 3 --- python/tsfile/tsfile_table_writer.py | 3 ++- python/tsfile/utils.py | 2 +- 4 files changed, 18 insertions(+), 7 deletions(-) diff --git a/python/tsfile/schema.py b/python/tsfile/schema.py index 379307da5..298f5eecc 100644 --- a/python/tsfile/schema.py +++ b/python/tsfile/schema.py @@ -17,6 +17,7 @@ # from typing import List +from . import TypeMismatchError from .constants import TSDataType, ColumnCategory, TSEncoding, Compressor @@ -88,6 +89,8 @@ def __init__(self, column_name: str, data_type: TSDataType, category: ColumnCate if category == ColumnCategory.TIME and data_type not in [TSDataType.INT64, TSDataType.TIMESTAMP]: raise TypeError(f"Time Column should have type : INT64/Timestamp," f" but got {data_type}") + elif category == ColumnCategory.TAG and data_type not in [TSDataType.STRING, TSDataType.TEXT]: + raise TypeMismatchError(context="Tag column should be string or text") self.data_type = data_type self.category = category @@ -159,11 +162,21 @@ def get_tag_columns(self): if column.get_category() == ColumnCategory.TAG ] - def add_column(self, column: ColumnSchema): if column.get_category() == ColumnCategory.TIME: + if self.time_column is not None: + raise ValueError( + f"Table '{self.table_name}' cannot have multiple time columns: " + f"'{self.time_column.name}' and '{column.name}'" + ) self.time_column = column - self.columns.append(column) + else: + for col in self.columns: + if col.get_column_name() == column.get_column_name(): + raise ValueError( + f"Duplicate column name {col.get_column_name()}" + ) + self.columns.append(column) def __repr__(self) -> str: return f"TableSchema({self.table_name}, {self.columns})" diff --git a/python/tsfile/tsfile_py_cpp.pyx b/python/tsfile/tsfile_py_cpp.pyx index 98b28673c..3ca79a2a1 100644 --- a/python/tsfile/tsfile_py_cpp.pyx +++ b/python/tsfile/tsfile_py_cpp.pyx @@ -310,9 +310,6 @@ cdef Tablet to_c_tablet(object tablet): return ctablet -cdef TSDataType pandas_dtype_to_ts_data_type(object dtype): - return to_c_data_type(TSDataTypePy.from_pandas_datatype(dtype)) - cdef Tablet dataframe_to_c_tablet(object target_name, object dataframe, object table_schema): cdef Tablet ctablet cdef int max_row_num diff --git a/python/tsfile/tsfile_table_writer.py b/python/tsfile/tsfile_table_writer.py index 1561d7c61..0346fd522 100644 --- a/python/tsfile/tsfile_table_writer.py +++ b/python/tsfile/tsfile_table_writer.py @@ -73,6 +73,7 @@ def infer_object_column_type(column_series: pd.Series) -> TSDataType: f"Cannot infer type from object column: expected str/bytes/date, got {type(value).__name__}: {value!r}" ) + class TsFileTableWriter: """ Facilitates writing structured table data into a TsFile with a specified schema. @@ -84,7 +85,7 @@ class TsFileTableWriter: according to that schema, and serialize this data into a TsFile. """ - def __init__(self, path: str, table_schema: TableSchema, memory_threshold=128 * 1024 * 1024): + def __init__(self, path: str, table_schema: TableSchema, memory_threshold = 128 * 1024 * 1024): """ :param path: The path of tsfile, will create if it doesn't exist. :param table_schema: describes the schema of the tables they want to write. diff --git a/python/tsfile/utils.py b/python/tsfile/utils.py index 1bb7aa1d3..71e213462 100644 --- a/python/tsfile/utils.py +++ b/python/tsfile/utils.py @@ -188,7 +188,7 @@ def dataframe_to_tsfile(dataframe: pd.DataFrame, Path to the TsFile to write. Will be created if it doesn't exist. table_name : Optional[str], default None - Name of the table. If None, defaults to tsfile file name (without extension). + Name of the table. If None, defaults to "default_table". time_column : Optional[str], default None Name of the time column. If None, will look for a column named 'time' (case-insensitive), From e625b76fc9a7a897577659b42968da629f14c690 Mon Sep 17 00:00:00 2001 From: ColinLee Date: Mon, 9 Feb 2026 09:14:23 +0800 Subject: [PATCH 07/13] fix import error. --- python/tests/test_basic.py | 2 +- python/tsfile/schema.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/python/tests/test_basic.py b/python/tests/test_basic.py index 842a8fb44..675ef837f 100644 --- a/python/tests/test_basic.py +++ b/python/tests/test_basic.py @@ -17,7 +17,7 @@ # import numpy as np import pytest -from tsfile import schema, Field +from tsfile import Field from tsfile import Tablet from tsfile.constants import * from tsfile.schema import * diff --git a/python/tsfile/schema.py b/python/tsfile/schema.py index 298f5eecc..91732eee3 100644 --- a/python/tsfile/schema.py +++ b/python/tsfile/schema.py @@ -17,7 +17,7 @@ # from typing import List -from . import TypeMismatchError +from .exceptions import TypeMismatchError from .constants import TSDataType, ColumnCategory, TSEncoding, Compressor From 9e48483996ee585a64802fcdc5e65cc8442507ae Mon Sep 17 00:00:00 2001 From: ColinLee Date: Wed, 11 Feb 2026 00:25:38 +0800 Subject: [PATCH 08/13] tmp code. --- cpp/src/common/constant/tsfile_constant.h | 4 +-- cpp/src/common/global.cc | 2 +- cpp/src/cwrapper/tsfile_cwrapper.cc | 5 ---- cpp/src/utils/db_utils.h | 6 +++-- .../resources/table_with_time_column.tsfile | Bin 0 -> 644 bytes python/tests/test_dataframe.py | 24 +++++++++--------- python/tests/test_load_tsfile_from_iotdb.py | 23 ++++++++++++----- python/tests/test_to_tsfile.py | 12 ++++----- python/tests/test_write_and_read.py | 16 ++++++------ python/tsfile/constants.py | 18 ++++++------- python/tsfile/schema.py | 8 +++--- 11 files changed, 63 insertions(+), 55 deletions(-) create mode 100644 python/tests/resources/table_with_time_column.tsfile diff --git a/cpp/src/common/constant/tsfile_constant.h b/cpp/src/common/constant/tsfile_constant.h index d3f4dec1c..096c645ab 100644 --- a/cpp/src/common/constant/tsfile_constant.h +++ b/cpp/src/common/constant/tsfile_constant.h @@ -37,15 +37,15 @@ static const std::string BACK_QUOTE_STRING = "`"; static const std::string DOUBLE_BACK_QUOTE_STRING = "``"; static const unsigned char TIME_COLUMN_MASK = 0x80; +static const std::string TIME_COLUMN_NAME = "time"; static const unsigned char VALUE_COLUMN_MASK = 0x40; - -static const std::string TIME_COLUMN_ID = ""; static const int NO_STR_TO_READ = -1; static const std::regex IDENTIFIER_PATTERN("([a-zA-Z0-9_\\u2E80-\\u9FFF]+)"); static const std::regex NODE_NAME_PATTERN( "(\\*{0,2}[a-zA-Z0-9_\\u2E80-\\u9FFF]+\\*{0,2})"); static const int DEFAULT_SEGMENT_NUM_FOR_TABLE_NAME = 3; + } // namespace storage #endif diff --git a/cpp/src/common/global.cc b/cpp/src/common/global.cc index 37b8c1bb8..fd1d0132d 100644 --- a/cpp/src/common/global.cc +++ b/cpp/src/common/global.cc @@ -122,7 +122,7 @@ int init_common() { g_time_column_schema.data_type_ = INT64; g_time_column_schema.encoding_ = PLAIN; g_time_column_schema.compression_ = UNCOMPRESSED; - g_time_column_schema.column_name_ = std::string("time"); + g_time_column_schema.column_name_ = storage::TIME_COLUMN_NAME; return ret; } diff --git a/cpp/src/cwrapper/tsfile_cwrapper.cc b/cpp/src/cwrapper/tsfile_cwrapper.cc index f384698ba..539d5b968 100644 --- a/cpp/src/cwrapper/tsfile_cwrapper.cc +++ b/cpp/src/cwrapper/tsfile_cwrapper.cc @@ -116,11 +116,6 @@ TsFileWriter tsfile_writer_new(WriteFile file, TableSchema* schema, *err_code = common::E_INVALID_SCHEMA; return nullptr; } - // Ignore time column definition. - if (cur_schema.column_category == TIME) { - continue; - } - column_schemas.emplace_back( cur_schema.column_name, static_cast(cur_schema.data_type), diff --git a/cpp/src/utils/db_utils.h b/cpp/src/utils/db_utils.h index 85d99b1a3..5a1dea8db 100644 --- a/cpp/src/utils/db_utils.h +++ b/cpp/src/utils/db_utils.h @@ -37,6 +37,7 @@ namespace common { extern TSEncoding get_value_encoder(TSDataType data_type); extern CompressionType get_default_compressor(); +// TODO: remove this. typedef struct FileID { int64_t seq_; // timestamp when create int32_t version_; @@ -64,13 +65,14 @@ typedef struct FileID { #endif } FileID; +// TODO: remove this. typedef uint16_t NodeID; struct TsID { NodeID db_nid_; NodeID device_nid_; NodeID measurement_nid_; - TsID() : db_nid_(0), device_nid_(0), measurement_nid_(0){}; + TsID() : db_nid_(0), device_nid_(0), measurement_nid_(0) {}; TsID(NodeID db_nid, NodeID device_nid, NodeID measurement_nid) : db_nid_(db_nid), @@ -157,7 +159,7 @@ struct TsID { * This enumeration class defines the supported categories for columns within a * table schema, distinguishing between tag and field columns. */ -enum class ColumnCategory { TAG = 0, FIELD = 1 }; +enum class ColumnCategory { TAG = 0, FIELD = 1, ATTRIBUTE = 2, TIME = 3 }; /** * @brief Represents the schema information for a single column. diff --git a/python/tests/resources/table_with_time_column.tsfile b/python/tests/resources/table_with_time_column.tsfile new file mode 100644 index 0000000000000000000000000000000000000000..66be782aee311538fd7e485c467ee22827c3cb84 GIT binary patch literal 644 zcmWG3cFW93Wnp0ADM?JqNi|~2$xm)&FlS?DF*BXSz`!5@#9R!F3?C$}F)%RBabJ7r z!UkW51TF^0h8PAeMuQwd)`kf+0<4DwuBfuL9<)8>Dpr!3Taa3mSW;S)D#^^wAu9fX z4QPnu|Ns9RY|orI3uGUD;W%^V0#|{I(%j6H%#uoZ7IqF<*$psN_B&@XGCY_C#!`kTFOpol?5d}sb#4-Oa>xQu6JrBqZpJOQdyAdm0y&umspUPoRO+m fQk(`%UwTkky-+__Kj)yxfDl&~V3aY3fKwj;m+iBw literal 0 HcmV?d00001 diff --git a/python/tests/test_dataframe.py b/python/tests/test_dataframe.py index 09d0001ba..de49bc1ca 100644 --- a/python/tests/test_dataframe.py +++ b/python/tests/test_dataframe.py @@ -22,7 +22,7 @@ import pandas as pd import pytest -from tsfile import ColumnSchema, TableSchema, TSDataType +from tsfile import ColumnSchema, TableSchema, TSDataType, TIME_COLUMN from tsfile import TsFileTableWriter, ColumnCategory from tsfile import to_dataframe from tsfile.exceptions import ColumnNotExistError, TypeMismatchError @@ -70,10 +70,10 @@ def test_write_dataframe_basic(): writer.write_dataframe(df) df_read = to_dataframe(tsfile_path, table_name="test_table") - df_read = df_read.sort_values('time').reset_index(drop=True) + df_read = df_read.sort_values(TIME_COLUMN).reset_index(drop=True) df_sorted = convert_to_nullable_types(df.sort_values('time').reset_index(drop=True)) assert df_read.shape == (100, 4) - assert df_read["time"].equals(df_sorted["time"]) + assert df_read[TIME_COLUMN].equals(df_sorted["time"]) assert df_read["device"].equals(df_sorted["device"]) assert df_read["value"].equals(df_sorted["value"]) assert df_read["value2"].equals(df_sorted["value2"]) @@ -99,12 +99,12 @@ def test_write_dataframe_with_index(): df.index = [i * 10 for i in range(50)] # Set index as timestamps writer.write_dataframe(df) df_read = to_dataframe(tsfile_path, table_name="test_table") - df_read = df_read.sort_values('time').reset_index(drop=True) + df_read = df_read.sort_values(TIME_COLUMN).reset_index(drop=True) df_sorted = df.sort_index() df_sorted = convert_to_nullable_types(df_sorted.reset_index(drop=True)) time_series = pd.Series(df.sort_index().index.values, dtype='Int64') assert df_read.shape == (50, 3) - assert df_read["time"].equals(time_series) + assert df_read[TIME_COLUMN].equals(time_series) assert df_read["device"].equals(df_sorted["device"]) assert df_read["value"].equals(df_sorted["value"]) finally: @@ -130,10 +130,10 @@ def test_write_dataframe_case_insensitive(): writer.write_dataframe(df) df_read = to_dataframe(tsfile_path, table_name="test_table") - df_read = df_read.sort_values('time').reset_index(drop=True) + df_read = df_read.sort_values(TIME_COLUMN).reset_index(drop=True) df_sorted = convert_to_nullable_types(df.sort_values('Time').reset_index(drop=True)) assert df_read.shape == (30, 3) - assert df_read["time"].equals(df_sorted["Time"]) + assert df_read[TIME_COLUMN].equals(df_sorted["Time"]) assert df_read["device"].equals(df_sorted["Device"]) assert df_read["value"].equals(df_sorted["VALUE"]) finally: @@ -218,7 +218,7 @@ def test_write_dataframe_all_datatypes(): writer.write_dataframe(df) df_read = to_dataframe(tsfile_path, table_name="test_table") - df_read = df_read.sort_values('time').reset_index(drop=True) + df_read = df_read.sort_values(TIME_COLUMN).reset_index(drop=True) df_sorted = convert_to_nullable_types(df.sort_values('time').reset_index(drop=True)) assert df_read.shape == (50, 11) assert df_read["bool_col"].equals(df_sorted["bool_col"]) @@ -257,10 +257,10 @@ def test_write_dataframe_schema_time_column(): writer.write_dataframe(df) df_read = to_dataframe(tsfile_path, table_name="test_table") - df_read = df_read.sort_values('time').reset_index(drop=True) + df_read = df_read.sort_values(TIME_COLUMN).reset_index(drop=True) df_sorted = convert_to_nullable_types(df.sort_values('time').reset_index(drop=True)) assert df_read.shape == (50, 3) - assert df_read["time"].equals(df_sorted["time"]) + assert df_read[TIME_COLUMN].equals(df_sorted[TIME_COLUMN]) assert df_read["device"].equals(df_sorted["device"]) assert df_read["value"].equals(df_sorted["value"]) finally: @@ -286,7 +286,7 @@ def test_write_dataframe_schema_time_and_dataframe_time(): writer.write_dataframe(df) df_read = to_dataframe(tsfile_path, table_name="test_table") - df_read = df_read.sort_values('time').reset_index(drop=True) + df_read = df_read.sort_values(TIME_COLUMN).reset_index(drop=True) df_sorted = convert_to_nullable_types( df.sort_values('Time').rename(columns=str.lower).reset_index(drop=True) ) @@ -312,7 +312,7 @@ def test_write_dataframe_empty(): 'time': [], 'value': [] }) - with pytest.raises(ValueError) as err: + with pytest.raises(ValueError): writer.write_dataframe(df) finally: diff --git a/python/tests/test_load_tsfile_from_iotdb.py b/python/tests/test_load_tsfile_from_iotdb.py index d865dd357..8dcc0b1c6 100644 --- a/python/tests/test_load_tsfile_from_iotdb.py +++ b/python/tests/test_load_tsfile_from_iotdb.py @@ -15,12 +15,13 @@ # specific language governing permissions and limitations # under the License. # - +import math import os import numpy as np import tsfile as ts +from tsfile import TIME_COLUMN def test_load_tsfile_from_iotdb(): @@ -31,8 +32,8 @@ def test_load_tsfile_from_iotdb(): ## -------- assert len(df) == 105, "row count mismatch" - assert df["time"].isna().sum() == 0 - assert int(df["time"].sum()) == 15960 + assert df[TIME_COLUMN].isna().sum() == 0 + assert int(df[TIME_COLUMN].sum()) == 15960 assert df["temperature"].isna().sum() == 5 assert df["status"].isna().sum() == 5 assert (df["status"] == True).sum() == 50 @@ -44,8 +45,8 @@ def test_load_tsfile_from_iotdb(): df = ts.to_dataframe(simple_tabl1_path) ## --------- assert len(df) == 60 - assert df["time"].isna().sum() == 0 - assert df["time"].sum() == ( + assert df[TIME_COLUMN].isna().sum() == 0 + assert df[TIME_COLUMN].sum() == ( (1760106020000 + 1760106049000) * 30 // 2 + (1760106080000 + 1760106109000) * 30 // 2 ) @@ -78,8 +79,8 @@ def test_load_tsfile_from_iotdb(): df = ts.to_dataframe(simple_tabl2_path) ## --------- assert len(df) == 40 - assert df["time"].isna().sum() == 0 - assert int(df["time"].sum()) == 70404242080000 + assert df[TIME_COLUMN].isna().sum() == 0 + assert int(df[TIME_COLUMN].sum()) == 70404242080000 assert df["s0"].isna().sum() == 0 assert df["s1"].isna().sum() == 0 @@ -109,3 +110,11 @@ def test_load_tsfile_from_iotdb(): assert df["s9"].isna().sum() == 5 ## --------- + table_with_time_column_path = os.path.join(dir_path, 'table_with_time_column.tsfile') + df = ts.to_dataframe(table_with_time_column_path) + + assert len(df) == 25 + assert math.isclose(df["temperature"].sum(), 2.5, rel_tol=1e-9) + assert math.isclose(df["humidity"].sum(), 2.5, rel_tol=1e-9) + assert (df["region_id"] == "loc").sum() == 25 + diff --git a/python/tests/test_to_tsfile.py b/python/tests/test_to_tsfile.py index c3a970e3c..a35d5e890 100644 --- a/python/tests/test_to_tsfile.py +++ b/python/tests/test_to_tsfile.py @@ -22,7 +22,7 @@ import pandas as pd import pytest -from tsfile import to_dataframe, TsFileReader, ColumnCategory +from tsfile import to_dataframe, TsFileReader, ColumnCategory, TIME_COLUMN from tsfile.utils import dataframe_to_tsfile @@ -132,11 +132,11 @@ def test_dataframe_to_tsfile_custom_time_column(): dataframe_to_tsfile(df, tsfile_path, table_name="test_table", time_column="timestamp") df_read = to_dataframe(tsfile_path, table_name="test_table") - df_read = df_read.sort_values('time').reset_index(drop=True) + df_read = df_read.sort_values(TIME_COLUMN).reset_index(drop=True) df_sorted = convert_to_nullable_types(df.sort_values('timestamp').reset_index(drop=True)) assert df_read.shape == (30, 3) - assert df_read["time"].equals(df_sorted["timestamp"]) + assert df_read[TIME_COLUMN].equals(df_sorted["timestamp"]) assert df_read["device"].equals(df_sorted["device"]) assert df_read["value"].equals(df_sorted["value"]) finally: @@ -181,7 +181,7 @@ def test_dataframe_to_tsfile_with_tag_columns(): dataframe_to_tsfile(df, tsfile_path, table_name="test_table", tag_column=["device", "location"]) df_read = to_dataframe(tsfile_path, table_name="test_table") - df_read = df_read.sort_values('time').reset_index(drop=True) + df_read = df_read.sort_values(TIME_COLUMN).reset_index(drop=True) df_sorted = convert_to_nullable_types(df.sort_values('time').reset_index(drop=True)) assert df_read.shape == (20, 4) @@ -214,7 +214,7 @@ def test_dataframe_to_tsfile_tag_time_unsorted(): assert df_read.shape == (10, 3) assert df_read["device"].equals(df_expected["device"]) - assert df_read["time"].equals(df_expected["time"]) + assert df_read[TIME_COLUMN].equals(df_expected["time"]) assert df_read["value"].equals(df_expected["value"]) finally: if os.path.exists(tsfile_path): @@ -244,7 +244,7 @@ def test_dataframe_to_tsfile_all_datatypes(): dataframe_to_tsfile(df, tsfile_path, table_name="test_table") df_read = to_dataframe(tsfile_path, table_name="test_table") - df_read = df_read.sort_values('time').reset_index(drop=True) + df_read = df_read.sort_values(TIME_COLUMN).reset_index(drop=True) df_sorted = convert_to_nullable_types(df.sort_values('time').reset_index(drop=True)) assert df_read.shape == (50, 11) diff --git a/python/tests/test_write_and_read.py b/python/tests/test_write_and_read.py index 3cef99c4a..57294a846 100644 --- a/python/tests/test_write_and_read.py +++ b/python/tests/test_write_and_read.py @@ -25,7 +25,7 @@ from pandas import Float64Dtype from pandas.core.dtypes.common import is_integer_dtype -from tsfile import ColumnSchema, TableSchema, TSEncoding +from tsfile import ColumnSchema, TableSchema, TSEncoding, TIME_COLUMN from tsfile import Compressor from tsfile import TSDataType from tsfile import Tablet, RowRecord, Field @@ -170,7 +170,7 @@ def _extract_device(row, path_columns): assert df_all.shape[0] == total_rows for measurement in all_measurements: assert measurement in df_all.columns - assert "time" in df_all.columns + assert TIME_COLUMN in df_all.columns path_columns = sorted( [col for col in df_all.columns if col.startswith("col_")], key=lambda name: int(name.split("_")[1]), @@ -179,7 +179,7 @@ def _extract_device(row, path_columns): for _, row in df_all.iterrows(): device = _extract_device(row, path_columns) - timestamp = int(row["time"]) + timestamp = int(row[TIME_COLUMN]) assert (device, timestamp) in expected_values expected_row = expected_values[(device, timestamp)] for measurement in all_measurements: @@ -201,7 +201,7 @@ def _extract_device(row, path_columns): assert measurement not in df_subset.columns for _, row in df_subset.iterrows(): device = _extract_device(row, path_columns) - timestamp = int(row["time"]) + timestamp = int(row[TIME_COLUMN]) expected_row = expected_values[(device, timestamp)] for measurement in requested_columns: value = row.get(measurement) @@ -227,7 +227,7 @@ def _extract_device(row, path_columns): iter_rows = 0 for batch in iterator: assert isinstance(batch, pd.DataFrame) - assert set(batch.columns).issuperset({"time", "level"}) + assert set(batch.columns).issuperset({TIME_COLUMN, "level"}) iter_rows += len(batch) assert iter_rows == 18 @@ -242,7 +242,7 @@ def _extract_device(row, path_columns): iter_rows = 0 for batch in iterator: assert isinstance(batch, pd.DataFrame) - assert set(batch.columns).issuperset({"time", "level"}) + assert set(batch.columns).issuperset({TIME_COLUMN, "level"}) iter_rows += len(batch) assert iter_rows == 9 @@ -384,7 +384,7 @@ def test_table_writer_and_reader(): 0, 10) as result: cur_line = 0 while result.next(): - cur_time = result.get_value_by_name("time") + cur_time = result.get_value_by_name(TIME_COLUMN) assert result.get_value_by_name("device") == "device" + str(cur_time) assert result.is_null_by_name("device") == False assert result.is_null_by_name("value") == False @@ -545,7 +545,7 @@ def test_tsfile_to_df(): df1 = to_dataframe("table_write_to_df.tsfile") assert df1.shape == (4097, 4) assert df1["value2"].sum() == 100 * (1 + 4096) / 2 * 4096 - assert is_integer_dtype(df1["time"]) + assert is_integer_dtype(df1[TIME_COLUMN]) assert df1["value"].dtype == Float64Dtype() assert is_integer_dtype(df1["value2"]) df2 = to_dataframe("table_write_to_df.tsfile", column_names=["device", "value2"]) diff --git a/python/tsfile/constants.py b/python/tsfile/constants.py index 6f233e271..18da3aef7 100644 --- a/python/tsfile/constants.py +++ b/python/tsfile/constants.py @@ -15,10 +15,12 @@ # specific language governing permissions and limitations # under the License. # -from datetime import datetime from enum import unique, IntEnum + import numpy as np +TIME_COLUMN = "time" + @unique class TSDataType(IntEnum): BOOLEAN = 0 @@ -103,7 +105,7 @@ def from_pandas_datatype(cls, dtype): return cls.STRING except (ImportError, AttributeError): pass - + if hasattr(dtype, 'type'): dtype = dtype.type if dtype is np.bool_: @@ -118,12 +120,12 @@ def from_pandas_datatype(cls, dtype): return cls.DOUBLE elif dtype is np.object_: return cls.STRING - + dtype_str = str(dtype) if 'stringdtype' in dtype_str.lower() or dtype_str.startswith('string'): return cls.STRING - + dtype_map = { 'bool': cls.BOOLEAN, 'boolean': cls.BOOLEAN, @@ -137,17 +139,17 @@ def from_pandas_datatype(cls, dtype): 'object': cls.STRING, 'string': cls.STRING, } - + if dtype_str in dtype_map: return dtype_map[dtype_str] - + dtype_lower = dtype_str.lower() if dtype_lower in dtype_map: return dtype_map[dtype_lower] if 'object_' in dtype_lower or dtype_str == "": return cls.STRING - + if dtype_str.startswith('datetime64'): return cls.TIMESTAMP @@ -163,8 +165,6 @@ def from_pandas_datatype(cls, dtype): } - - @unique class TSEncoding(IntEnum): PLAIN = 0 diff --git a/python/tsfile/schema.py b/python/tsfile/schema.py index 91732eee3..f0fa39b1f 100644 --- a/python/tsfile/schema.py +++ b/python/tsfile/schema.py @@ -119,15 +119,17 @@ def __init__(self, table_name: str, columns: List[ColumnSchema]): self.table_name = table_name.lower() if len(columns) == 0: raise ValueError("Columns cannot be empty") - self.columns = columns - for column in self.columns: + self.columns = [] + for column in columns: if column.get_category() == ColumnCategory.TIME: if self.time_column is not None: raise ValueError( f"Table '{self.table_name}' cannot have multiple time columns: " - f"'{self.time_column.name}' and '{column.name}'" + f"'{self.time_column.get_column_name()}' and '{column.get_column_name()}'" ) self.time_column = column + else: + self.columns.append(column) def get_table_name(self): return self.table_name From 6df963f2f7b54c3cbd94230bf2de143a4a4fcbb1 Mon Sep 17 00:00:00 2001 From: ColinLee Date: Wed, 11 Feb 2026 00:35:03 +0800 Subject: [PATCH 09/13] tmp code. --- cpp/src/utils/db_utils.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpp/src/utils/db_utils.h b/cpp/src/utils/db_utils.h index 5a1dea8db..607144af1 100644 --- a/cpp/src/utils/db_utils.h +++ b/cpp/src/utils/db_utils.h @@ -72,7 +72,7 @@ struct TsID { NodeID device_nid_; NodeID measurement_nid_; - TsID() : db_nid_(0), device_nid_(0), measurement_nid_(0) {}; + TsID() : db_nid_(0), device_nid_(0), measurement_nid_(0){}; TsID(NodeID db_nid, NodeID device_nid, NodeID measurement_nid) : db_nid_(db_nid), From 1a8f90d53e5fad365044d1e61bff4434ab86ac10 Mon Sep 17 00:00:00 2001 From: ColinLee Date: Wed, 11 Feb 2026 09:54:10 +0800 Subject: [PATCH 10/13] tmp code. --- cpp/src/cwrapper/tsfile_cwrapper.cc | 3 --- 1 file changed, 3 deletions(-) diff --git a/cpp/src/cwrapper/tsfile_cwrapper.cc b/cpp/src/cwrapper/tsfile_cwrapper.cc index 539d5b968..fbcf4e6f1 100644 --- a/cpp/src/cwrapper/tsfile_cwrapper.cc +++ b/cpp/src/cwrapper/tsfile_cwrapper.cc @@ -686,9 +686,6 @@ ERRNO _tsfile_writer_register_table(TsFileWriter writer, TableSchema* schema) { measurement_schemas.resize(schema->column_num); for (int i = 0; i < schema->column_num; i++) { ColumnSchema* cur_schema = schema->column_schemas + i; - if (cur_schema->column_category == TIME) { - continue; - } measurement_schemas[i] = new storage::MeasurementSchema( cur_schema->column_name, static_cast(cur_schema->data_type)); From 260490647ae3db7b4ed9fd5f399bef445845c20d Mon Sep 17 00:00:00 2001 From: ColinLee Date: Wed, 11 Feb 2026 10:05:05 +0800 Subject: [PATCH 11/13] add table_with_time_column info. --- python/tests/resources/README.md | 34 +++++++++++++++++++++++++++++++- 1 file changed, 33 insertions(+), 1 deletion(-) diff --git a/python/tests/resources/README.md b/python/tests/resources/README.md index ca80bb430..cd1a2aa04 100644 --- a/python/tests/resources/README.md +++ b/python/tests/resources/README.md @@ -282,4 +282,36 @@ IoTDB:test> select * from test; |2025-10-10T22:21:19.000+08:00| b| c|1069|7.9|v69|1970-01-01T08:00:01.069+08:00| 79|16.9|2024-12-18|text69| +-----------------------------+--+--+----+---+---+-----------------------------+----+----+----------+------+ Total line number = 40 -``` \ No newline at end of file +``` + +In `table_with_time_column.tsfile` + +``` + time region_id temperature humidity +0 1770729095888 loc 0.1 0.1 +1 1770729096807 loc 0.1 0.1 +2 1770729097233 loc 0.1 0.1 +3 1770729097471 loc 0.1 0.1 +4 1770729097695 loc 0.1 0.1 +5 1770729097910 loc 0.1 0.1 +6 1770729098148 loc 0.1 0.1 +7 1770729098385 loc 0.1 0.1 +8 1770729098599 loc 0.1 0.1 +9 1770729098853 loc 0.1 0.1 +10 1770729099086 loc 0.1 0.1 +11 1770729099327 loc 0.1 0.1 +12 1770729099558 loc 0.1 0.1 +13 1770729099794 loc 0.1 0.1 +14 1770729100017 loc 0.1 0.1 +15 1770729100262 loc 0.1 0.1 +16 1770729100492 loc 0.1 0.1 +17 1770729100729 loc 0.1 0.1 +18 1770729100976 loc 0.1 0.1 +19 1770729101243 loc 0.1 0.1 +20 1770729101494 loc 0.1 0.1 +21 1770729101734 loc 0.1 0.1 +22 1770729102040 loc 0.1 0.1 +23 1770729102333 loc 0.1 0.1 +24 1770729103005 loc 0.1 0.1 +``` + From 77da19ed6f30c121fcaa641156c9577fc4f5c52a Mon Sep 17 00:00:00 2001 From: ColinLee Date: Thu, 12 Feb 2026 21:15:40 +0800 Subject: [PATCH 12/13] tmp code. --- cpp/src/common/tsblock/tuple_desc.h | 9 +++ .../block/single_device_tsblock_reader.cc | 8 ++ cpp/src/reader/column_mapping.h | 11 ++- cpp/src/reader/table_query_executor.cc | 11 +-- .../table_view/tsfile_reader_table_test.cc | 81 +++++++++++++++++++ .../table_view/tsfile_writer_table_test.cc | 2 +- python/tsfile/schema.py | 7 ++ python/tsfile/tsfile_reader.pyx | 4 +- python/tsfile/utils.py | 36 +++++++-- 9 files changed, 151 insertions(+), 18 deletions(-) diff --git a/cpp/src/common/tsblock/tuple_desc.h b/cpp/src/common/tsblock/tuple_desc.h index 6010d677b..3cd26b3f6 100644 --- a/cpp/src/common/tsblock/tuple_desc.h +++ b/cpp/src/common/tsblock/tuple_desc.h @@ -76,6 +76,15 @@ class TupleDesc { return column_list_[index].column_category_; } + FORCE_INLINE int get_time_column_index() const { + for (uint32_t i = 0; i < column_list_.size(); i++) { + if (column_list_[i].get_column_category() == ColumnCategory::TIME) { + return i; + } + } + return -1; + } + FORCE_INLINE std::string get_column_name(uint32_t index) { return column_list_[index].column_name_; } diff --git a/cpp/src/reader/block/single_device_tsblock_reader.cc b/cpp/src/reader/block/single_device_tsblock_reader.cc index 0e2b350c7..836ab6956 100644 --- a/cpp/src/reader/block/single_device_tsblock_reader.cc +++ b/cpp/src/reader/block/single_device_tsblock_reader.cc @@ -164,6 +164,14 @@ int SingleDeviceTsBlockReader::fill_measurements( } col_appenders_[time_column_index_]->append((const char*)&next_time_, sizeof(next_time_)); + int time_in_query_index = tuple_desc_.get_time_column_index(); + if (time_in_query_index != -1) { + if (!col_appenders_[time_in_query_index]->add_row()) { + assert(false); + } + col_appenders_[time_in_query_index]->append( + (const char*)&next_time_, sizeof(next_time_)); + } for (auto& column_context : column_contexts) { column_context->fill_into(col_appenders_); if (RET_FAIL(advance_column(column_context))) { diff --git a/cpp/src/reader/column_mapping.h b/cpp/src/reader/column_mapping.h index abf9eafba..99e153030 100644 --- a/cpp/src/reader/column_mapping.h +++ b/cpp/src/reader/column_mapping.h @@ -36,8 +36,10 @@ class ColumnMapping { if (column_category == common::ColumnCategory::TAG) { tag_columns_.insert(column_name); - } else { + } else if (column_category == common::ColumnCategory::FIELD) { field_columns_.insert(column_name); + } else if (column_category == common::ColumnCategory::TIME) { + time_column_ = column_name; } return common::E_OK; @@ -64,6 +66,10 @@ class ColumnMapping { return field_columns_.find(column_name) != field_columns_.end(); } + bool is_time(const std::string& column_name) const { + return time_column_ == column_name; + } + const std::unordered_set& get_id_columns() const { return tag_columns_; } @@ -72,8 +78,11 @@ class ColumnMapping { return field_columns_; } + const std::string get_time_column() const { return time_column_; } + private: std::unordered_map> column_pos_map; + std::string time_column_; std::unordered_set tag_columns_; std::unordered_set field_columns_; }; diff --git a/cpp/src/reader/table_query_executor.cc b/cpp/src/reader/table_query_executor.cc index 79b636b52..2a01a6d5c 100644 --- a/cpp/src/reader/table_query_executor.cc +++ b/cpp/src/reader/table_query_executor.cc @@ -65,9 +65,10 @@ int TableQueryExecutor::query(const std::string& table_name, } // column_mapping.add(*measurement_filter); - auto device_task_iterator = std::unique_ptr( - new DeviceTaskIterator(columns, table_root, column_mapping, - meta_data_querier_, id_filter, table_schema)); + auto device_task_iterator = + std::unique_ptr(new DeviceTaskIterator( + lower_case_column_names, table_root, column_mapping, + meta_data_querier_, id_filter, table_schema)); std::unique_ptr tsblock_reader; switch (table_query_ordering_) { @@ -82,8 +83,8 @@ int TableQueryExecutor::query(const std::string& table_name, ret = common::E_UNSUPPORTED_ORDER; } assert(tsblock_reader != nullptr); - ret_qds = - new TableResultSet(std::move(tsblock_reader), columns, data_types); + ret_qds = new TableResultSet(std::move(tsblock_reader), + lower_case_column_names, data_types); return ret; } diff --git a/cpp/test/reader/table_view/tsfile_reader_table_test.cc b/cpp/test/reader/table_view/tsfile_reader_table_test.cc index c281de413..b9f0eb213 100644 --- a/cpp/test/reader/table_view/tsfile_reader_table_test.cc +++ b/cpp/test/reader/table_view/tsfile_reader_table_test.cc @@ -707,3 +707,84 @@ TEST_F(TsFileTableReaderTest, TestNullInTable4) { ASSERT_EQ(line, max_rows); }); } + +TEST_F(TsFileTableReaderTest, TestTimeColumnReader) { + std::vector column_schemas; + column_schemas.emplace_back("s0", TSDataType::INT64, + CompressionType::UNCOMPRESSED, + TSEncoding::PLAIN, ColumnCategory::FIELD); + column_schemas.emplace_back("S1", TSDataType::DOUBLE, + CompressionType::UNCOMPRESSED, + TSEncoding::PLAIN, ColumnCategory::FIELD); + // No need to manually insert data into the time column. + column_schemas.emplace_back("TIME_D", TSDataType::TIMESTAMP, + CompressionType::UNCOMPRESSED, + TSEncoding::PLAIN, ColumnCategory::TIME); + + TableSchema table_schema("testTableTime", column_schemas); + auto tsfile_table_writer_ = + std::make_shared(&write_file_, &table_schema); + + const int num_rows = 20; + const int64_t base_time = 1000; + storage::Tablet tablet(table_schema.get_table_name(), {"s0", "s1"}, + {TSDataType::INT64, TSDataType::DOUBLE}, + {ColumnCategory::FIELD, ColumnCategory::FIELD}, + num_rows); + + for (int i = 0; i < num_rows; i++) { + int64_t t = base_time + i; + tablet.add_timestamp(i, t); + tablet.add_value(i, 0, static_cast(i * 10)); + tablet.add_value(i, 1, static_cast(i * 1.5)); + } + + ASSERT_EQ(tsfile_table_writer_->write_table(tablet), common::E_OK); + ASSERT_EQ(tsfile_table_writer_->flush(), common::E_OK); + ASSERT_EQ(tsfile_table_writer_->close(), common::E_OK); + + storage::TsFileReader reader; + int ret = reader.open(file_name_); + ASSERT_EQ(ret, common::E_OK); + + ResultSet* tmp_result_set = nullptr; + ret = reader.query(table_schema.get_table_name(), {"s0", "s1", "TIME_D"}, 0, + 1000000000000, tmp_result_set); + ASSERT_EQ(ret, common::E_OK); + ASSERT_NE(tmp_result_set, nullptr); + + auto* table_result_set = dynamic_cast(tmp_result_set); + ASSERT_NE(table_result_set, nullptr); + + auto result_set_metadata = table_result_set->get_metadata(); + ASSERT_EQ(result_set_metadata->get_column_count(), + 4); // time + s0 + s1 + TIME_D + ASSERT_EQ(result_set_metadata->get_column_name(1), "time"); + ASSERT_EQ(result_set_metadata->get_column_type(1), TSDataType::INT64); + ASSERT_EQ(result_set_metadata->get_column_name(2), "s0"); + ASSERT_EQ(result_set_metadata->get_column_type(2), TSDataType::INT64); + ASSERT_EQ(result_set_metadata->get_column_name(3), "s1"); + ASSERT_EQ(result_set_metadata->get_column_type(3), TSDataType::DOUBLE); + ASSERT_EQ(result_set_metadata->get_column_name(4), "time_d"); + ASSERT_EQ(result_set_metadata->get_column_type(4), TSDataType::TIMESTAMP); + + bool has_next = false; + int row_count = 0; + while (IS_SUCC(table_result_set->next(has_next)) && has_next) { + int64_t row_time = base_time + row_count; + // Column 1 is built-in time + ASSERT_EQ(table_result_set->get_value(1), row_time); + // s0, s1 + ASSERT_EQ(table_result_set->get_value(2), row_count * 10); + ASSERT_DOUBLE_EQ(table_result_set->get_value(3), + static_cast(row_count * 1.5)); + // time_d + ASSERT_EQ(table_result_set->get_value("TIME_D"), row_time); + ASSERT_EQ(table_result_set->get_value(4), row_time); + row_count++; + } + ASSERT_EQ(row_count, num_rows); + + reader.destroy_query_data_set(table_result_set); + ASSERT_EQ(reader.close(), common::E_OK); +} diff --git a/cpp/test/writer/table_view/tsfile_writer_table_test.cc b/cpp/test/writer/table_view/tsfile_writer_table_test.cc index d5861ea16..1f8c80ff6 100644 --- a/cpp/test/writer/table_view/tsfile_writer_table_test.cc +++ b/cpp/test/writer/table_view/tsfile_writer_table_test.cc @@ -447,7 +447,7 @@ TEST_F(TsFileWriterTableTest, WriteAndReadSimple) { ASSERT_EQ(ret_value, 0); auto* table_result_set = (TableResultSet*)ret; auto metadata = ret->get_metadata(); - ASSERT_EQ(metadata->get_column_name(column_names.size() + 1), "VALUE"); + ASSERT_EQ(metadata->get_column_name(column_names.size() + 1), "value"); bool has_next = false; int cur_line = 0; while (IS_SUCC(table_result_set->next(has_next)) && has_next) { diff --git a/python/tsfile/schema.py b/python/tsfile/schema.py index f0fa39b1f..d8671a33c 100644 --- a/python/tsfile/schema.py +++ b/python/tsfile/schema.py @@ -197,6 +197,13 @@ def __init__(self, column_list: List[str], data_types: List[TSDataType]): def set_table_name(self, table_name: str): self.table_name = table_name + def add_column_at(self, index: int, column_name: str, data_type: TSDataType): + """Insert a column and its data type at the given position (0-based index).""" + if index < 0 or index > len(self.column_list): + raise IndexError(f"column index {index} out of range (0 to {len(self.column_list)})") + self.column_list.insert(index, column_name) + self.data_types.insert(index, data_type) + def get_data_type(self, column_index: int) -> TSDataType: if column_index < 1 or column_index > len(self.column_list): raise OverflowError diff --git a/python/tsfile/tsfile_reader.pyx b/python/tsfile/tsfile_reader.pyx index 041764f91..4476d24dc 100644 --- a/python/tsfile/tsfile_reader.pyx +++ b/python/tsfile/tsfile_reader.pyx @@ -19,7 +19,6 @@ #cython: language_level=3 import weakref -from email.contentmanager import raw_data_manager from typing import List import pandas as pd @@ -154,7 +153,6 @@ cdef class ResultSetPy: # Well when we check is null, id from 0, so there index -1. if tsfile_result_set_is_null_by_index(self.result, index): return None - # data type in metadata is an array, id from 0. data_type = self.metadata.get_data_type(index) if data_type == TSDataTypePy.INT32: return tsfile_result_set_get_value_by_index_int32_t(self.result, index) @@ -297,7 +295,7 @@ cdef class TsFileReaderPy: return pyresult def query_table_on_tree(self, column_names : List[str], - start_time : int = INT64_MIN, end_time : int = INT64_MAX) -> ResultSetPy: + start_time : int = INT64_MIN, end_time : int = INT64_MAX) -> ResultSetPy: """ Execute a time range query on specified columns on tree structure. :return: query result handler. diff --git a/python/tsfile/utils.py b/python/tsfile/utils.py index 71e213462..4366ef5be 100644 --- a/python/tsfile/utils.py +++ b/python/tsfile/utils.py @@ -99,6 +99,17 @@ def _gen(is_iterator: bool) -> Iterator[pd.DataFrame]: _start_time = start_time if start_time is not None else np.iinfo(np.int64).min _end_time = end_time if end_time is not None else np.iinfo(np.int64).max + ## Time column handling (table model): + ## 1. Request has no column list (query all): + ## 1.1 TsFile has a time column in schema: query only non-time columns; then rename + ## the first column of the returned DataFrame to the schema time column name. + ## 1.2 TsFile has no time column in schema: query as-is; first column is "time". + ## 2. Request has a column list but no time column: + ## 2.1 TsFile has a time column in schema: query with requested columns; rename the + ## first column to the schema time column name. + ## 2.2 TsFile has no time column in schema: first column stays "time"; no rename. + ## 3. Request has a column list including the time column: + ## 3.1 Query with requested columns (including time); do not rename the first column. with TsFileReaderPy(file_path) as reader: total_rows = 0 table_schema = reader.get_all_table_schemas() @@ -117,11 +128,17 @@ def _gen(is_iterator: bool) -> Iterator[pd.DataFrame]: raise TableNotExistError(_table_name) columns = table_schema[_table_name] - column_names_in_file = columns.get_column_names() + column_names_in_file = [] + time_column = None + for column in columns: + if column.get_category() == ColumnCategory.TIME: + time_column = column.get_column_name() + else: + column_names_in_file.append(column.get_column_name()) if _column_names is not None: for column in _column_names: - if column.lower() not in column_names_in_file: + if column.lower() not in column_names_in_file and column.lower() != time_column : raise ColumnNotExistError(column) else: _column_names = column_names_in_file @@ -136,18 +153,21 @@ def _gen(is_iterator: bool) -> Iterator[pd.DataFrame]: with query_result as result: while result.next(): if max_row_num is None: - df = result.read_data_frame() + dataframe = result.read_data_frame() elif is_iterator: - df = result.read_data_frame(max_row_num) + dataframe = result.read_data_frame(max_row_num) else: remaining_rows = max_row_num - total_rows if remaining_rows <= 0: break - df = result.read_data_frame(remaining_rows) - if df is None or df.empty: + dataframe = result.read_data_frame(remaining_rows) + if dataframe is None or dataframe.empty: continue - total_rows += len(df) - yield df + total_rows += len(dataframe) + if time_column is not None: + if _column_names is None or time_column.lower() not in [c.lower() for c in _column_names]: + dataframe = dataframe.rename(columns={dataframe.columns[0]: time_column}) + yield dataframe if (not is_iterator) and max_row_num is not None and total_rows >= max_row_num: break From 56aa10499bdb0ab113dd49390a84853ee1ee1378 Mon Sep 17 00:00:00 2001 From: ColinLee Date: Thu, 12 Feb 2026 23:16:24 +0800 Subject: [PATCH 13/13] support time column --- python/tests/resources/README.md | 70 +++++++++++++-------- python/tests/test_load_tsfile_from_iotdb.py | 16 ++++- python/tests/test_to_tsfile.py | 4 +- python/tsfile/schema.py | 4 +- python/tsfile/utils.py | 9 ++- 5 files changed, 66 insertions(+), 37 deletions(-) diff --git a/python/tests/resources/README.md b/python/tests/resources/README.md index cd1a2aa04..d5ec82b49 100644 --- a/python/tests/resources/README.md +++ b/python/tests/resources/README.md @@ -287,31 +287,49 @@ Total line number = 40 In `table_with_time_column.tsfile` ``` - time region_id temperature humidity -0 1770729095888 loc 0.1 0.1 -1 1770729096807 loc 0.1 0.1 -2 1770729097233 loc 0.1 0.1 -3 1770729097471 loc 0.1 0.1 -4 1770729097695 loc 0.1 0.1 -5 1770729097910 loc 0.1 0.1 -6 1770729098148 loc 0.1 0.1 -7 1770729098385 loc 0.1 0.1 -8 1770729098599 loc 0.1 0.1 -9 1770729098853 loc 0.1 0.1 -10 1770729099086 loc 0.1 0.1 -11 1770729099327 loc 0.1 0.1 -12 1770729099558 loc 0.1 0.1 -13 1770729099794 loc 0.1 0.1 -14 1770729100017 loc 0.1 0.1 -15 1770729100262 loc 0.1 0.1 -16 1770729100492 loc 0.1 0.1 -17 1770729100729 loc 0.1 0.1 -18 1770729100976 loc 0.1 0.1 -19 1770729101243 loc 0.1 0.1 -20 1770729101494 loc 0.1 0.1 -21 1770729101734 loc 0.1 0.1 -22 1770729102040 loc 0.1 0.1 -23 1770729102333 loc 0.1 0.1 -24 1770729103005 loc 0.1 0.1 +IoTDB:mydb> select * from table2; ++-----------------------------+---------+-----------+--------+ +| id|region_id|temperature|humidity| ++-----------------------------+---------+-----------+--------+ +|2026-02-10T21:11:35.888+08:00| loc| 0.1| 0.1| +|2026-02-10T21:11:36.807+08:00| loc| 0.1| 0.1| +|2026-02-10T21:11:37.233+08:00| loc| 0.1| 0.1| +|2026-02-10T21:11:37.471+08:00| loc| 0.1| 0.1| +|2026-02-10T21:11:37.695+08:00| loc| 0.1| 0.1| +|2026-02-10T21:11:37.910+08:00| loc| 0.1| 0.1| +|2026-02-10T21:11:38.148+08:00| loc| 0.1| 0.1| +|2026-02-10T21:11:38.385+08:00| loc| 0.1| 0.1| +|2026-02-10T21:11:38.599+08:00| loc| 0.1| 0.1| +|2026-02-10T21:11:38.853+08:00| loc| 0.1| 0.1| +|2026-02-10T21:11:39.086+08:00| loc| 0.1| 0.1| +|2026-02-10T21:11:39.327+08:00| loc| 0.1| 0.1| +|2026-02-10T21:11:39.558+08:00| loc| 0.1| 0.1| +|2026-02-10T21:11:39.794+08:00| loc| 0.1| 0.1| +|2026-02-10T21:11:40.017+08:00| loc| 0.1| 0.1| +|2026-02-10T21:11:40.262+08:00| loc| 0.1| 0.1| +|2026-02-10T21:11:40.492+08:00| loc| 0.1| 0.1| +|2026-02-10T21:11:40.729+08:00| loc| 0.1| 0.1| +|2026-02-10T21:11:40.976+08:00| loc| 0.1| 0.1| +|2026-02-10T21:11:41.243+08:00| loc| 0.1| 0.1| +|2026-02-10T21:11:41.494+08:00| loc| 0.1| 0.1| +|2026-02-10T21:11:41.734+08:00| loc| 0.1| 0.1| +|2026-02-10T21:11:42.040+08:00| loc| 0.1| 0.1| +|2026-02-10T21:11:42.333+08:00| loc| 0.1| 0.1| +|2026-02-10T21:11:43.005+08:00| loc| 0.1| 0.1| ++-----------------------------+---------+-----------+--------+ +Total line number = 25 +It costs 0.042s +IoTDB:mydb> describe table2 ++-----------+---------+--------+ +| ColumnName| DataType|Category| ++-----------+---------+--------+ +| id|TIMESTAMP| TIME| +| region_id| STRING| TAG| +|temperature| FLOAT| FIELD| +| humidity| DOUBLE| FIELD| ++-----------+---------+--------+ +Total line number = 4 +It costs 0.065s +IoTDB:mydb> ``` diff --git a/python/tests/test_load_tsfile_from_iotdb.py b/python/tests/test_load_tsfile_from_iotdb.py index 8dcc0b1c6..50ca0baf4 100644 --- a/python/tests/test_load_tsfile_from_iotdb.py +++ b/python/tests/test_load_tsfile_from_iotdb.py @@ -111,10 +111,24 @@ def test_load_tsfile_from_iotdb(): assert df["s9"].isna().sum() == 5 ## --------- table_with_time_column_path = os.path.join(dir_path, 'table_with_time_column.tsfile') - df = ts.to_dataframe(table_with_time_column_path) + df = ts.to_dataframe(table_with_time_column_path) + assert list(df.columns)[0] == "id" assert len(df) == 25 assert math.isclose(df["temperature"].sum(), 2.5, rel_tol=1e-9) assert math.isclose(df["humidity"].sum(), 2.5, rel_tol=1e-9) assert (df["region_id"] == "loc").sum() == 25 + df = ts.to_dataframe(table_with_time_column_path, table_name="table2", column_names=["region_id", "temperature", "humidity"]) + assert list(df.columns)[0] == "id" + assert len(df) == 25 + assert math.isclose(df["temperature"].sum(), 2.5, rel_tol=1e-9) + assert (df["region_id"] == "loc").sum() == 25 + + df = ts.to_dataframe(table_with_time_column_path, table_name="table2", column_names=["id", "temperature", "humidity"]) + assert list(df.columns)[0] == "time" + assert df["id"].equals(df["time"]) + assert len(df) == 25 + assert math.isclose(df["temperature"].sum(), 2.5, rel_tol=1e-9) + assert math.isclose(df["humidity"].sum(), 2.5, rel_tol=1e-9) + diff --git a/python/tests/test_to_tsfile.py b/python/tests/test_to_tsfile.py index a35d5e890..4e0481883 100644 --- a/python/tests/test_to_tsfile.py +++ b/python/tests/test_to_tsfile.py @@ -132,11 +132,11 @@ def test_dataframe_to_tsfile_custom_time_column(): dataframe_to_tsfile(df, tsfile_path, table_name="test_table", time_column="timestamp") df_read = to_dataframe(tsfile_path, table_name="test_table") - df_read = df_read.sort_values(TIME_COLUMN).reset_index(drop=True) + df_read = df_read.sort_values("timestamp").reset_index(drop=True) df_sorted = convert_to_nullable_types(df.sort_values('timestamp').reset_index(drop=True)) assert df_read.shape == (30, 3) - assert df_read[TIME_COLUMN].equals(df_sorted["timestamp"]) + assert df_read["timestamp"].equals(df_sorted["timestamp"]) assert df_read["device"].equals(df_sorted["device"]) assert df_read["value"].equals(df_sorted["value"]) finally: diff --git a/python/tsfile/schema.py b/python/tsfile/schema.py index d8671a33c..c89649bf3 100644 --- a/python/tsfile/schema.py +++ b/python/tsfile/schema.py @@ -119,7 +119,7 @@ def __init__(self, table_name: str, columns: List[ColumnSchema]): self.table_name = table_name.lower() if len(columns) == 0: raise ValueError("Columns cannot be empty") - self.columns = [] + self.columns = columns for column in columns: if column.get_category() == ColumnCategory.TIME: if self.time_column is not None: @@ -128,8 +128,6 @@ def __init__(self, table_name: str, columns: List[ColumnSchema]): f"'{self.time_column.get_column_name()}' and '{column.get_column_name()}'" ) self.time_column = column - else: - self.columns.append(column) def get_table_name(self): return self.table_name diff --git a/python/tsfile/utils.py b/python/tsfile/utils.py index 4366ef5be..6044ddbb6 100644 --- a/python/tsfile/utils.py +++ b/python/tsfile/utils.py @@ -115,22 +115,21 @@ def _gen(is_iterator: bool) -> Iterator[pd.DataFrame]: table_schema = reader.get_all_table_schemas() is_tree_model = len(table_schema) == 0 - + time_column = None if is_tree_model: if _column_names is None: print("columns name is None, return all columns") else: if _table_name is None: - _table_name, columns = next(iter(table_schema.items())) + _table_name, table_schema = next(iter(table_schema.items())) else: _table_name = _table_name.lower() if _table_name.lower() not in table_schema: raise TableNotExistError(_table_name) - columns = table_schema[_table_name] + table_schema = table_schema[_table_name] column_names_in_file = [] - time_column = None - for column in columns: + for column in table_schema.get_columns(): if column.get_category() == ColumnCategory.TIME: time_column = column.get_column_name() else: