Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -1548,3 +1548,72 @@ log.tools.csv_read_error = Error reading CSV file: %1$s

# CsvSourceReader — error closing CSV reader
log.tools.csv_close_reader_error = Error closing CSV reader

# HybridCsvTsFileAssembler — could not delete existing output
log.tools.hybrid_delete_output_failed = Could not delete existing output file: {}

# HybridCsvTsFileAssembler — writing main CSV
log.tools.hybrid_writing_main_csv = Writing main CSV: {}

# HybridCsvTsFileAssembler — writing supplement CSV
log.tools.hybrid_writing_supplement_csv = Writing supplement CSV: {} (batch_id={}, starting id={})

# HybridCsvTsFileAssembler — too many supplement rows
error.tools.hybrid_supplement_too_many_rows = Too many rows in one supplement CSV: %1$s

# HybridImportConfigParser — supplement_batch_id order
error.tools.hybrid_supplement_batch_id_order = supplement_batch_id must follow supplement_csv in config file

# HybridImportConfigParser — unknown config line
error.tools.hybrid_unknown_config_line = Unknown config line: %1$s

# HybridImportConfigParser — supplement_csv without batch id
error.tools.hybrid_supplement_csv_without_batch_id = supplement_csv without matching supplement_batch_id: %1$s

# HybridImportConfigParser — required output_tsfile
error.tools.hybrid_output_tsfile_required = output_tsfile is required

# HybridImportConfigParser — required shared_schema
error.tools.hybrid_shared_schema_required = shared_schema is required

# HybridImportConfigParser — required main_csv
error.tools.hybrid_main_csv_required = main_csv is required

# HybridImportConfigParser — main_csv not found
error.tools.hybrid_main_csv_not_found = main_csv file not found: %1$s

# HybridImportConfigParser — shared_schema not found
error.tools.hybrid_shared_schema_not_found = shared_schema file not found: %1$s

# HybridImportConfigParser — supplement_csv not found
error.tools.hybrid_supplement_csv_not_found = supplement_csv file not found: %1$s

# HybridImportConfigParser — invalid config line
error.tools.hybrid_invalid_config_line = Invalid config line: %1$s

# SyntheticTabletBuilder — timestamps length mismatch
error.tools.hybrid_timestamps_length_mismatch = timestamps length %1$s != row count %2$s

# SyntheticTabletBuilder — uniform TAG violation
error.tools.hybrid_uniform_tags_violation = Supplement CSV must have a single business TAG combination per file when validate_uniform_tags is enabled. Rows %1$s and %2$s map to different devices: %3$s vs %4$s

# SupplementCsvSourceReader — inferSchema unsupported
error.tools.hybrid_infer_schema_unsupported = inferSchema() is not supported for supplement CSV reader

# SupplementCsvSourceReader — header required
error.tools.hybrid_supplement_header_required = Supplement CSV requires has_header=true in schema: %1$s

# SupplementCsvSourceReader — unexpected header column
error.tools.hybrid_supplement_unexpected_column = Unexpected column in supplement CSV header: %1$s

# SupplementCsvSourceReader — missing header column
error.tools.hybrid_supplement_missing_column = Missing column in supplement CSV header: %1$s

# SupplementVarianceSorter — FIELD column not in batch
error.tools.hybrid_field_column_not_in_batch = FIELD column '%1$s' not found in supplement batch columns

# SupplementVarianceSorter — variance sort priority log
log.tools.hybrid_variance_sort_priority = Supplement FIELD sort priority (variance desc): {}

# TsFileTool — hybrid import failed
log.tools.hybrid_import_failed = Hybrid import failed for config: %1$s
Original file line number Diff line number Diff line change
Expand Up @@ -1548,3 +1548,72 @@ log.tools.csv_read_error = 读取 CSV 文件出错: %1$s

# CsvSourceReader — error closing CSV reader
log.tools.csv_close_reader_error = 关闭 CSV reader 出错

# HybridCsvTsFileAssembler — could not delete existing output
log.tools.hybrid_delete_output_failed = 无法删除已存在的输出文件: {}

# HybridCsvTsFileAssembler — writing main CSV
log.tools.hybrid_writing_main_csv = 正在写入主 CSV: {}

# HybridCsvTsFileAssembler — writing supplement CSV
log.tools.hybrid_writing_supplement_csv = 正在写入附属 CSV: {} (batch_id={}, starting id={})

# HybridCsvTsFileAssembler — too many supplement rows
error.tools.hybrid_supplement_too_many_rows = 单个附属 CSV 行数过多: %1$s

# HybridImportConfigParser — supplement_batch_id order
error.tools.hybrid_supplement_batch_id_order = supplement_batch_id 必须紧跟在 supplement_csv 之后

# HybridImportConfigParser — unknown config line
error.tools.hybrid_unknown_config_line = 未知配置行: %1$s

# HybridImportConfigParser — supplement_csv without batch id
error.tools.hybrid_supplement_csv_without_batch_id = supplement_csv 缺少对应的 supplement_batch_id: %1$s

# HybridImportConfigParser — required output_tsfile
error.tools.hybrid_output_tsfile_required = 必须配置 output_tsfile

# HybridImportConfigParser — required shared_schema
error.tools.hybrid_shared_schema_required = 必须配置 shared_schema

# HybridImportConfigParser — required main_csv
error.tools.hybrid_main_csv_required = 必须配置 main_csv

# HybridImportConfigParser — main_csv not found
error.tools.hybrid_main_csv_not_found = 找不到 main_csv 文件: %1$s

# HybridImportConfigParser — shared_schema not found
error.tools.hybrid_shared_schema_not_found = 找不到 shared_schema 文件: %1$s

# HybridImportConfigParser — supplement_csv not found
error.tools.hybrid_supplement_csv_not_found = 找不到 supplement_csv 文件: %1$s

# HybridImportConfigParser — invalid config line
error.tools.hybrid_invalid_config_line = 无效配置行: %1$s

# SyntheticTabletBuilder — timestamps length mismatch
error.tools.hybrid_timestamps_length_mismatch = timestamps 长度 %1$s 与行数 %2$s 不一致

# SyntheticTabletBuilder — uniform TAG violation
error.tools.hybrid_uniform_tags_violation = 启用 validate_uniform_tags 时,每个附属 CSV 只能有一种业务 TAG 组合。第 %1$s 行与第 %2$s 行对应不同 device: %3$s vs %4$s

# SupplementCsvSourceReader — inferSchema unsupported
error.tools.hybrid_infer_schema_unsupported = supplement CSV reader 不支持 inferSchema()

# SupplementCsvSourceReader — header required
error.tools.hybrid_supplement_header_required = 附属 CSV 的 schema 必须设置 has_header=true: %1$s

# SupplementCsvSourceReader — unexpected header column
error.tools.hybrid_supplement_unexpected_column = 附属 CSV 表头出现未预期的列: %1$s

# SupplementCsvSourceReader — missing header column
error.tools.hybrid_supplement_missing_column = 附属 CSV 表头缺少列: %1$s

# SupplementVarianceSorter — FIELD column not in batch
error.tools.hybrid_field_column_not_in_batch = 附属 batch 中找不到 FIELD 列 '%1$s'

# SupplementVarianceSorter — variance sort priority log
log.tools.hybrid_variance_sort_priority = 附属 FIELD 排序优先级(方差降序): {}

# TsFileTool — hybrid import failed
log.tools.hybrid_import_failed = 混合导入失败,配置文件: %1$s
38 changes: 38 additions & 0 deletions java/tools/README-zh.md
Original file line number Diff line number Diff line change
Expand Up @@ -45,6 +45,44 @@ mvn clean package -P with-java -DskipTests
mvn install -P with-java -DskipTests
```

## 混合 CSV 导入

将一个带真实时间列的主时序 CSV,与多个不含时间列的附属 CSV(列与主表 TAG/FIELD 相同)合并写入**单个** TsFile。附属行使用从 `1` 开始的连续合成时间戳,编号在各附属文件之间接续,而不是每个文件重新从 1 开始(见下文第 5 步);通过虚拟 TAG `batch_id` 隔离,每个附属文件对应一个 ChunkGroup(在同一业务 TAG 组合下)。

配置示例(`hybrid.conf`):

```
output_tsfile=combined.tsfile
shared_schema=main.schema
main_csv=timeseries.csv
main_batch_id=main
batch_id_tag=batch_id
validate_uniform_tags=true
supplement_sort_by_variance=true
supplement_csv=experiment_1.csv
supplement_batch_id=experiment_1
supplement_csv=experiment_2.csv
supplement_batch_id=experiment_2
```

运行:

```sh
java -jar tsfile-tools.jar --hybrid_config hybrid.conf
```

附属 CSV 表头须包含 `shared_schema` 中除时间列外的全部业务 TAG 与 FIELD 列(例如 `Region,DeviceId,Temperature,Pressure`)。

对每个附属 CSV 单独处理(默认 `supplement_sort_by_variance=true`):

1. 仅在该 CSV 内计算各 **FIELD** 列方差。
2. 按方差降序确定各 FIELD 列的排序优先级(方差越大,优先级越高)。
3. 以这些 FIELD 列为排序键,按上述优先级对该 CSV 的行做升序多键排序。
4. 写入一个 ChunkGroup;组内时间戳连续(`startId`, `startId+1`, …)。
5. 下一个附属文件从 `maxId + 1` 继续编号(file1: `1..n1`,file2: `n1+1..n1+n2`,…)。

编程接口:`HybridCsvTsFileAssembler.execute(HybridImportConfig)`。

## schema 定义

### 参数
Expand Down
38 changes: 38 additions & 0 deletions java/tools/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -44,6 +44,44 @@ mvn clean package -P with-java -DskipTests
mvn install -P with-java -DskipTests
```

## Hybrid CSV Import

Combine one main time-series CSV (with a real time column) and multiple supplement CSVs (same TAG/FIELD columns, **no** time column) into a **single** TsFile. Supplement rows receive consecutive synthetic timestamps starting at `1`; numbering continues from one supplement file to the next rather than restarting for each file (see step 5 below). Each file is isolated with a virtual TAG `batch_id` (one ChunkGroup per file per business TAG combination).

Example config (`hybrid.conf`):

```
output_tsfile=combined.tsfile
shared_schema=main.schema
main_csv=timeseries.csv
main_batch_id=main
batch_id_tag=batch_id
validate_uniform_tags=true
supplement_sort_by_variance=true
supplement_csv=experiment_1.csv
supplement_batch_id=experiment_1
supplement_csv=experiment_2.csv
supplement_batch_id=experiment_2
```

Run:

```sh
java -jar tsfile-tools.jar --hybrid_config hybrid.conf
```

Supplement CSV headers must list all business TAG and FIELD columns from `shared_schema`, excluding the time column (e.g. `Region,DeviceId,Temperature,Pressure`).

For each supplement CSV separately (`supplement_sort_by_variance=true` by default):

1. Compute variance of each **FIELD** column **within that CSV only**.
2. Rank the FIELD columns by variance, descending (higher variance = higher sort priority).
3. Sort the rows of that CSV in ascending order, using the FIELD columns as sort keys in that priority order (multi-key comparator).
4. Write one ChunkGroup per CSV; timestamps are **consecutive** inside the group (`startId`, `startId+1`, …).
5. The next supplement CSV continues ids from `maxId + 1` (file1: `1..n1`, file2: `n1+1..n1+n2`, …).

Programmatic API: `HybridCsvTsFileAssembler.execute(HybridImportConfig)`.

## Schema Definition

### Parameters
Expand Down
Loading