diff --git a/.github/actions/test_unit/action.yml b/.github/actions/test_unit/action.yml index adb8fc5719b4..4e32a581f650 100644 --- a/.github/actions/test_unit/action.yml +++ b/.github/actions/test_unit/action.yml @@ -16,7 +16,7 @@ runs: RUST_TEST_THREADS: "8" RUST_LOG: ERROR RUST_MIN_STACK: 104857600 - # RUST_BACKTRACE: full + # RUST_BACKTRACE: 1 - name: Upload failure if: failure() diff --git a/Cargo.lock b/Cargo.lock index 0ddf8d4575e1..bfe4ce1cca8c 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -3338,6 +3338,7 @@ dependencies = [ "dyn-clone", "goldenfile", "log", + "maplit", "parking_lot 0.12.3", "parquet", "rand 0.8.5", @@ -4276,14 +4277,15 @@ dependencies = [ "databend-common-catalog", "databend-common-exception", "databend-common-expression", + "databend-common-functions", "databend-common-meta-app", "databend-common-pipeline-core", "databend-common-storage", "databend-common-storages-parquet", + "databend-storages-common-pruner", "databend-storages-common-table-meta", "deltalake", "fastrace 0.7.2", - "maplit", "match-template", "object_store_opendal", "parquet", @@ -4391,7 +4393,6 @@ dependencies = [ "async-recursion", "async-trait", "chrono", - "databend-common-arrow", "databend-common-base", "databend-common-catalog", "databend-common-config", @@ -4405,8 +4406,8 @@ dependencies = [ "databend-common-pipeline-sources", "databend-common-sql", "databend-common-storage", - "databend-storages-common-cache", - "databend-storages-common-index", + "databend-common-storages-parquet", + "databend-storages-common-pruner", "databend-storages-common-table-meta", "fastrace 0.7.2", "faststr", @@ -4414,6 +4415,7 @@ dependencies = [ "hive_metastore", "log", "opendal 0.49.0", + "parquet", "recursive", "serde", "typetag", @@ -4556,6 +4558,7 @@ dependencies = [ "databend-common-settings", "databend-common-sql", "databend-common-storage", + "databend-storages-common-cache", "databend-storages-common-pruner", "databend-storages-common-stage", "databend-storages-common-table-meta", @@ -5395,6 +5398,7 @@ dependencies = [ "hex", "log", "parking_lot 0.12.3", + "parquet", "rayon", "rustix 0.38.37", "siphasher", diff --git a/src/query/catalog/Cargo.toml b/src/query/catalog/Cargo.toml index 922568ad019f..4f1e03024541 100644 --- a/src/query/catalog/Cargo.toml +++ b/src/query/catalog/Cargo.toml @@ -47,6 +47,7 @@ xorf = { version = "0.11.0", default-features = false, features = ["binary-fuse" [dev-dependencies] goldenfile = "1.4" +maplit = "1.0.2" [lints] workspace = true diff --git a/src/query/catalog/src/lib.rs b/src/query/catalog/src/lib.rs index 55c54b31dd96..722d088441b6 100644 --- a/src/query/catalog/src/lib.rs +++ b/src/query/catalog/src/lib.rs @@ -20,6 +20,7 @@ pub mod cluster_info; pub mod database; pub mod lock; pub mod merge_into_join; +pub mod partition_columns; pub mod plan; pub mod query_kind; pub mod runtime_filter_info; diff --git a/src/query/storages/delta/src/partition_columns/mod.rs b/src/query/catalog/src/partition_columns/mod.rs similarity index 94% rename from src/query/storages/delta/src/partition_columns/mod.rs rename to src/query/catalog/src/partition_columns/mod.rs index 1478737f43b1..cb7698f75136 100644 --- a/src/query/storages/delta/src/partition_columns/mod.rs +++ b/src/query/catalog/src/partition_columns/mod.rs @@ -16,4 +16,4 @@ mod pushdown_transform; mod values_serde; pub use pushdown_transform::get_pushdown_without_partition_columns; -pub use values_serde::get_partition_values; +pub use values_serde::str_to_scalar; diff --git 
a/src/query/storages/delta/src/partition_columns/pushdown_transform.rs b/src/query/catalog/src/partition_columns/pushdown_transform.rs similarity index 96% rename from src/query/storages/delta/src/partition_columns/pushdown_transform.rs rename to src/query/catalog/src/partition_columns/pushdown_transform.rs index 489a122f0539..f95db01e446a 100644 --- a/src/query/storages/delta/src/partition_columns/pushdown_transform.rs +++ b/src/query/catalog/src/partition_columns/pushdown_transform.rs @@ -14,12 +14,13 @@ use std::collections::BTreeMap; -use databend_common_catalog::plan::Projection; -use databend_common_catalog::plan::PushDownInfo; use databend_common_exception::ErrorCode; use databend_common_exception::Result; use databend_common_expression::FieldIndex; +use crate::plan::Projection; +use crate::plan::PushDownInfo; + pub fn get_pushdown_without_partition_columns( mut pushdown: PushDownInfo, partition_columns: &[FieldIndex], @@ -87,10 +88,9 @@ fn shift_projection(prj: Projection, partition_columns: &[FieldIndex]) -> Result #[cfg(test)] mod tests { - use databend_common_catalog::plan::Projection; - use super::shift_projection; use super::shift_projection_index; + use crate::plan::Projection; #[test] fn test_shift_projection_index() { diff --git a/src/query/storages/delta/src/partition_columns/values_serde.rs b/src/query/catalog/src/partition_columns/values_serde.rs similarity index 83% rename from src/query/storages/delta/src/partition_columns/values_serde.rs rename to src/query/catalog/src/partition_columns/values_serde.rs index e283b0f4338e..005c48eed91d 100644 --- a/src/query/storages/delta/src/partition_columns/values_serde.rs +++ b/src/query/catalog/src/partition_columns/values_serde.rs @@ -21,8 +21,6 @@ use databend_common_expression::types::DataType; use databend_common_expression::types::NumberDataType; use databend_common_expression::types::NumberScalar; use databend_common_expression::Scalar; -use databend_common_expression::TableField; -use deltalake::kernel::Add; pub fn str_to_scalar(value: &str, data_type: &DataType) -> Result { if value.is_empty() { @@ -81,20 +79,3 @@ pub fn str_to_scalar(value: &str, data_type: &DataType) -> Result { ))), } } - -pub fn get_partition_values(add: &Add, fields: &[&TableField]) -> Result> { - let mut values = Vec::with_capacity(fields.len()); - for f in fields { - match add.partition_values.get(&f.name) { - Some(Some(v)) => values.push(str_to_scalar(v, &f.data_type().into())?), - Some(None) => values.push(Scalar::Null), - None => { - return Err(ErrorCode::BadArguments(format!( - "partition value for column {} not found", - &f.name - ))); - } - } - } - Ok(values) -} diff --git a/src/query/config/src/config.rs b/src/query/config/src/config.rs index ab1046e115c0..5cedd0c0d59c 100644 --- a/src/query/config/src/config.rs +++ b/src/query/config/src/config.rs @@ -2940,11 +2940,7 @@ pub struct DiskCacheConfig { #[serde(default, deny_unknown_fields)] pub struct SpillConfig { /// Path of spill to local disk. disable if it's empty. 
- #[clap( - long, - value_name = "VALUE", - default_value = "./.databend/temp/_query_spill" - )] + #[clap(long, value_name = "VALUE", default_value = "")] pub spill_local_disk_path: OsString, #[clap(long, value_name = "VALUE", default_value = "30")] diff --git a/src/query/config/src/inner.rs b/src/query/config/src/inner.rs index fb4a32c8afa2..7f0df3db978a 100644 --- a/src/query/config/src/inner.rs +++ b/src/query/config/src/inner.rs @@ -723,7 +723,7 @@ pub struct SpillConfig { impl Default for SpillConfig { fn default() -> Self { Self { - path: OsString::from("./.databend/temp/_query_spill"), + path: OsString::from(""), reserved_disk_ratio: OrderedFloat(0.3), global_bytes_limit: u64::MAX, } diff --git a/src/query/service/src/locks/lock_holder.rs b/src/query/service/src/locks/lock_holder.rs index 9f77de14a76d..5b292a63c274 100644 --- a/src/query/service/src/locks/lock_holder.rs +++ b/src/query/service/src/locks/lock_holder.rs @@ -20,19 +20,29 @@ use std::time::Instant; use backoff::backoff::Backoff; use databend_common_base::base::tokio::time::sleep; +use databend_common_base::base::tokio::time::timeout; use databend_common_base::base::WatchNotify; use databend_common_base::runtime::GlobalIORuntime; use databend_common_base::runtime::TrySpawn; use databend_common_catalog::catalog::Catalog; use databend_common_exception::ErrorCode; use databend_common_exception::Result; +use databend_common_meta_api::kv_pb_api::KVPbApi; use databend_common_meta_app::schema::CreateLockRevReq; use databend_common_meta_app::schema::DeleteLockRevReq; use databend_common_meta_app::schema::ExtendLockRevReq; +use databend_common_meta_app::schema::ListLockRevReq; +use databend_common_meta_app::schema::TableLockIdent; +use databend_common_meta_kvapi::kvapi::Key; +use databend_common_meta_types::protobuf::watch_request::FilterType; +use databend_common_meta_types::protobuf::WatchRequest; +use databend_common_metrics::lock::record_acquired_lock_nums; use databend_common_metrics::lock::record_created_lock_nums; use databend_common_storages_fuse::operations::set_backoff; +use databend_common_users::UserApiProvider; use futures::future::select; use futures::future::Either; +use futures_util::StreamExt; use rand::thread_rng; use rand::Rng; @@ -46,13 +56,120 @@ pub struct LockHolder { impl LockHolder { #[async_backtrace::framed] - pub async fn start( + pub(crate) async fn try_acquire_lock( + self: &Arc, + catalog: Arc, + req: CreateLockRevReq, + should_retry: bool, + acquire_timeout: Duration, + ) -> Result { + let start = Instant::now(); + + let ttl = req.ttl; + + let lock_key = req.lock_key.clone(); + let lock_type = lock_key.lock_type().to_string(); + let table_id = lock_key.get_table_id(); + let tenant = lock_key.get_tenant(); + + let revision = self.start(catalog.clone(), req).await?; + + let meta_api = UserApiProvider::instance().get_meta_store_client(); + let list_table_lock_req = ListLockRevReq::new(lock_key.clone()); + + loop { + // List all revisions and check if the current is the minimum. + let mut rev_list = catalog + .list_lock_revisions(list_table_lock_req.clone()) + .await? + .into_iter() + .map(|(x, _)| x) + .collect::>(); + // list_lock_revisions are returned in big-endian order, + // we need to sort them in ascending numeric order. + rev_list.sort(); + let position = rev_list.iter().position(|x| *x == revision).ok_or_else(|| + // If the current is not found in list, it means that the current has been expired. 
+ ErrorCode::TableLockExpired(format!( + "The acquired table lock with revision '{}' maybe expired(elapsed: {:?})", + revision, + start.elapsed(), + )))?; + + if position == 0 { + // The lock is acquired by current session. + let extend_table_lock_req = + ExtendLockRevReq::new(lock_key.clone(), revision, ttl, true); + + catalog.extend_lock_revision(extend_table_lock_req).await?; + // metrics. + record_acquired_lock_nums(lock_type, table_id, 1); + break; + } + + let prev_revision = rev_list[position - 1]; + let elapsed = start.elapsed(); + // if no need retry, return error directly. + if !should_retry || elapsed >= acquire_timeout { + return Err(ErrorCode::TableAlreadyLocked(format!( + "Table is locked by other session(rev: {}, prev: {}, elapsed: {:?})", + revision, + prev_revision, + start.elapsed() + ))); + } + + let watch_delete_ident = TableLockIdent::new(tenant, table_id, prev_revision); + + // Get the previous revision, watch the delete event. + let req = WatchRequest { + key: watch_delete_ident.to_string_key(), + key_end: None, + filter_type: FilterType::Delete.into(), + }; + let mut watch_stream = meta_api.watch(req).await?; + + let lock_meta = meta_api.get_pb(&watch_delete_ident).await?; + if lock_meta.is_none() { + log::warn!( + "Lock revision '{}' already does not exist, skipping", + prev_revision + ); + continue; + } + + // Add a timeout period for watch. + if let Err(_cause) = timeout(acquire_timeout.abs_diff(elapsed), async move { + while let Some(Ok(resp)) = watch_stream.next().await { + if let Some(event) = resp.event { + if event.current.is_none() { + break; + } + } + } + }) + .await + { + return Err(ErrorCode::TableAlreadyLocked(format!( + "Table is locked by other session(rev: {}, prev: {}, elapsed: {:?})", + revision, + prev_revision, + start.elapsed() + ))); + } + } + + Ok(revision) + } + + #[async_backtrace::framed] + async fn start( self: &Arc, - query_id: String, catalog: Arc, req: CreateLockRevReq, ) -> Result { let lock_key = req.lock_key.clone(); + let query_id = req.query_id.clone(); let ttl = req.ttl; let sleep_range = (ttl / 3)..=(ttl * 2 / 3); @@ -61,6 +178,7 @@ impl LockHolder { let revision = res.revision; // metrics. 
record_created_lock_nums(lock_key.lock_type().to_string(), lock_key.get_table_id(), 1); + log::debug!("create table lock success, revision={}", revision); let delete_table_lock_req = DeleteLockRevReq::new(lock_key.clone(), revision); let extend_table_lock_req = ExtendLockRevReq::new(lock_key.clone(), revision, ttl, false); @@ -179,7 +297,10 @@ impl LockHolder { let mut backoff = set_backoff(Some(Duration::from_millis(2)), None, max_retry_elapsed); loop { match catalog.delete_lock_revision(req.clone()).await { - Ok(_) => break, + Ok(_) => { + log::debug!("delete table lock success, revision={}", req.revision); + break; + } Err(e) => match backoff.next_backoff() { Some(duration) => { log::debug!( diff --git a/src/query/service/src/locks/lock_manager.rs b/src/query/service/src/locks/lock_manager.rs index e1b86aa0f1c2..7bd00139c58d 100644 --- a/src/query/service/src/locks/lock_manager.rs +++ b/src/query/service/src/locks/lock_manager.rs @@ -15,35 +15,21 @@ use std::collections::HashMap; use std::sync::Arc; use std::time::Duration; -use std::time::Instant; use databend_common_base::base::tokio::sync::mpsc; -use databend_common_base::base::tokio::time::timeout; use databend_common_base::base::GlobalInstance; use databend_common_base::runtime::GlobalIORuntime; use databend_common_base::runtime::TrySpawn; use databend_common_catalog::lock::Lock; use databend_common_catalog::table_context::TableContext; -use databend_common_exception::ErrorCode; use databend_common_exception::Result; -use databend_common_meta_api::kv_pb_api::KVPbApi; use databend_common_meta_app::schema::CreateLockRevReq; -use databend_common_meta_app::schema::DeleteLockRevReq; -use databend_common_meta_app::schema::ExtendLockRevReq; -use databend_common_meta_app::schema::ListLockRevReq; use databend_common_meta_app::schema::LockKey; use databend_common_meta_app::schema::TableInfo; -use databend_common_meta_app::schema::TableLockIdent; -use databend_common_meta_kvapi::kvapi::Key; -use databend_common_meta_types::protobuf::watch_request::FilterType; -use databend_common_meta_types::protobuf::WatchRequest; use databend_common_metrics::lock::metrics_inc_shutdown_lock_holder_nums; use databend_common_metrics::lock::metrics_inc_start_lock_holder_nums; -use databend_common_metrics::lock::record_acquired_lock_nums; use databend_common_pipeline_core::LockGuard; use databend_common_pipeline_core::UnlockApi; -use databend_common_users::UserApiProvider; -use futures_util::StreamExt; use parking_lot::RwLock; use crate::locks::lock_holder::LockHolder; @@ -97,129 +83,34 @@ impl LockManager { catalog_name: &str, should_retry: bool, ) -> Result>> { - let start = Instant::now(); + let acquire_timeout = Duration::from_secs(ctx.get_settings().get_acquire_lock_timeout()?); - let lock_type = lock_key.lock_type().to_string(); - let table_id = lock_key.get_table_id(); - let tenant = lock_key.get_tenant(); - let expire_secs = ctx.get_settings().get_table_lock_expire_secs()?; - let query_id = ctx.get_id(); + let ttl = Duration::from_secs(ctx.get_settings().get_table_lock_expire_secs()?); let req = CreateLockRevReq::new( - lock_key.clone(), + lock_key, ctx.get_current_user()?.name, // user ctx.get_cluster().local_id.clone(), // node - query_id.clone(), // query_id - Duration::from_secs(expire_secs), + ctx.get_id(), // query_id + ttl, ); let catalog = ctx.get_catalog(catalog_name).await?; let lock_holder = Arc::new(LockHolder::default()); - let revision = lock_holder.start(query_id, catalog.clone(), req).await?; - - self.insert_lock(revision, lock_holder); - 
let guard = LockGuard::new(self.clone(), revision); - - let acquire_lock_timeout = ctx.get_settings().get_acquire_lock_timeout()?; - let duration = Duration::from_secs(acquire_lock_timeout); - let meta_api = UserApiProvider::instance().get_meta_store_client(); - - let list_table_lock_req = ListLockRevReq::new(lock_key.clone()); - - let delete_table_lock_req = DeleteLockRevReq::new(lock_key.clone(), revision); - - loop { - // List all revisions and check if the current is the minimum. - let mut rev_list = catalog - .list_lock_revisions(list_table_lock_req.clone()) - .await? - .into_iter() - .map(|(x, _)| x) - .collect::>(); - // list_lock_revisions are returned in big-endian order, - // we need to sort them in ascending numeric order. - rev_list.sort(); - let position = rev_list.iter().position(|x| *x == revision).ok_or_else(|| - // If the current is not found in list, it means that the current has been expired. - ErrorCode::TableLockExpired(format!( - "the acquired table lock with revision '{}' is not in {:?}, maybe expired(elapsed: {:?})", - revision, - rev_list, - start.elapsed(), - )))?; - - if position == 0 { - // The lock is acquired by current session. - let extend_table_lock_req = ExtendLockRevReq::new( - lock_key.clone(), - revision, - Duration::from_secs(expire_secs), - true, - ); - - catalog.extend_lock_revision(extend_table_lock_req).await?; - // metrics. - record_acquired_lock_nums(lock_type, table_id, 1); - break; + match lock_holder + .try_acquire_lock(catalog, req, should_retry, acquire_timeout) + .await + { + Ok(revision) => { + self.insert_lock(revision, lock_holder); + let guard = LockGuard::new(self.clone(), revision); + Ok(Some(Arc::new(guard))) } - - let elapsed = start.elapsed(); - // if no need retry, return error directly. - if !should_retry || elapsed >= duration { - catalog - .delete_lock_revision(delete_table_lock_req.clone()) - .await?; - return Err(ErrorCode::TableAlreadyLocked(format!( - "table is locked by other session, please retry later(elapsed: {:?})", - elapsed - ))); + Err(err) => { + lock_holder.shutdown(); + Err(err) } - - let watch_delete_ident = TableLockIdent::new(tenant, table_id, rev_list[position - 1]); - - // Get the previous revision, watch the delete event. - let req = WatchRequest { - key: watch_delete_ident.to_string_key(), - key_end: None, - filter_type: FilterType::Delete.into(), - }; - let mut watch_stream = meta_api.watch(req).await?; - - let lock_meta = meta_api.get_pb(&watch_delete_ident).await?; - if lock_meta.is_none() { - log::warn!( - "Lock revision '{}' already does not exist, skipping", - rev_list[position - 1] - ); - continue; - } - - // Add a timeout period for watch. 
- match timeout(duration.abs_diff(elapsed), async move { - while let Some(Ok(resp)) = watch_stream.next().await { - if let Some(event) = resp.event { - if event.current.is_none() { - break; - } - } - } - }) - .await - { - Ok(_) => Ok(()), - Err(_) => { - catalog - .delete_lock_revision(delete_table_lock_req.clone()) - .await?; - Err(ErrorCode::TableAlreadyLocked(format!( - "table is locked by other session, please retry later(elapsed: {:?})", - start.elapsed() - ))) - } - }?; } - - Ok(Some(Arc::new(guard))) } fn insert_lock(&self, revision: u64, lock_holder: Arc) { diff --git a/src/query/service/tests/it/parquet_rs/data.rs b/src/query/service/tests/it/parquet_rs/data.rs index 22e1fa57a535..28d3f8354aef 100644 --- a/src/query/service/tests/it/parquet_rs/data.rs +++ b/src/query/service/tests/it/parquet_rs/data.rs @@ -12,6 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. +use std::os::unix::fs::PermissionsExt; use std::sync::Arc; use arrow_array::Array; @@ -36,6 +37,7 @@ use chrono::Duration; use parquet::arrow::ArrowWriter; use parquet::file::properties::WriterProperties; use tempfile::NamedTempFile; +use tokio::fs::create_dir_all; // Test cases from apache/arrow-datafusion @@ -336,10 +338,13 @@ fn create_data_batch(scenario: Scenario) -> Vec { /// Create a test parquet file with various data types pub async fn make_test_file_rg(scenario: Scenario) -> (NamedTempFile, SchemaRef) { + let dir = std::env::temp_dir().join("parquets_rg"); + create_dir_all(&dir).await.unwrap(); let mut output_file = tempfile::Builder::new() .prefix("parquet_pruning") .suffix(".parquet") - .tempfile() + .permissions(std::fs::Permissions::from_mode(0o666)) + .tempfile_in(dir) .expect("tempfile creation"); let props = WriterProperties::builder() @@ -362,10 +367,13 @@ pub async fn make_test_file_rg(scenario: Scenario) -> (NamedTempFile, SchemaRef) } pub async fn make_test_file_page(scenario: Scenario) -> (NamedTempFile, SchemaRef) { + let dir = std::env::temp_dir().join("parquets_page"); + create_dir_all(&dir).await.unwrap(); let mut output_file = tempfile::Builder::new() .prefix("parquet_page_pruning") .suffix(".parquet") - .tempfile() + .permissions(std::fs::Permissions::from_mode(0o666)) + .tempfile_in(dir) .expect("tempfile creation"); // set row count to 5, should get same result as rowGroup diff --git a/src/query/service/tests/it/parquet_rs/prune_pages.rs b/src/query/service/tests/it/parquet_rs/prune_pages.rs index 464dcab5cf95..41f2f1cb6933 100644 --- a/src/query/service/tests/it/parquet_rs/prune_pages.rs +++ b/src/query/service/tests/it/parquet_rs/prune_pages.rs @@ -29,424 +29,334 @@ use crate::parquet_rs::data::Scenario; use crate::parquet_rs::utils::create_parquet_test_fixture; use crate::parquet_rs::utils::get_data_source_plan; -async fn test(scenario: Scenario, predicate: &str, expected_selection: RowSelection) { - let (file, arrow_schema) = make_test_file_page(scenario).await; - let file_path = file.path().to_string_lossy(); - let sql = format!("select * from 'fs://{file_path}' where {predicate}"); - +async fn test_batch(batches: &[(Scenario, &str, RowSelection)]) { let fixture = create_parquet_test_fixture().await; - let plan = get_data_source_plan(fixture.new_query_ctx().await.unwrap(), &sql) - .await + for (scenario, predicate, expected_selection) in batches { + let (file, arrow_schema) = make_test_file_page(*scenario).await; + let file_path = file.path().to_string_lossy(); + let sql = format!("select * from 'fs://{file_path}' where 
{predicate}"); + + let plan = get_data_source_plan(fixture.new_query_ctx().await.unwrap(), &sql) + .await + .unwrap(); + let metadata = ArrowReaderMetadata::load( + file.as_file(), + ArrowReaderOptions::new() + .with_page_index(true) + .with_skip_arrow_metadata(true), + ) .unwrap(); - let metadata = ArrowReaderMetadata::load( - file.as_file(), - ArrowReaderOptions::new() - .with_page_index(true) - .with_skip_arrow_metadata(true), - ) - .unwrap(); - let parquet_meta = metadata.metadata(); - let schema = TableSchema::try_from(arrow_schema.as_ref()).unwrap(); - let leaf_fields = Arc::new(schema.leaf_fields()); - - let pruner = ParquetRSPruner::try_create( - FunctionContext::default(), - Arc::new(schema), - leaf_fields, - &plan.push_downs, - ParquetReadOptions::default() - .with_prune_row_groups(false) - .with_prune_pages(true), - vec![], - ) - .unwrap(); - - let row_groups = (0..parquet_meta.num_row_groups()).collect::>(); - let selection = pruner - .prune_pages(parquet_meta, &row_groups, None) - .unwrap() + let parquet_meta = metadata.metadata(); + let schema = TableSchema::try_from(arrow_schema.as_ref()).unwrap(); + let leaf_fields = Arc::new(schema.leaf_fields()); + + let pruner = ParquetRSPruner::try_create( + FunctionContext::default(), + Arc::new(schema), + leaf_fields, + &plan.push_downs, + ParquetReadOptions::default() + .with_prune_row_groups(false) + .with_prune_pages(true), + vec![], + ) .unwrap(); - assert_eq!( - expected_selection, selection, - "Expected {:?}, got {:?}. Scenario: {:?}, predicate: {}", - expected_selection, selection, scenario, predicate - ); -} - -#[tokio::test(flavor = "multi_thread", worker_threads = 2)] -// null count min max -// page-0 1 2020-01-01T01:01:01.000000 2020-01-02T01:01:01.000000 -// page-1 1 2020-01-01T01:01:11.000000 2020-01-02T01:01:11.000000 -// page-2 1 2020-01-01T01:11:01.000000 2020-01-02T01:11:01.000000 -// page-3 1 2020-01-11T01:01:01.000000 2020-01-12T01:01:01.000000 -async fn test_timestamp() { - test( - Scenario::Timestamp, - "micros < to_timestamp('2020-01-02 01:01:11Z')", - RowSelection::from(vec![RowSelector::select(15), RowSelector::skip(5)]), - ) - .await; -} - -#[tokio::test(flavor = "multi_thread", worker_threads = 2)] -// null count min max -// page-0 1 2020-01-01 2020-01-04 -// page-1 1 2020-01-11 2020-01-14 -// page-2 1 2020-10-27 2020-10-30 -// page-3 1 2029-11-09 2029-11-12 -async fn test_date() { - test( - Scenario::Date, - "date32 < to_date('2020-01-02')", - RowSelection::from(vec![RowSelector::select(5), RowSelector::skip(15)]), - ) - .await; -} - -#[tokio::test(flavor = "multi_thread", worker_threads = 2)] -// null count min max -// page-0 0 -5 -1 -// page-1 0 -4 0 -// page-2 0 0 4 -// page-3 0 5 9 -async fn test_int32_lt() { - test( - Scenario::Int32, - "i < 1", - RowSelection::from(vec![RowSelector::select(15), RowSelector::skip(5)]), - ) - .await; - // result of sql "SELECT * FROM t where i < 1" is same as - // "SELECT * FROM t where -i > -1" - test( - Scenario::Int32, - "-i > -1", - RowSelection::from(vec![RowSelector::select(15), RowSelector::skip(5)]), - ) - .await; -} - -#[tokio::test(flavor = "multi_thread", worker_threads = 2)] -async fn test_int32_gt() { - test( - Scenario::Int32, - "i > 8", - RowSelection::from(vec![RowSelector::skip(15), RowSelector::select(5)]), - ) - .await; - - test( - Scenario::Int32, - "-i < -8", - RowSelection::from(vec![RowSelector::skip(15), RowSelector::select(5)]), - ) - .await; + let row_groups = (0..parquet_meta.num_row_groups()).collect::>(); + let selection = pruner + 
.prune_pages(parquet_meta, &row_groups, None) + .unwrap() + .unwrap(); + + let expected_selection = expected_selection.clone(); + assert_eq!( + expected_selection, selection, + "Expected {:?}, got {:?}. Scenario: {:?}, predicate: {}", + expected_selection, selection, scenario, predicate + ); + } } -#[tokio::test(flavor = "multi_thread", worker_threads = 2)] -async fn test_int32_eq() { - test( - Scenario::Int32, - "i = 1", - RowSelection::from(vec![ - RowSelector::skip(10), - RowSelector::select(5), - RowSelector::skip(5), - ]), - ) - .await; -} -#[tokio::test(flavor = "multi_thread", worker_threads = 2)] -async fn test_int32_scalar_fun_and_eq() { - test( - Scenario::Int32, - "abs(i) = 1 and i = 1", - RowSelection::from(vec![ - RowSelector::skip(10), - RowSelector::select(5), - RowSelector::skip(5), - ]), - ) - .await; -} - -#[tokio::test(flavor = "multi_thread", worker_threads = 2)] -async fn test_int32_scalar_fun() { - test( - Scenario::Int32, - "abs(i) = 1", - RowSelection::from(vec![RowSelector::select(15), RowSelector::skip(5)]), - ) - .await; -} - -#[tokio::test(flavor = "multi_thread", worker_threads = 2)] -async fn test_int32_complex_expr() { - test( - Scenario::Int32, - "i+1 = 1", - RowSelection::from(vec![ - RowSelector::skip(5), - RowSelector::select(10), - RowSelector::skip(5), - ]), - ) - .await; -} - -#[tokio::test(flavor = "multi_thread", worker_threads = 2)] -async fn test_int32_complex_expr_subtract() { - test( - Scenario::Int32, - "1-i > 1", - RowSelection::from(vec![RowSelector::select(10), RowSelector::skip(10)]), - ) - .await; -} - -#[tokio::test(flavor = "multi_thread", worker_threads = 2)] -// null count min max -// page-0 0 -5.0 -1.0 -// page-1 0 -4.0 0.0 -// page-2 0 0.0 4.0 -// page-3 0 5.0 9.0 -async fn test_f64_lt() { - test( - Scenario::Float64, - "f < 1", - RowSelection::from(vec![RowSelector::select(15), RowSelector::skip(5)]), - ) - .await; - test( - Scenario::Float64, - "-f > -1", - RowSelection::from(vec![RowSelector::select(15), RowSelector::skip(5)]), - ) - .await; +#[tokio::test(flavor = "multi_thread", worker_threads = 1)] +async fn test_basic() { + let test_cases = vec![ + // Timestamp tests + // null count min max + // page-0 1 2020-01-01T01:01:01.000000 2020-01-02T01:01:01.000000 + // page-1 1 2020-01-01T01:01:11.000000 2020-01-02T01:01:11.000000 + // page-2 1 2020-01-01T01:11:01.000000 2020-01-02T01:11:01.000000 + // page-3 1 2020-01-11T01:01:01.000000 2020-01-12T01:01:01.000000 + ( + Scenario::Timestamp, + "micros < to_timestamp('2020-01-02 01:01:11Z')", + RowSelection::from(vec![RowSelector::select(15), RowSelector::skip(5)]), + ), + // Date tests + // null count min max + // page-0 1 2020-01-01 2020-01-04 + // page-1 1 2020-01-11 2020-01-14 + // page-2 1 2020-10-27 2020-10-30 + // page-3 1 2029-11-09 2029-11-12 + ( + Scenario::Date, + "date32 < to_date('2020-01-02')", + RowSelection::from(vec![RowSelector::select(5), RowSelector::skip(15)]), + ), + // Int32 tests + // null count min max + // page-0 0 -5 -1 + // page-1 0 -4 0 + // page-2 0 0 4 + // page-3 0 5 9 + ( + Scenario::Int32, + "i < 1", + RowSelection::from(vec![RowSelector::select(15), RowSelector::skip(5)]), + ), + // result of sql "SELECT * FROM t where i < 1" is same as + // "SELECT * FROM t where -i > -1" + ( + Scenario::Int32, + "-i > -1", + RowSelection::from(vec![RowSelector::select(15), RowSelector::skip(5)]), + ), + ( + Scenario::Int32, + "i > 8", + RowSelection::from(vec![RowSelector::skip(15), RowSelector::select(5)]), + ), + ( + Scenario::Int32, + "-i < -8", + 
RowSelection::from(vec![RowSelector::skip(15), RowSelector::select(5)]), + ), + ( + Scenario::Int32, + "i = 1", + RowSelection::from(vec![ + RowSelector::skip(10), + RowSelector::select(5), + RowSelector::skip(5), + ]), + ), + ( + Scenario::Int32, + "abs(i) = 1 and i = 1", + RowSelection::from(vec![ + RowSelector::skip(10), + RowSelector::select(5), + RowSelector::skip(5), + ]), + ), + ( + Scenario::Int32, + "abs(i) = 1", + RowSelection::from(vec![RowSelector::select(15), RowSelector::skip(5)]), + ), + ( + Scenario::Int32, + "i+1 = 1", + RowSelection::from(vec![ + RowSelector::skip(5), + RowSelector::select(10), + RowSelector::skip(5), + ]), + ), + ( + Scenario::Int32, + "1-i > 1", + RowSelection::from(vec![RowSelector::select(10), RowSelector::skip(10)]), + ), + // Float64 tests + // null count min max + // page-0 0 -5.0 -1.0 + // page-1 0 -4.0 0.0 + // page-2 0 0.0 4.0 + // page-3 0 5.0 9.0 + ( + Scenario::Float64, + "f < 1", + RowSelection::from(vec![RowSelector::select(15), RowSelector::skip(5)]), + ), + ( + Scenario::Float64, + "-f > -1", + RowSelection::from(vec![RowSelector::select(15), RowSelector::skip(5)]), + ), + ( + Scenario::Float64, + "abs(f - 1) <= 0.000001 and f >= 0.1", + RowSelection::from(vec![ + RowSelector::skip(10), + RowSelector::select(5), + RowSelector::skip(5), + ]), + ), + ( + Scenario::Float64, + "abs(f-1) <= 0.000001", + RowSelection::from(vec![ + RowSelector::skip(10), + RowSelector::select(5), + RowSelector::skip(5), + ]), + ), + ( + Scenario::Float64, + "f+1 > 1.1", + RowSelection::from(vec![RowSelector::skip(10), RowSelector::select(10)]), + ), + ( + Scenario::Float64, + "1-f > 1", + RowSelection::from(vec![RowSelector::select(10), RowSelector::skip(10)]), + ), + // Int32 in list tests + // null count min max + // page-0 0 -5 -1 + // page-1 0 -4 0 + // page-2 0 0 4 + // page-3 0 5 9 + ( + Scenario::Int32, + "i in (1)", + RowSelection::from(vec![ + RowSelector::skip(10), + RowSelector::select(5), + RowSelector::skip(5), + ]), + ), + ( + Scenario::Int32, + "i in (100)", + RowSelection::from(vec![RowSelector::skip(20)]), + ), + ( + Scenario::Int32, + "i not in (1)", + RowSelection::from(vec![RowSelector::select(20)]), + ), + // Decimal tests + // The data type of decimal_col is decimal(9,2) + // There are three pages each 5 rows: + // [1.00, 6.00], [-5.00,6.00], [20.00,60.00] + ( + Scenario::Decimal, + "decimal_col < 4", + RowSelection::from(vec![RowSelector::select(10), RowSelector::skip(5)]), + ), + // compare with the casted decimal value + ( + Scenario::Decimal, + "decimal_col < cast(4.55 as decimal(20,2))", + RowSelection::from(vec![RowSelector::select(10), RowSelector::skip(5)]), + ), + // The data type of decimal_col is decimal(38,2) + ( + Scenario::DecimalLargePrecision, + "decimal_col < 4", + RowSelection::from(vec![RowSelector::select(10), RowSelector::skip(5)]), + ), + // compare with the casted decimal value + ( + Scenario::DecimalLargePrecision, + "decimal_col < cast(4.55 as decimal(20,2))", + RowSelection::from(vec![RowSelector::select(10), RowSelector::skip(5)]), + ), + // The data type of decimal_col is decimal(9,2) + // There are three pages: + // [1.00, 6.00], [-5.00,6.00], [20.00,60.00] + ( + Scenario::Decimal, + "decimal_col = 4", + RowSelection::from(vec![RowSelector::select(10), RowSelector::skip(5)]), + ), + ( + Scenario::Decimal, + "decimal_col = 4.00", + RowSelection::from(vec![RowSelector::select(10), RowSelector::skip(5)]), + ), + // The data type of decimal_col is decimal(38,2) + ( + Scenario::DecimalLargePrecision, + "decimal_col = 
4", + RowSelection::from(vec![RowSelector::select(10), RowSelector::skip(5)]), + ), + ( + Scenario::DecimalLargePrecision, + "decimal_col = 4.00", + RowSelection::from(vec![RowSelector::select(10), RowSelector::skip(5)]), + ), + ( + Scenario::DecimalLargePrecision, + "decimal_col = 30.00", + RowSelection::from(vec![RowSelector::skip(10), RowSelector::select(5)]), + ), + ]; + + test_batch(&test_cases).await; } -#[tokio::test(flavor = "multi_thread", worker_threads = 2)] -async fn test_f64_scalar_fun_and_gt() { - test( - Scenario::Float64, - "abs(f - 1) <= 0.000001 and f >= 0.1", - RowSelection::from(vec![ - RowSelector::skip(10), - RowSelector::select(5), - RowSelector::skip(5), - ]), - ) - .await; -} - -#[tokio::test(flavor = "multi_thread", worker_threads = 2)] -async fn test_f64_scalar_fun() { - test( - Scenario::Float64, - "abs(f-1) <= 0.000001", - RowSelection::from(vec![ - RowSelector::skip(10), - RowSelector::select(5), - RowSelector::skip(5), - ]), - ) - .await; -} - -#[tokio::test(flavor = "multi_thread", worker_threads = 2)] -async fn test_f64_complex_expr() { - test( - Scenario::Float64, - "f+1 > 1.1", - RowSelection::from(vec![RowSelector::skip(10), RowSelector::select(10)]), - ) - .await; -} - -#[tokio::test(flavor = "multi_thread", worker_threads = 2)] -async fn test_f64_complex_expr_subtract() { - test( - Scenario::Float64, - "1-f > 1", - RowSelection::from(vec![RowSelector::select(10), RowSelector::skip(10)]), - ) - .await; -} - -#[tokio::test(flavor = "multi_thread", worker_threads = 2)] -// null count min max -// page-0 0 -5 -1 -// page-1 0 -4 0 -// page-2 0 0 4 -// page-3 0 5 9 -async fn test_int32_eq_in_list() { - test( - Scenario::Int32, - "i in (1)", - RowSelection::from(vec![ - RowSelector::skip(10), - RowSelector::select(5), - RowSelector::skip(5), - ]), - ) - .await; -} - -#[tokio::test(flavor = "multi_thread", worker_threads = 2)] -async fn test_int32_eq_in_list_2() { - test( - Scenario::Int32, - "i in (100)", - RowSelection::from(vec![RowSelector::skip(20)]), - ) - .await; -} - -#[tokio::test(flavor = "multi_thread", worker_threads = 2)] -async fn test_int32_eq_in_list_negated() { - test( - Scenario::Int32, - "i not in (1)", - RowSelection::from(vec![RowSelector::select(20)]), - ) - .await; -} - -#[tokio::test(flavor = "multi_thread", worker_threads = 2)] -async fn test_decimal_lt() { - // The data type of decimal_col is decimal(9,2) - // There are three pages each 5 rows: - // [1.00, 6.00], [-5.00,6.00], [20.00,60.00] - test( - Scenario::Decimal, - "decimal_col < 4", - RowSelection::from(vec![RowSelector::select(10), RowSelector::skip(5)]), - ) - .await; - // compare with the casted decimal value - test( - Scenario::Decimal, - "decimal_col < cast(4.55 as decimal(20,2))", - RowSelection::from(vec![RowSelector::select(10), RowSelector::skip(5)]), - ) - .await; - - // The data type of decimal_col is decimal(38,2) - test( - Scenario::DecimalLargePrecision, - "decimal_col < 4", - RowSelection::from(vec![RowSelector::select(10), RowSelector::skip(5)]), - ) - .await; - // compare with the casted decimal value - test( - Scenario::DecimalLargePrecision, - "decimal_col < cast(4.55 as decimal(20,2))", - RowSelection::from(vec![RowSelector::select(10), RowSelector::skip(5)]), - ) - .await; -} - -#[tokio::test(flavor = "multi_thread", worker_threads = 2)] -async fn test_decimal_eq() { - // The data type of decimal_col is decimal(9,2) - // There are three pages: - // [1.00, 6.00], [-5.00,6.00], [20.00,60.00] - test( - Scenario::Decimal, - "decimal_col = 4", - 
RowSelection::from(vec![RowSelector::select(10), RowSelector::skip(5)]), - ) - .await; - test( - Scenario::Decimal, - "decimal_col = 4.00", - RowSelection::from(vec![RowSelector::select(10), RowSelector::skip(5)]), - ) - .await; - - // The data type of decimal_col is decimal(38,2) - test( - Scenario::DecimalLargePrecision, - "decimal_col = 4", - RowSelection::from(vec![RowSelector::select(10), RowSelector::skip(5)]), - ) - .await; - test( - Scenario::DecimalLargePrecision, - "decimal_col = 4.00", - RowSelection::from(vec![RowSelector::select(10), RowSelector::skip(5)]), - ) - .await; - test( - Scenario::DecimalLargePrecision, - "decimal_col = 30.00", - RowSelection::from(vec![RowSelector::skip(10), RowSelector::select(5)]), - ) - .await; -} - -#[tokio::test(flavor = "multi_thread", worker_threads = 2)] +#[tokio::test(flavor = "multi_thread", worker_threads = 1)] async fn test_decimal_in_list() { // The data type of decimal_col is decimal(9,2) // There are three pages: // [1.00, 6.00], [-5.00,6.00], [20.00,60.00] - test( - Scenario::Decimal, - "decimal_col in (4,3,123456789123)", - RowSelection::from(vec![RowSelector::select(10), RowSelector::skip(5)]), - ) - .await; - test( - Scenario::Decimal, - "decimal_col in (4.00,3.00,11.2345)", - RowSelection::from(vec![RowSelector::select(10), RowSelector::skip(5)]), - ) - .await; - - // The data type of decimal_col is decimal(38,2) - test( - Scenario::DecimalLargePrecision, - "decimal_col in (4,3,123456789123)", - RowSelection::from(vec![RowSelector::select(10), RowSelector::skip(5)]), - ) - .await; - test( - Scenario::DecimalLargePrecision, - "decimal_col in (4.00,3.00,11.2345,1)", - RowSelection::from(vec![RowSelector::select(10), RowSelector::skip(5)]), - ) - .await; + let cases = vec![ + ( + Scenario::Decimal, + "decimal_col in (4,3,123456789123)", + RowSelection::from(vec![RowSelector::select(10), RowSelector::skip(5)]), + ), + ( + Scenario::Decimal, + "decimal_col in (4.00,3.00,11.2345)", + RowSelection::from(vec![RowSelector::select(10), RowSelector::skip(5)]), + ), + ( + Scenario::DecimalLargePrecision, + "decimal_col in (4,3,123456789123)", + RowSelection::from(vec![RowSelector::select(10), RowSelector::skip(5)]), + ), + ( + Scenario::DecimalLargePrecision, + "decimal_col in (4.00,3.00,11.2345,1)", + RowSelection::from(vec![RowSelector::select(10), RowSelector::skip(5)]), + ), + ]; + + test_batch(&cases).await; } -#[tokio::test(flavor = "multi_thread", worker_threads = 2)] +#[tokio::test(flavor = "multi_thread", worker_threads = 1)] async fn test_periods_in_column_names() { - // There are three row groups for "service.name", each with 5 rows = 15 rows total - // name = "HTTP GET / DISPATCH", service.name = ['frontend', 'frontend'], - // name = "HTTP PUT / DISPATCH", service.name = ['backend', 'frontend'], - // name = "HTTP GET / DISPATCH", service.name = ['backend', 'backend' ], - test( - Scenario::PeriodsInColumnNames, - // use double quotes to use column named "service.name" - "\"service.name\" = 'frontend'", - RowSelection::from(vec![RowSelector::select(10), RowSelector::skip(5)]), - ) - .await; - test( - Scenario::PeriodsInColumnNames, - "name <> 'HTTP GET / DISPATCH'", - RowSelection::from(vec![ - RowSelector::skip(5), - RowSelector::select(5), - RowSelector::skip(5), - ]), - ) - .await; - test( - Scenario::PeriodsInColumnNames, - "\"service.name\" = 'frontend' AND name = 'HTTP GET / DISPATCH'", - RowSelection::from(vec![RowSelector::select(5), RowSelector::skip(10)]), - ) - .await; + let test_cases = vec![ + // Tests for periods in 
column names + // There are three row groups for "service.name", each with 5 rows = 15 rows total + // name = "HTTP GET / DISPATCH", service.name = ['frontend', 'frontend'], + // name = "HTTP PUT / DISPATCH", service.name = ['backend', 'frontend'], + // name = "HTTP GET / DISPATCH", service.name = ['backend', 'backend' ], + ( + Scenario::PeriodsInColumnNames, + // use double quotes to use column named "service.name" + "\"service.name\" = 'frontend'", + RowSelection::from(vec![RowSelector::select(10), RowSelector::skip(5)]), + ), + ( + Scenario::PeriodsInColumnNames, + "name <> 'HTTP GET / DISPATCH'", + RowSelection::from(vec![ + RowSelector::skip(5), + RowSelector::select(5), + RowSelector::skip(5), + ]), + ), + ( + Scenario::PeriodsInColumnNames, + "\"service.name\" = 'frontend' AND name = 'HTTP GET / DISPATCH'", + RowSelection::from(vec![RowSelector::select(5), RowSelector::skip(10)]), + ), + ]; + + test_batch(&test_cases).await; } diff --git a/src/query/service/tests/it/parquet_rs/prune_row_groups.rs b/src/query/service/tests/it/parquet_rs/prune_row_groups.rs index a0f18b4c8c61..28edf445e1af 100644 --- a/src/query/service/tests/it/parquet_rs/prune_row_groups.rs +++ b/src/query/service/tests/it/parquet_rs/prune_row_groups.rs @@ -26,262 +26,177 @@ use super::utils::get_data_source_plan; use crate::parquet_rs::utils::create_parquet_test_fixture; /// Enable row groups pruning and test. -async fn test(scenario: Scenario, predicate: &str, expected_rgs: Vec) { - test_impl(scenario, predicate, expected_rgs, true).await +async fn test_batch(args: &[(Scenario, &str, Vec)]) { + test_impl_batch(args, true).await } // Disable row groups pruning and test. -async fn test_without_prune(scenario: Scenario, predicate: &str, expected_rgs: Vec) { - test_impl(scenario, predicate, expected_rgs, false).await +async fn test_batch_without_prune(args: &[(Scenario, &str, Vec)]) { + test_impl_batch(args, false).await } -async fn test_impl(scenario: Scenario, predicate: &str, expected_rgs: Vec, prune: bool) { - let (file, arrow_schema) = make_test_file_rg(scenario).await; - let file_path = file.path().to_string_lossy(); - let sql = format!("select * from 'fs://{file_path}' where {predicate}"); - +async fn test_impl_batch(args: &[(Scenario, &str, Vec)], prune: bool) { let fixture = create_parquet_test_fixture().await; - let plan = get_data_source_plan(fixture.new_query_ctx().await.unwrap(), &sql) - .await - .unwrap(); - let parquet_meta = parquet::file::footer::parse_metadata(file.as_file()).unwrap(); - let schema = TableSchema::try_from(arrow_schema.as_ref()).unwrap(); - let leaf_fields = Arc::new(schema.leaf_fields()); - - let pruner = ParquetRSPruner::try_create( - FunctionContext::default(), - Arc::new(schema), - leaf_fields, - &plan.push_downs, - ParquetReadOptions::default() - .with_prune_row_groups(prune) - .with_prune_pages(false), - vec![], - ) - .unwrap(); - - let (rgs, _) = pruner.prune_row_groups(&parquet_meta, None, None).unwrap(); - assert_eq!( - expected_rgs, rgs, - "Expected {:?}, got {:?}. 
Scenario: {:?}, predicate: {}", - expected_rgs, rgs, scenario, predicate - ); -} + for (scenario, predicate, expected_rgs) in args { + let (file, arrow_schema) = make_test_file_rg(*scenario).await; + let file_path = file.path().to_string_lossy(); + let sql = format!("select * from 'fs://{file_path}' where {predicate}"); + + let plan = get_data_source_plan(fixture.new_query_ctx().await.unwrap(), &sql) + .await + .unwrap(); + let parquet_meta = parquet::file::footer::parse_metadata(file.as_file()).unwrap(); + let schema = TableSchema::try_from(arrow_schema.as_ref()).unwrap(); + let leaf_fields = Arc::new(schema.leaf_fields()); + + let pruner = ParquetRSPruner::try_create( + FunctionContext::default(), + Arc::new(schema), + leaf_fields, + &plan.push_downs, + ParquetReadOptions::default() + .with_prune_row_groups(prune) + .with_prune_pages(false), + vec![], + ) + .unwrap(); -#[tokio::test(flavor = "multi_thread", worker_threads = 2)] -async fn test_timestamp() { - test( - Scenario::Timestamp, - "micros < to_timestamp('2020-01-02 01:01:11Z')", - vec![0, 1, 2], - ) - .await; -} + let (rgs, _) = pruner.prune_row_groups(&parquet_meta, None, None).unwrap(); -#[tokio::test(flavor = "multi_thread", worker_threads = 2)] -async fn test_date() { - test(Scenario::Date, "date32 < to_date('2020-01-02')", vec![0]).await; + assert_eq!( + expected_rgs.to_vec(), + rgs, + "Expected {:?}, got {:?}. Scenario: {:?}, predicate: {}", + expected_rgs, + rgs, + scenario, + predicate + ); + } } -#[tokio::test(flavor = "multi_thread", worker_threads = 2)] +#[tokio::test(flavor = "multi_thread", worker_threads = 1)] async fn test_disabled() { - test( + test_batch(&[( Scenario::Timestamp, "micros < to_timestamp('2020-01-02 01:01:11Z')", vec![0, 1, 2], - ) + )]) .await; - - test_without_prune( + test_batch_without_prune(&[( Scenario::Timestamp, "micros < to_timestamp('2020-01-02 01:01:11Z')", vec![0, 1, 2, 3], - ) - .await; -} - -#[tokio::test(flavor = "multi_thread", worker_threads = 2)] -async fn test_int32_lt() { - test(Scenario::Int32, "i < 1", vec![0, 1, 2]).await; - // result of sql "SELECT * FROM t where i < 1" is same as - // "SELECT * FROM t where -i > -1" - test(Scenario::Int32, " -i > -1", vec![0, 1, 2]).await -} - -#[tokio::test(flavor = "multi_thread", worker_threads = 2)] -async fn test_int32_eq() { - test(Scenario::Int32, "i = 1", vec![2]).await; -} -#[tokio::test(flavor = "multi_thread", worker_threads = 2)] -async fn test_int32_scalar_fun_and_eq() { - test(Scenario::Int32, "abs(i) = 1 and i = 1", vec![2]).await; -} - -#[tokio::test(flavor = "multi_thread", worker_threads = 2)] -async fn test_int32_scalar_fun() { - test(Scenario::Int32, "abs(i) = 1", vec![0, 1, 2]).await; -} - -#[tokio::test(flavor = "multi_thread", worker_threads = 2)] -async fn test_int32_complex_expr() { - test(Scenario::Int32, "i+1 = 1", vec![1, 2]).await; -} - -#[tokio::test(flavor = "multi_thread", worker_threads = 2)] -async fn test_int32_complex_expr_subtract() { - test(Scenario::Int32, "1-i > 1", vec![0, 1]).await; -} - -#[tokio::test(flavor = "multi_thread", worker_threads = 2)] -async fn test_f64_lt() { - test(Scenario::Float64, "f < 1", vec![0, 1, 2]).await; - test(Scenario::Float64, "-f > -1", vec![0, 1, 2]).await; -} - -#[tokio::test(flavor = "multi_thread", worker_threads = 2)] -async fn test_f64_scalar_fun_and_gt() { - test( - Scenario::Float64, - "abs(f - 1) <= 0.000001 and f >= 0.1", - vec![2], - ) - .await; -} - -#[tokio::test(flavor = "multi_thread", worker_threads = 2)] -async fn test_f64_scalar_fun() { - 
test(Scenario::Float64, "abs(f-1) <= 0.000001", vec![2]).await; -} - -#[tokio::test(flavor = "multi_thread", worker_threads = 2)] -async fn test_f64_complex_expr() { - test(Scenario::Float64, "f+1 > 1.1", vec![2, 3]).await; -} - -#[tokio::test(flavor = "multi_thread", worker_threads = 2)] -async fn test_f64_complex_expr_subtract() { - test(Scenario::Float64, "1-f > 1", vec![0, 1]).await; -} - -#[tokio::test(flavor = "multi_thread", worker_threads = 2)] -async fn test_int32_eq_in_list() { - test(Scenario::Int32, "i in (1)", vec![2]).await; -} - -#[tokio::test(flavor = "multi_thread", worker_threads = 2)] -async fn test_int32_eq_in_list_2() { - test(Scenario::Int32, "i in (1000)", vec![]).await; -} - -#[tokio::test(flavor = "multi_thread", worker_threads = 2)] -async fn test_int32_eq_in_list_negated() { - test(Scenario::Int32, "i not in (1)", vec![0, 1, 2, 3]).await; -} - -#[tokio::test(flavor = "multi_thread", worker_threads = 2)] -async fn test_decimal_lt() { - // The data type of decimal_col is decimal(9,2) - // There are three row groups: - // [1.00, 6.00], [-5.00,6.00], [20.00,60.00] - test(Scenario::Decimal, "decimal_col < 4", vec![0, 1]).await; - // compare with the casted decimal value - test( - Scenario::Decimal, - "decimal_col < cast(4.55 as decimal(20,2))", - vec![0, 1], - ) - .await; - - // The data type of decimal_col is decimal(38,2) - test(Scenario::DecimalLargePrecision, "decimal_col < 4", vec![ - 0, 1, - ]) - .await; - // compare with the casted decimal value - test( - Scenario::DecimalLargePrecision, - "decimal_col < cast(4.55 as decimal(20,2))", - vec![0, 1], - ) - .await; -} - -#[tokio::test(flavor = "multi_thread", worker_threads = 2)] -async fn test_decimal_eq() { - // The data type of decimal_col is decimal(9,2) - // There are three row groups: - // [1.00, 6.00], [-5.00,6.00], [20.00,60.00] - test(Scenario::Decimal, "decimal_col = 4", vec![0, 1]).await; - test(Scenario::Decimal, "decimal_col = 4.00", vec![0, 1]).await; - - // The data type of decimal_col is decimal(38,2) - test(Scenario::DecimalLargePrecision, "decimal_col = 4", vec![ - 0, 1, - ]) - .await; - test(Scenario::DecimalLargePrecision, "decimal_col = 4.00", vec![ - 0, 1, - ]) - .await; -} - -#[tokio::test(flavor = "multi_thread", worker_threads = 2)] -async fn test_decimal_in_list() { - // The data type of decimal_col is decimal(9,2) - // There are three row groups: - // [1.00, 6.00], [-5.00,6.00], [20.00,60.00] - test( - Scenario::Decimal, - "decimal_col in (4,3,123456789123)", - vec![0, 1], - ) - .await; - test( - Scenario::Decimal, - "decimal_col in (4.00,3.00,11.2345)", - vec![0, 1], - ) - .await; - - // The data type of decimal_col is decimal(38,2) - test( - Scenario::DecimalLargePrecision, - "decimal_col in (4,3,123456789123)", - vec![0, 1], - ) - .await; - test( - Scenario::DecimalLargePrecision, - "decimal_col in (4.00,3.00,11.2345)", - vec![0, 1], - ) - .await; -} - -#[tokio::test(flavor = "multi_thread", worker_threads = 2)] -async fn test_periods_in_column_names() { - // There are three row groups for "service.name", each with 5 rows = 15 rows total - // name = "HTTP GET / DISPATCH", service.name = ['frontend', 'frontend'], - // name = "HTTP PUT / DISPATCH", service.name = ['backend', 'frontend'], - // name = "HTTP GET / DISPATCH", service.name = ['backend', 'backend' ], - test( - Scenario::PeriodsInColumnNames, + )]) + .await; +} + +#[tokio::test(flavor = "multi_thread", worker_threads = 1)] +async fn test_various_rg_scenarios() { + let test_cases = vec![ + ( + Scenario::Timestamp, + "micros < 
to_timestamp('2020-01-02 01:01:11Z')", + vec![0, 1, 2], + ), + // Date scenario + (Scenario::Date, "date32 < to_date('2020-01-02')", vec![0]), + // Int32 scenarios + (Scenario::Int32, "i < 1", vec![0, 1, 2]), + // result of sql "SELECT * FROM t where i < 1" is same as + // "SELECT * FROM t where -i > -1" + (Scenario::Int32, " -i > -1", vec![0, 1, 2]), + (Scenario::Int32, "i = 1", vec![2]), + (Scenario::Int32, "abs(i) = 1 and i = 1", vec![2]), + (Scenario::Int32, "abs(i) = 1", vec![0, 1, 2]), + (Scenario::Int32, "i+1 = 1", vec![1, 2]), + (Scenario::Int32, "1-i > 1", vec![0, 1]), + (Scenario::Int32, "i in (1)", vec![2]), + (Scenario::Int32, "i in (1000)", vec![]), + (Scenario::Int32, "i not in (1)", vec![0, 1, 2, 3]), + // Float64 scenarios + (Scenario::Float64, "f < 1", vec![0, 1, 2]), + (Scenario::Float64, "-f > -1", vec![0, 1, 2]), + ( + Scenario::Float64, + "abs(f - 1) <= 0.000001 and f >= 0.1", + vec![2], + ), + (Scenario::Float64, "abs(f-1) <= 0.000001", vec![2]), + (Scenario::Float64, "f+1 > 1.1", vec![2, 3]), + (Scenario::Float64, "1-f > 1", vec![0, 1]), + // Decimal scenarios + // The data type of decimal_col is decimal(9,2) + // There are three row groups: + // [1.00, 6.00], [-5.00,6.00], [20.00,60.00] + (Scenario::Decimal, "decimal_col < 4", vec![0, 1]), + // compare with the casted decimal value + ( + Scenario::Decimal, + "decimal_col < cast(4.55 as decimal(20,2))", + vec![0, 1], + ), + (Scenario::Decimal, "decimal_col = 4", vec![0, 1]), + (Scenario::Decimal, "decimal_col = 4.00", vec![0, 1]), + ( + Scenario::Decimal, + "decimal_col in (4,3,123456789123)", + vec![0, 1], + ), + ( + Scenario::Decimal, + "decimal_col in (4.00,3.00,11.2345)", + vec![0, 1], + ), + // DecimalLargePrecision scenarios + // The data type of decimal_col is decimal(38,2) + (Scenario::DecimalLargePrecision, "decimal_col < 4", vec![ + 0, 1, + ]), + ( + Scenario::DecimalLargePrecision, + "decimal_col < cast(4.55 as decimal(20,2))", + vec![0, 1], + ), + (Scenario::DecimalLargePrecision, "decimal_col = 4", vec![ + 0, 1, + ]), + (Scenario::DecimalLargePrecision, "decimal_col = 4.00", vec![ + 0, 1, + ]), + ( + Scenario::DecimalLargePrecision, + "decimal_col in (4,3,123456789123)", + vec![0, 1], + ), + ( + Scenario::DecimalLargePrecision, + "decimal_col in (4.00,3.00,11.2345)", + vec![0, 1], + ), + // PeriodsInColumnNames scenarios + // There are three row groups for "service.name", each with 5 rows = 15 rows total + // name = "HTTP GET / DISPATCH", service.name = ['frontend', 'frontend'], + // name = "HTTP PUT / DISPATCH", service.name = ['backend', 'frontend'], + // name = "HTTP GET / DISPATCH", service.name = ['backend', 'backend' ], // use double quotes to use column named "service.name" - "\"service.name\" = 'frontend'", - vec![0, 1], - ) - .await; - test( - Scenario::PeriodsInColumnNames, - "name <> 'HTTP GET / DISPATCH'", - vec![1], - ) - .await; - test( - Scenario::PeriodsInColumnNames, - "\"service.name\" = 'frontend' AND name = 'HTTP GET / DISPATCH'", - vec![0], - ) - .await; + ( + Scenario::PeriodsInColumnNames, + "\"service.name\" = 'frontend'", + vec![0, 1], + ), + ( + Scenario::PeriodsInColumnNames, + "name <> 'HTTP GET / DISPATCH'", + vec![1], + ), + ( + Scenario::PeriodsInColumnNames, + "\"service.name\" = 'frontend' AND name = 'HTTP GET / DISPATCH'", + vec![0], + ), + ]; + + test_batch(&test_cases).await; } diff --git a/src/query/service/tests/it/storages/testdata/caches_table.txt b/src/query/service/tests/it/storages/testdata/caches_table.txt index 373568235de0..11ca7da3b0c9 100644 --- 
a/src/query/service/tests/it/storages/testdata/caches_table.txt +++ b/src/query/service/tests/it/storages/testdata/caches_table.txt @@ -9,7 +9,7 @@ DB.Table: 'system'.'caches', Table: caches-table_id:1, ver:0, Engine: SystemCach | 'test-node' | 'memory_cache_compact_segment_info' | 0 | 0 | 1073741824 | 'bytes' | 0 | 0 | 0 | | 'test-node' | 'memory_cache_inverted_index_file' | 0 | 0 | 2147483648 | 'bytes' | 0 | 0 | 0 | | 'test-node' | 'memory_cache_inverted_index_file_meta_data' | 0 | 0 | 3000 | 'count' | 0 | 0 | 0 | -| 'test-node' | 'memory_cache_parquet_file_meta' | 0 | 0 | 3000 | 'count' | 0 | 0 | 0 | +| 'test-node' | 'memory_cache_parquet_meta_data' | 0 | 0 | 3000 | 'count' | 0 | 0 | 0 | | 'test-node' | 'memory_cache_prune_partitions' | 0 | 0 | 256 | 'count' | 0 | 0 | 0 | | 'test-node' | 'memory_cache_table_snapshot' | 0 | 0 | 256 | 'count' | 0 | 0 | 0 | | 'test-node' | 'memory_cache_table_statistics' | 0 | 0 | 256 | 'count' | 0 | 0 | 0 | diff --git a/src/query/storages/common/cache/Cargo.toml b/src/query/storages/common/cache/Cargo.toml index 7cb83f2577ee..773e6eb71585 100644 --- a/src/query/storages/common/cache/Cargo.toml +++ b/src/query/storages/common/cache/Cargo.toml @@ -30,6 +30,7 @@ crossbeam-channel = "0.5.6" hex = "0.4.3" log = { workspace = true } parking_lot = { workspace = true } +parquet = { workspace = true } rayon = "1.9.0" rustix = "0.38.37" siphasher = "0.3.10" diff --git a/src/query/storages/common/cache/src/caches.rs b/src/query/storages/common/cache/src/caches.rs index 8d36618e42d2..250574e5fc7d 100644 --- a/src/query/storages/common/cache/src/caches.rs +++ b/src/query/storages/common/cache/src/caches.rs @@ -14,7 +14,6 @@ use std::sync::Arc; -use databend_common_arrow::parquet::metadata::FileMetaData; use databend_common_cache::MemSized; use databend_common_catalog::plan::PartStatistics; use databend_common_catalog::plan::Partitions; @@ -27,6 +26,7 @@ use databend_storages_common_table_meta::meta::CompactSegmentInfo; use databend_storages_common_table_meta::meta::SegmentInfo; use databend_storages_common_table_meta::meta::TableSnapshot; use databend_storages_common_table_meta::meta::TableSnapshotStatistics; +use parquet::file::metadata::ParquetMetaData; use crate::manager::CacheManager; use crate::CacheAccessor; @@ -50,8 +50,8 @@ pub type BloomIndexMetaCache = InMemoryLruCache; pub type InvertedIndexMetaCache = InMemoryLruCache; pub type InvertedIndexFileCache = InMemoryLruCache; -/// In memory object cache of parquet FileMetaData of external parquet files -pub type FileMetaDataCache = InMemoryLruCache; +/// In memory object cache of parquet FileMetaData of external parquet rs files +pub type ParquetMetaDataCache = InMemoryLruCache; pub type PrunePartitionsCache = InMemoryLruCache<(PartStatistics, Partitions)>; @@ -122,10 +122,10 @@ impl CachedObject for Xor8Filter { } } -impl CachedObject for FileMetaData { - type Cache = FileMetaDataCache; +impl CachedObject for ParquetMetaData { + type Cache = ParquetMetaDataCache; fn cache() -> Option { - CacheManager::instance().get_file_meta_data_cache() + CacheManager::instance().get_parquet_meta_data_cache() } } @@ -234,8 +234,8 @@ impl From for CacheValue { } } -impl From for CacheValue { - fn from(value: FileMetaData) -> Self { +impl From for CacheValue { + fn from(value: ParquetMetaData) -> Self { CacheValue { inner: Arc::new(value), mem_bytes: 0, diff --git a/src/query/storages/common/cache/src/manager.rs b/src/query/storages/common/cache/src/manager.rs index 6553f64f30fd..3bf65686d19d 100644 --- 
a/src/query/storages/common/cache/src/manager.rs +++ b/src/query/storages/common/cache/src/manager.rs @@ -28,9 +28,9 @@ use crate::caches::BloomIndexMetaCache; use crate::caches::CacheValue; use crate::caches::ColumnArrayCache; use crate::caches::CompactSegmentInfoCache; -use crate::caches::FileMetaDataCache; use crate::caches::InvertedIndexFileCache; use crate::caches::InvertedIndexMetaCache; +use crate::caches::ParquetMetaDataCache; use crate::caches::PrunePartitionsCache; use crate::caches::TableSnapshotCache; use crate::caches::TableSnapshotStatisticCache; @@ -38,7 +38,7 @@ use crate::InMemoryLruCache; use crate::TableDataCache; use crate::TableDataCacheBuilder; -static DEFAULT_FILE_META_DATA_CACHE_ITEMS: usize = 3000; +static DEFAULT_PARQUET_META_DATA_CACHE_ITEMS: usize = 3000; /// Where all the caches reside pub struct CacheManager { @@ -50,7 +50,7 @@ pub struct CacheManager { inverted_index_meta_cache: Option, inverted_index_file_cache: Option, prune_partitions_cache: Option, - parquet_file_meta_data_cache: Option, + parquet_meta_data_cache: Option, table_data_cache: Option, in_memory_table_data_cache: Option, block_meta_cache: Option, @@ -122,7 +122,7 @@ impl CacheManager { inverted_index_meta_cache: None, inverted_index_file_cache: None, prune_partitions_cache: None, - parquet_file_meta_data_cache: None, + parquet_meta_data_cache: None, table_statistic_cache: None, table_data_cache, in_memory_table_data_cache, @@ -171,9 +171,9 @@ impl CacheManager { MEMORY_CACHE_PRUNE_PARTITIONS, ); - let parquet_file_meta_data_cache = Self::new_named_items_cache( - DEFAULT_FILE_META_DATA_CACHE_ITEMS, - MEMORY_CACHE_PARQUET_FILE_META, + let parquet_meta_data_cache = Self::new_named_items_cache( + DEFAULT_PARQUET_META_DATA_CACHE_ITEMS, + MEMORY_CACHE_PARQUET_META_DATA, ); let block_meta_cache = Self::new_named_items_cache( @@ -189,11 +189,11 @@ impl CacheManager { inverted_index_meta_cache, inverted_index_file_cache, prune_partitions_cache, - parquet_file_meta_data_cache, table_statistic_cache, table_data_cache, in_memory_table_data_cache, block_meta_cache, + parquet_meta_data_cache, })); } @@ -240,8 +240,8 @@ impl CacheManager { self.prune_partitions_cache.clone() } - pub fn get_file_meta_data_cache(&self) -> Option { - self.parquet_file_meta_data_cache.clone() + pub fn get_parquet_meta_data_cache(&self) -> Option { + self.parquet_meta_data_cache.clone() } pub fn get_table_data_cache(&self) -> Option { @@ -298,7 +298,7 @@ impl CacheManager { } const MEMORY_CACHE_TABLE_DATA: &str = "memory_cache_table_data"; -const MEMORY_CACHE_PARQUET_FILE_META: &str = "memory_cache_parquet_file_meta"; +const MEMORY_CACHE_PARQUET_META_DATA: &str = "memory_cache_parquet_meta_data"; const MEMORY_CACHE_PRUNE_PARTITIONS: &str = "memory_cache_prune_partitions"; const MEMORY_CACHE_INVERTED_INDEX_FILE: &str = "memory_cache_inverted_index_file"; const MEMORY_CACHE_INVERTED_INDEX_FILE_META_DATA: &str = diff --git a/src/query/storages/common/pruner/src/lib.rs b/src/query/storages/common/pruner/src/lib.rs index 04bc0341fc43..0e0f938ce3ec 100644 --- a/src/query/storages/common/pruner/src/lib.rs +++ b/src/query/storages/common/pruner/src/lib.rs @@ -18,6 +18,7 @@ mod block_meta; mod internal_column_pruner; mod limiter_pruner; mod page_pruner; +pub mod partition_prunner; mod range_pruner; mod topn_pruner; diff --git a/src/query/storages/common/pruner/src/partition_prunner.rs b/src/query/storages/common/pruner/src/partition_prunner.rs new file mode 100644 index 000000000000..6211e77d9dc0 --- /dev/null +++ 
b/src/query/storages/common/pruner/src/partition_prunner.rs
@@ -0,0 +1,91 @@
+// Copyright 2021 Datafuse Labs
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+use std::collections::HashMap;
+use std::sync::Arc;
+
+use databend_common_exception::Result;
+use databend_common_expression::Expr;
+use databend_common_expression::FunctionContext;
+use databend_common_expression::Scalar;
+use databend_common_expression::TableField;
+use databend_common_expression::TableSchema;
+use databend_storages_common_index::RangeIndex;
+use databend_storages_common_table_meta::meta::ColumnStatistics;
+use databend_storages_common_table_meta::meta::StatisticsOfColumns;
+
+pub struct PartitionPruner {
+    pub filter: Expr<String>,
+    pub partition_schema: Arc<TableSchema>,
+
+    leaf_fields: Vec<TableField>,
+
+    pub range_filter: RangeIndex,
+}
+
+pub trait FetchPartitionScalars<T> {
+    fn eval(_item: &T, _partition_fields: &[TableField]) -> Result<Vec<Scalar>>;
+}
+
+impl PartitionPruner {
+    pub fn try_create(
+        ctx: FunctionContext,
+        filter: Expr<String>,
+        partition_schema: Arc<TableSchema>,
+        full_schema: Arc<TableSchema>,
+    ) -> Result<Self> {
+        let range_filter = RangeIndex::try_create(
+            ctx,
+            &filter,
+            full_schema.clone(),
+            StatisticsOfColumns::default(),
+        )?;
+        Ok(PartitionPruner {
+            filter,
+            partition_schema,
+            leaf_fields: full_schema.leaf_fields(),
+            range_filter,
+        })
+    }
+
+    pub fn prune<T, F>(&self, partitions: Vec<T>) -> Result<Vec<T>>
+    where F: FetchPartitionScalars<T> {
+        let filtered_partitions = partitions
+            .into_iter()
+            .filter(|p| self.should_keep::<T, F>(p).unwrap_or(true))
+            .collect();
+        Ok(filtered_partitions)
+    }
+
+    pub fn should_keep<T, F>(&self, partition: &T) -> Result<bool>
+    where F: FetchPartitionScalars<T> {
+        let scalars = F::eval(partition, &self.partition_schema.fields)?;
+        let mut stats = HashMap::new();
+
+        for (index, scalar) in scalars.into_iter().enumerate() {
+            let null_count = u64::from(scalar.is_null());
+            let column_stats = ColumnStatistics::new(scalar.clone(), scalar, null_count, 0, None);
+
+            let mut f = self
+                .leaf_fields
+                .iter()
+                .filter(|f| f.name() == &self.partition_schema.field(index).name);
+
+            if let Some(f) = f.next() {
+                stats.insert(f.column_id(), column_stats);
+            }
+        }
+        self.range_filter.apply(&stats, |_| false)
+    }
+}
diff --git a/src/query/storages/delta/Cargo.toml b/src/query/storages/delta/Cargo.toml
index 428816eac2af..bd98a9a61050 100644
--- a/src/query/storages/delta/Cargo.toml
+++ b/src/query/storages/delta/Cargo.toml
@@ -14,10 +14,12 @@ databend-common-base = { workspace = true }
 databend-common-catalog = { workspace = true }
 databend-common-exception = { workspace = true }
 databend-common-expression = { workspace = true }
+databend-common-functions = { workspace = true }
 databend-common-meta-app = { workspace = true }
 databend-common-pipeline-core = { workspace = true }
 databend-common-storage = { workspace = true }
 databend-common-storages-parquet = { workspace = true }
+databend-storages-common-pruner = { workspace = true }
 databend-storages-common-table-meta = { workspace = true }
 deltalake = { workspace = true }
 fastrace = { workspace = true }
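// A minimal sketch, not part of this patch, of how a table engine can plug its
// own partition representation into the generic PartitionPruner introduced
// above. `DirPart`, `DirPartToScalar` and `prune_dirs` are hypothetical names,
// and the generic parameters on `prune::<T, F>` are an assumption about the
// signature; `str_to_scalar` is the databend_common_catalog::partition_columns
// helper that the Delta table below uses in the same way.

use databend_common_catalog::partition_columns::str_to_scalar;
use databend_common_exception::Result;
use databend_common_expression::Scalar;
use databend_common_expression::TableField;
use databend_storages_common_pruner::partition_prunner::FetchPartitionScalars;
use databend_storages_common_pruner::partition_prunner::PartitionPruner;

/// Hypothetical partition type: one raw string value per partition column.
pub struct DirPart {
    pub values: Vec<String>,
}

pub struct DirPartToScalar;

impl FetchPartitionScalars<DirPart> for DirPartToScalar {
    fn eval(part: &DirPart, partition_fields: &[TableField]) -> Result<Vec<Scalar>> {
        // Decode each raw value into a Scalar of the matching field type so the
        // pruner can build single-value ColumnStatistics and evaluate the filter.
        part.values
            .iter()
            .zip(partition_fields.iter())
            .map(|(v, f)| str_to_scalar(v, &f.data_type().into()))
            .collect()
    }
}

// Partitions whose column statistics cannot satisfy the pushed-down filter are
// dropped; if `should_keep` errors for a partition, that partition is kept.
pub fn prune_dirs(pruner: &PartitionPruner, parts: Vec<DirPart>) -> Result<Vec<DirPart>> {
    pruner.prune::<DirPart, DirPartToScalar>(parts)
}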
@@ -30,9 +32,6 @@ tokio = { workspace = true } typetag = "0.2" url = "2.4.1" -[dev-dependencies] -maplit = "1.0.2" - [lints] workspace = true diff --git a/src/query/storages/delta/src/lib.rs b/src/query/storages/delta/src/lib.rs index 6df87495e1a1..dffe44433b0e 100644 --- a/src/query/storages/delta/src/lib.rs +++ b/src/query/storages/delta/src/lib.rs @@ -16,7 +16,6 @@ #![allow(clippy::diverging_sub_expression)] mod partition; -mod partition_columns; mod table; mod table_source; diff --git a/src/query/storages/delta/src/table.rs b/src/query/storages/delta/src/table.rs index 0361a8f72568..2bff4c9b76d2 100644 --- a/src/query/storages/delta/src/table.rs +++ b/src/query/storages/delta/src/table.rs @@ -18,9 +18,10 @@ use std::sync::Arc; use arrow_schema::Schema as ArrowSchema; use async_trait::async_trait; use databend_common_catalog::catalog::StorageDescription; +use databend_common_catalog::partition_columns::get_pushdown_without_partition_columns; +use databend_common_catalog::partition_columns::str_to_scalar; use databend_common_catalog::plan::DataSourcePlan; use databend_common_catalog::plan::ParquetReadOptions; -use databend_common_catalog::plan::PartInfo; use databend_common_catalog::plan::PartStatistics; use databend_common_catalog::plan::Partitions; use databend_common_catalog::plan::PartitionsShuffleKind; @@ -32,8 +33,10 @@ use databend_common_exception::ErrorCode; use databend_common_exception::Result; use databend_common_expression::DataSchema; use databend_common_expression::FieldIndex; +use databend_common_expression::Scalar; use databend_common_expression::TableField; use databend_common_expression::TableSchema; +use databend_common_functions::BUILTIN_FUNCTIONS; use databend_common_meta_app::schema::TableInfo; use databend_common_meta_app::storage::StorageParams; use databend_common_pipeline_core::Pipeline; @@ -42,6 +45,8 @@ use databend_common_storages_parquet::ParquetFilesPart; use databend_common_storages_parquet::ParquetPart; use databend_common_storages_parquet::ParquetRSPruner; use databend_common_storages_parquet::ParquetRSReaderBuilder; +use databend_storages_common_pruner::partition_prunner::FetchPartitionScalars; +use databend_storages_common_pruner::partition_prunner::PartitionPruner; use databend_storages_common_table_meta::table::OPT_KEY_ENGINE_META; use deltalake::kernel::Add; use deltalake::DeltaTableBuilder; @@ -52,8 +57,6 @@ use tokio::sync::OnceCell; use url::Url; use crate::partition::DeltaPartInfo; -use crate::partition_columns::get_partition_values; -use crate::partition_columns::get_pushdown_without_partition_columns; use crate::table_source::DeltaTableSource; pub const DELTA_ENGINE: &str = "DELTA"; @@ -120,12 +123,11 @@ impl DeltaTable { }) } - #[allow(dead_code)] - fn get_partition_fields(&self) -> Result> { + fn get_partition_fields(&self) -> Result> { self.meta .partition_columns .iter() - .map(|name| self.info.meta.schema.field_with_name(name)) + .map(|name| self.info.meta.schema.field_with_name(name).cloned()) .collect() } @@ -261,7 +263,7 @@ impl DeltaTable { output, output_schema.clone(), parquet_reader.clone(), - self.get_partition_fields()?.into_iter().cloned().collect(), + self.get_partition_fields()?, ) }, max_threads.max(1), @@ -272,8 +274,8 @@ impl DeltaTable { #[async_backtrace::framed] async fn do_read_partitions( &self, - _ctx: Arc, - _push_downs: Option, + ctx: Arc, + push_downs: Option, ) -> Result<(PartStatistics, Partitions)> { let table = self.table().await?; @@ -281,14 +283,34 @@ impl DeltaTable { let mut read_bytes = 0; let 
partition_fields = self.get_partition_fields()?; - let adds = table + let mut adds = table .snapshot() .and_then(|f| f.file_actions()) .map_err(|e| { ErrorCode::ReadTableDataError(format!("Cannot read file_actions: {e:?}")) })?; + + let filter_expression = push_downs.as_ref().and_then(|p| { + p.filters + .as_ref() + .map(|filter| filter.filter.as_expr(&BUILTIN_FUNCTIONS)) + }); + let total_files = adds.len(); + if !partition_fields.is_empty() { + if let Some(expr) = filter_expression { + let partition_pruner = PartitionPruner::try_create( + ctx.get_function_context()?, + expr, + Arc::new(TableSchema::new(partition_fields.clone())), + self.schema(), + )?; + + adds = partition_pruner.prune::(adds)?; + } + } + #[derive(serde::Deserialize)] struct Stats { #[serde(rename = "numRecords")] @@ -311,9 +333,8 @@ impl DeltaTable { ).unwrap_or(1); read_rows += num_records as usize; read_bytes += add.size as usize; - let partition_values = get_partition_values(add, &partition_fields[..])?; - Ok(Arc::new( - Box::new(DeltaPartInfo { + let partition_values = get_partition_values(add, &partition_fields)?; + Ok(Arc::new(Box::new(DeltaPartInfo { partition_values, data: ParquetPart::ParquetFiles( ParquetFilesPart { @@ -321,8 +342,7 @@ impl DeltaTable { estimated_uncompressed_size: add.size as u64, // This field is not used here. }, ), - }) as Box - )) + }) as _)) }) .collect::>>()?; @@ -333,6 +353,14 @@ impl DeltaTable { } } +pub struct DeltaToScalar; + +impl FetchPartitionScalars for DeltaToScalar { + fn eval(add: &Add, partition_fields: &[TableField]) -> Result> { + get_partition_values(add, partition_fields) + } +} + #[async_trait] impl Table for DeltaTable { fn as_any(&self) -> &dyn Any { @@ -384,3 +412,20 @@ impl Table for DeltaTable { true } } + +pub fn get_partition_values(add: &Add, fields: &[TableField]) -> Result> { + let mut values = Vec::with_capacity(fields.len()); + for f in fields { + match add.partition_values.get(&f.name) { + Some(Some(v)) => values.push(str_to_scalar(v, &f.data_type().into())?), + Some(None) => values.push(Scalar::Null), + None => { + return Err(ErrorCode::BadArguments(format!( + "partition value for column {} not found", + &f.name + ))); + } + } + } + Ok(values) +} diff --git a/src/query/storages/hive/hive/Cargo.toml b/src/query/storages/hive/hive/Cargo.toml index b71e36560838..90ccd787fb23 100644 --- a/src/query/storages/hive/hive/Cargo.toml +++ b/src/query/storages/hive/hive/Cargo.toml @@ -15,7 +15,6 @@ async-backtrace = { workspace = true } async-recursion = "1.1.1" async-trait = { workspace = true } chrono = { workspace = true } -databend-common-arrow = { workspace = true } databend-common-base = { workspace = true } databend-common-catalog = { workspace = true } databend-common-config = { workspace = true } @@ -29,8 +28,8 @@ databend-common-pipeline-core = { workspace = true } databend-common-pipeline-sources = { workspace = true } databend-common-sql = { workspace = true } databend-common-storage = { workspace = true } -databend-storages-common-cache = { workspace = true } -databend-storages-common-index = { workspace = true } +databend-common-storages-parquet = { workspace = true } +databend-storages-common-pruner = { workspace = true } databend-storages-common-table-meta = { workspace = true } fastrace = { workspace = true } faststr = "0.2" @@ -38,6 +37,7 @@ futures = { workspace = true } hive_metastore = "0.1.0" log = { workspace = true } opendal = { workspace = true } +parquet = { workspace = true } recursive = "0.1.1" serde = { workspace = true } typetag = { 
workspace = true } diff --git a/src/query/storages/hive/hive/src/hive_block_filter.rs b/src/query/storages/hive/hive/src/hive_block_filter.rs deleted file mode 100644 index 9414bf48b695..000000000000 --- a/src/query/storages/hive/hive/src/hive_block_filter.rs +++ /dev/null @@ -1,298 +0,0 @@ -// Copyright 2021 Datafuse Labs -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -use std::collections::HashMap; -use std::sync::Arc; - -use databend_common_arrow::parquet::metadata::RowGroupMetaData; -use databend_common_arrow::parquet::statistics::BinaryStatistics; -use databend_common_arrow::parquet::statistics::BooleanStatistics; -use databend_common_arrow::parquet::statistics::PrimitiveStatistics; -use databend_common_arrow::parquet::statistics::Statistics; -use databend_common_expression::types::number::F32; -use databend_common_expression::types::number::F64; -use databend_common_expression::types::BooleanType; -use databend_common_expression::types::NumberDataType; -use databend_common_expression::types::NumberType; -use databend_common_expression::types::StringType; -use databend_common_expression::types::ValueType; -use databend_common_expression::Scalar; -use databend_common_expression::TableDataType; -use databend_common_expression::TableField; -use databend_common_expression::TableSchema; -use databend_storages_common_index::RangeIndex; -use databend_storages_common_table_meta::meta::ColumnStatistics; -use databend_storages_common_table_meta::meta::StatisticsOfColumns; - -use crate::hive_parquet_block_reader::HiveBlockReader; -use crate::hive_table::HIVE_DEFAULT_PARTITION; - -#[derive(Clone)] -pub struct HiveBlockFilter { - range_filter: Option, - projections: Vec, - data_schema: Arc, -} - -impl HiveBlockFilter { - pub fn create( - range_filter: Option, - projections: Vec, - data_schema: Arc, - ) -> Self { - Self { - range_filter, - projections, - data_schema, - } - } - - // true: rowgroup if filtered by predict - pub fn filter( - &self, - row_group: &RowGroupMetaData, - part_columns: HashMap, - ) -> bool { - if let Some(filter) = &self.range_filter { - let mut statistics = StatisticsOfColumns::new(); - for col in self.projections.iter() { - let column_meta = - HiveBlockReader::get_parquet_column_metadata(row_group, col.name()); - if let Ok(meta) = column_meta { - let in_memory_size = meta.uncompressed_size(); - if let Ok(stats) = meta.statistics().transpose() { - // if stats is none, we couldn't make a decision whether the block should be filtered - let stats = match stats { - None => return false, - Some(stats) => stats, - }; - if let Some((max, min, null_count)) = - Self::get_max_min_stats(col.data_type(), &*stats) - { - let col_stats = ColumnStatistics::new( - min, - max, - null_count as u64, - in_memory_size as u64, - None, - ); - if let Some((index, _)) = self.data_schema.column_with_name(col.name()) - { - statistics.insert(index as u32, col_stats); - } - } - } - } - } - - for (p_key, p_value) in part_columns { - if let Some((idx, _)) = 
self.data_schema.column_with_name(&p_key) { - let mut null_count = 0; - let v = if p_value == HIVE_DEFAULT_PARTITION { - null_count = row_group.num_rows(); - Scalar::Null - } else { - Scalar::String(p_value) - }; - - let col_stats = ColumnStatistics::new(v.clone(), v, null_count as u64, 0, None); - statistics.insert(idx as u32, col_stats); - } - } - - if let Ok(ret) = filter.apply(&statistics, |_| false) { - if !ret { - return true; - } - } - } - false - } - - fn get_max_min_stats( - column_type: &TableDataType, - stats: &dyn Statistics, - ) -> Option<(Scalar, Scalar, i64)> { - match column_type { - TableDataType::Number(NumberDataType::UInt8) => { - let s = stats - .as_any() - .downcast_ref::>() - .unwrap(); - if s.null_count.is_none() || s.max_value.is_none() || s.min_value.is_none() { - None - } else { - let null_count = s.null_count.unwrap(); - let max = NumberType::::upcast_scalar(s.max_value.unwrap() as u8); - let min = NumberType::::upcast_scalar(s.min_value.unwrap() as u8); - Some((max, min, null_count)) - } - } - TableDataType::Number(NumberDataType::UInt16) => { - let s = stats - .as_any() - .downcast_ref::>() - .unwrap(); - if s.null_count.is_none() || s.max_value.is_none() || s.min_value.is_none() { - None - } else { - let null_count = s.null_count.unwrap(); - let max = NumberType::::upcast_scalar(s.max_value.unwrap() as u16); - let min = NumberType::::upcast_scalar(s.min_value.unwrap() as u16); - Some((max, min, null_count)) - } - } - TableDataType::Number(NumberDataType::UInt32) => { - let s = stats - .as_any() - .downcast_ref::>() - .unwrap(); - if s.null_count.is_none() || s.max_value.is_none() || s.min_value.is_none() { - None - } else { - let null_count = s.null_count.unwrap(); - let max = NumberType::::upcast_scalar(s.max_value.unwrap() as u32); - let min = NumberType::::upcast_scalar(s.min_value.unwrap() as u32); - Some((max, min, null_count)) - } - } - TableDataType::Number(NumberDataType::UInt64) => { - let s = stats - .as_any() - .downcast_ref::>() - .unwrap(); - if s.null_count.is_none() || s.max_value.is_none() || s.min_value.is_none() { - None - } else { - let null_count = s.null_count.unwrap(); - let max = NumberType::::upcast_scalar(s.max_value.unwrap() as u64); - let min = NumberType::::upcast_scalar(s.min_value.unwrap() as u64); - Some((max, min, null_count)) - } - } - TableDataType::Number(NumberDataType::Int8) => { - let s = stats - .as_any() - .downcast_ref::>() - .unwrap(); - if s.null_count.is_none() || s.max_value.is_none() || s.min_value.is_none() { - None - } else { - let null_count = s.null_count.unwrap(); - let max = NumberType::::upcast_scalar(s.max_value.unwrap() as i8); - let min = NumberType::::upcast_scalar(s.min_value.unwrap() as i8); - Some((max, min, null_count)) - } - } - TableDataType::Number(NumberDataType::Int16) => { - let s = stats - .as_any() - .downcast_ref::>() - .unwrap(); - if s.null_count.is_none() || s.max_value.is_none() || s.min_value.is_none() { - None - } else { - let null_count = s.null_count.unwrap(); - let max = NumberType::::upcast_scalar(s.max_value.unwrap() as i16); - let min = NumberType::::upcast_scalar(s.min_value.unwrap() as i16); - Some((max, min, null_count)) - } - } - TableDataType::Number(NumberDataType::Int32) => { - let s = stats - .as_any() - .downcast_ref::>() - .unwrap(); - if s.null_count.is_none() || s.max_value.is_none() || s.min_value.is_none() { - None - } else { - let null_count = s.null_count.unwrap(); - let max = NumberType::::upcast_scalar(s.max_value.unwrap()); - let min = 
NumberType::::upcast_scalar(s.min_value.unwrap()); - Some((max, min, null_count)) - } - } - TableDataType::Number(NumberDataType::Int64) => { - let s = stats - .as_any() - .downcast_ref::>() - .unwrap(); - if s.null_count.is_none() || s.max_value.is_none() || s.min_value.is_none() { - None - } else { - let null_count = s.null_count.unwrap(); - let max = NumberType::::upcast_scalar(s.max_value.unwrap()); - let min = NumberType::::upcast_scalar(s.min_value.unwrap()); - Some((max, min, null_count)) - } - } - TableDataType::Number(NumberDataType::Float32) => { - let s = stats - .as_any() - .downcast_ref::>() - .unwrap(); - if s.null_count.is_none() || s.max_value.is_none() || s.min_value.is_none() { - None - } else { - let null_count = s.null_count.unwrap(); - let max = NumberType::::upcast_scalar(s.max_value.unwrap().into()); - let min = NumberType::::upcast_scalar(s.min_value.unwrap().into()); - Some((max, min, null_count)) - } - } - TableDataType::Number(NumberDataType::Float64) => { - let s = stats - .as_any() - .downcast_ref::>() - .unwrap(); - if s.null_count.is_none() || s.max_value.is_none() || s.min_value.is_none() { - None - } else { - let null_count = s.null_count.unwrap(); - let max = NumberType::::upcast_scalar(s.max_value.unwrap().into()); - let min = NumberType::::upcast_scalar(s.min_value.unwrap().into()); - Some((max, min, null_count)) - } - } - TableDataType::Boolean => { - let s = stats.as_any().downcast_ref::().unwrap(); - if s.null_count.is_none() || s.max_value.is_none() || s.min_value.is_none() { - None - } else { - let null_count = s.null_count.unwrap(); - let max = BooleanType::upcast_scalar(s.max_value.unwrap()); - let min = BooleanType::upcast_scalar(s.min_value.unwrap()); - Some((max, min, null_count)) - } - } - TableDataType::String => { - let s = stats.as_any().downcast_ref::().unwrap(); - if s.null_count.is_none() || s.max_value.is_none() || s.min_value.is_none() { - None - } else { - let null_count = s.null_count.unwrap(); - let max = StringType::upcast_scalar( - String::from_utf8(s.max_value.clone().unwrap()).ok()?, - ); - let min = StringType::upcast_scalar( - String::from_utf8(s.min_value.clone().unwrap()).ok()?, - ); - Some((max, min, null_count)) - } - } - TableDataType::Nullable(inner_ty) => Self::get_max_min_stats(inner_ty.as_ref(), stats), - _ => None, - } - } -} diff --git a/src/query/storages/hive/hive/src/hive_blocks.rs b/src/query/storages/hive/hive/src/hive_blocks.rs deleted file mode 100644 index d21d426bdf86..000000000000 --- a/src/query/storages/hive/hive/src/hive_blocks.rs +++ /dev/null @@ -1,94 +0,0 @@ -// Copyright 2021 Datafuse Labs -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -use std::sync::Arc; - -use databend_common_arrow::parquet::metadata::FileMetaData; -use databend_common_arrow::parquet::metadata::RowGroupMetaData; -use log::debug; - -use crate::HiveBlockFilter; -use crate::HivePartInfo; - -#[derive(Clone)] -pub struct HiveBlocks { - pub file_meta: Arc, - pub part: HivePartInfo, - pub valid_rowgroups: Vec, - pub current_index: usize, - pub hive_block_filter: Arc, -} - -impl HiveBlocks { - pub fn create( - file_meta: Arc, - part: HivePartInfo, - hive_block_filter: Arc, - ) -> Self { - Self { - file_meta, - part, - valid_rowgroups: vec![], - current_index: 0, - hive_block_filter, - } - } - - // there are some conditions to filter invalid row_groups: - // 1. the rowgroup doesn't belong to the partition - // 2. filtered by predict pushdown - pub fn prune(&mut self) -> bool { - let mut pruned_rg_cnt = 0; - for (idx, row_group) in self.file_meta.row_groups.iter().enumerate() { - let start = row_group.columns()[0].byte_range().0; - let mid = start + row_group.compressed_size() as u64 / 2; - if !self.part.range.contains(&mid) { - continue; - } - if self - .hive_block_filter - .filter(row_group, self.part.get_partition_map()) - { - pruned_rg_cnt += 1; - } else { - self.valid_rowgroups.push(idx); - } - } - debug!( - "hive parquet predict pushdown have pruned {} rowgroups", - pruned_rg_cnt - ); - self.has_blocks() - } - - pub fn get_part_info(&self) -> HivePartInfo { - self.part.clone() - } - - pub fn get_current_row_group_meta_data(&self) -> &RowGroupMetaData { - &self.file_meta.row_groups[self.get_current_rowgroup_index()] - } - - pub fn advance(&mut self) { - self.current_index += 1; - } - - pub fn has_blocks(&self) -> bool { - self.current_index < self.valid_rowgroups.len() - } - - fn get_current_rowgroup_index(&self) -> usize { - self.valid_rowgroups[self.current_index] - } -} diff --git a/src/query/storages/hive/hive/src/hive_file_splitter.rs b/src/query/storages/hive/hive/src/hive_file_splitter.rs deleted file mode 100644 index 466d660ec6a8..000000000000 --- a/src/query/storages/hive/hive/src/hive_file_splitter.rs +++ /dev/null @@ -1,73 +0,0 @@ -// Copyright 2021 Datafuse Labs -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -use std::ops::Range; -use std::sync::Arc; - -use databend_common_catalog::plan::PartInfo; - -use crate::HiveFileInfo; -use crate::HivePartInfo; - -#[derive(Clone, Debug)] -pub struct HiveFileSplitter { - min_split_size: u64, -} - -impl HiveFileSplitter { - pub fn create(min_split_size: u64) -> Self { - Self { min_split_size } - } - - pub fn split_length(&self, length: u64) -> Vec> { - let mut num = length / self.min_split_size; - let left = length % self.min_split_size; - if num == 0 || left > self.min_split_size / 3 { - num += 1; - } - - let mut res = vec![]; - for i in 0..num { - let start = i * self.min_split_size; - let end = match i == num - 1 { - true => length + 1, - false => (i + 1) * self.min_split_size, - }; - res.push(start..end); - } - res - } - - fn split_single_file(&self, hive_file_info: HiveFileInfo) -> Vec>> { - let splits = self.split_length(hive_file_info.length); - splits - .into_iter() - .map(|r| { - HivePartInfo::create( - hive_file_info.filename.clone(), - hive_file_info.partition.clone(), - r, - hive_file_info.length, - ) - }) - .collect() - } - - pub fn get_splits(&self, files: Vec) -> Vec>> { - files - .into_iter() - .flat_map(|hive_file| self.split_single_file(hive_file)) - .collect::>>>() - } -} diff --git a/src/query/storages/hive/hive/src/hive_meta_data_reader.rs b/src/query/storages/hive/hive/src/hive_meta_data_reader.rs deleted file mode 100644 index f847e8801378..000000000000 --- a/src/query/storages/hive/hive/src/hive_meta_data_reader.rs +++ /dev/null @@ -1,55 +0,0 @@ -// Copyright 2021 Datafuse Labs -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -use databend_common_arrow::parquet::metadata::FileMetaData; -use databend_common_arrow::parquet::read::read_metadata_async; -use databend_common_exception::ErrorCode; -use databend_common_exception::Result; -use databend_storages_common_cache::CacheManager; -use databend_storages_common_cache::InMemoryItemCacheReader; -use databend_storages_common_cache::LoadParams; -use databend_storages_common_cache::Loader; -use opendal::Operator; - -pub struct LoaderWrapper(T); -pub type FileMetaDataReader = InMemoryItemCacheReader>; -pub struct MetaDataReader; - -impl MetaDataReader { - pub fn meta_data_reader(dal: Operator) -> FileMetaDataReader { - FileMetaDataReader::new( - CacheManager::instance().get_file_meta_data_cache(), - LoaderWrapper(dal), - ) - } -} - -#[async_trait::async_trait] -impl Loader for LoaderWrapper { - #[async_backtrace::framed] - async fn load(&self, params: &LoadParams) -> Result { - let size = match params.len_hint { - Some(v) => v, - None => self.0.stat(¶ms.location).await?.content_length(), - }; - let reader = self.0.reader(¶ms.location).await?; - - read_metadata_async(reader, size).await.map_err(|err| { - ErrorCode::Internal(format!( - "read file meta failed, {}, {:?}", - params.location, err - )) - }) - } -} diff --git a/src/query/storages/hive/hive/src/hive_parquet_block_reader.rs b/src/query/storages/hive/hive/src/hive_parquet_block_reader.rs deleted file mode 100644 index c059f2c6165f..000000000000 --- a/src/query/storages/hive/hive/src/hive_parquet_block_reader.rs +++ /dev/null @@ -1,359 +0,0 @@ -// Copyright 2021 Datafuse Labs -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -use std::sync::Arc; - -use databend_common_arrow::arrow::datatypes::Field; -use databend_common_arrow::arrow::datatypes::Schema; -use databend_common_arrow::arrow::io::parquet::read::column_iter_to_arrays; -use databend_common_arrow::arrow::io::parquet::read::ArrayIter; -use databend_common_arrow::arrow::io::parquet::read::RowGroupDeserializer; -use databend_common_arrow::parquet::metadata::ColumnChunkMetaData; -use databend_common_arrow::parquet::metadata::FileMetaData; -use databend_common_arrow::parquet::metadata::RowGroupMetaData; -use databend_common_arrow::parquet::read::BasicDecompressor; -use databend_common_arrow::parquet::read::PageReader; -use databend_common_base::base::tokio::sync::Semaphore; -use databend_common_catalog::plan::Projection; -use databend_common_exception::ErrorCode; -use databend_common_exception::Result; -use databend_common_expression::DataBlock; -use databend_common_expression::DataSchema; -use databend_common_expression::DataSchemaRef; -use databend_common_expression::TableField; -use databend_common_expression::TableSchemaRef; -use databend_storages_common_cache::LoadParams; -use opendal::Operator; - -use crate::hive_partition::HivePartInfo; -use crate::HivePartitionFiller; -use crate::MetaDataReader; - -#[derive(Clone)] -pub struct HiveBlockReader { - operator: Operator, - projection: Vec, - arrow_schema: Arc, - projected_schema: DataSchemaRef, - // have partition columns - output_schema: DataSchemaRef, - hive_partition_filler: Option, - chunk_size: usize, -} - -pub struct DataBlockDeserializer { - deserializer: RowGroupDeserializer, - drained: bool, -} - -impl DataBlockDeserializer { - fn new(deserializer: RowGroupDeserializer) -> Self { - let num_rows = deserializer.num_rows(); - Self { - deserializer, - drained: num_rows == 0, - } - } - - fn next_block( - &mut self, - schema: &DataSchema, - filler: &Option, - part_info: &HivePartInfo, - ) -> Result> { - if self.drained { - return Ok(None); - }; - - let opt = self.deserializer.next().transpose()?; - if let Some(chunk) = opt { - // If the `Vec>` we have passed into the `RowGroupDeserializer` - // is empty, the deserializer will returns an empty chunk as well(since now rows are consumed). - // In this case, mark self as drained. 
- if chunk.is_empty() { - self.drained = true; - } - - let block: DataBlock = DataBlock::from_arrow_chunk(&chunk, schema)?; - - return if let Some(filler) = filler { - let num_rows = self.deserializer.num_rows(); - let filled = filler.fill_data(block, part_info, num_rows)?; - Ok(Some(filled)) - } else { - Ok(Some(block)) - }; - } - - self.drained = true; - Ok(None) - } -} - -impl HiveBlockReader { - pub fn create( - operator: Operator, - schema: TableSchemaRef, - projection: Projection, - partition_keys: &Option>, - chunk_size: usize, - ) -> Result> { - let original_projection = match projection { - Projection::Columns(projection) => projection, - Projection::InnerColumns(b) => { - return Err(ErrorCode::Unimplemented(format!( - "not support inter columns in hive block reader,{:?}", - b - ))); - } - }; - let output_schema = - DataSchemaRef::new(DataSchema::from(&schema.project(&original_projection))); - - let (projection, partition_fields) = filter_hive_partition_from_partition_keys( - schema.clone(), - original_projection, - partition_keys, - ); - - let hive_partition_filler = if !partition_fields.is_empty() { - Some(HivePartitionFiller::create( - schema.clone(), - partition_fields, - )) - } else { - None - }; - - let projected_schema = DataSchemaRef::new(DataSchema::from(&schema.project(&projection))); - let arrow_schema = schema.as_ref().into(); - Ok(Arc::new(HiveBlockReader { - operator, - projection, - projected_schema, - output_schema, - arrow_schema: Arc::new(arrow_schema), - hive_partition_filler, - chunk_size, - })) - } - - fn to_deserialize( - column_meta: &ColumnChunkMetaData, - chunk: Vec, - rows: usize, - field: Field, - chunk_size: usize, - ) -> Result> { - let primitive_type = column_meta.descriptor().descriptor.primitive_type.clone(); - let pages = PageReader::new( - std::io::Cursor::new(chunk), - column_meta, - Arc::new(|_, _| true), - vec![], - usize::MAX, - ); - - let decompressor = BasicDecompressor::new(pages, vec![]); - Ok(column_iter_to_arrays( - vec![decompressor], - vec![&primitive_type], - field, - Some(chunk_size), - rows, - )?) - } - - pub fn get_parquet_column_metadata<'a>( - row_group: &'a RowGroupMetaData, - field_name: &str, - ) -> Result<&'a ColumnChunkMetaData> { - let column_meta: Vec<&ColumnChunkMetaData> = row_group - .columns() - .iter() - .filter(|x| { - x.descriptor().path_in_schema[0].to_lowercase() == field_name.to_lowercase() - }) - .collect(); - if column_meta.is_empty() { - return Err(ErrorCode::ParquetFileInvalid(format!( - "couldn't find column:{} in parquet file", - field_name - ))); - } else if column_meta.len() > 1 { - return Err(ErrorCode::ParquetFileInvalid(format!( - "find multi column:{} in parquet file", - field_name - ))); - } - Ok(column_meta[0]) - } - - #[async_backtrace::framed] - async fn read_column( - op: Operator, - path: String, - offset: u64, - length: u64, - semaphore: Arc, - ) -> Result> { - let handler = databend_common_base::runtime::spawn(async move { - let chunk = op - .read_with(&path) - .range(offset..offset + length) - .await? 
- .to_vec(); - - let _semaphore_permit = semaphore.acquire().await.unwrap(); - Ok(chunk) - }); - - match handler.await { - Ok(Ok(data)) => Ok(data), - Ok(Err(cause)) => Err(cause), - Err(cause) => Err(ErrorCode::TokioError(format!( - "Cannot join future {:?}", - cause - ))), - } - } - - #[async_backtrace::framed] - pub async fn read_meta_data( - &self, - dal: Operator, - filename: &str, - filesize: u64, - ) -> Result> { - let reader = MetaDataReader::meta_data_reader(dal); - - let load_params = LoadParams { - location: filename.to_owned(), - len_hint: Some(filesize), - ver: 0, - put_cache: true, - }; - - reader.read(&load_params).await - } - - #[async_backtrace::framed] - pub async fn read_columns_data( - &self, - row_group: &RowGroupMetaData, - part: &HivePartInfo, - ) -> Result>> { - let mut join_handlers = Vec::with_capacity(self.projection.len()); - - let semaphore = Arc::new(Semaphore::new(10)); - for index in &self.projection { - let field = &self.arrow_schema.fields[*index]; - let column_meta = Self::get_parquet_column_metadata(row_group, &field.name)?; - let (start, len) = column_meta.byte_range(); - - join_handlers.push(Self::read_column( - self.operator.clone(), - part.filename.to_string(), - start, - len, - semaphore.clone(), - )); - } - - futures::future::try_join_all(join_handlers).await - } - - pub fn create_rowgroup_deserializer( - &self, - chunks: Vec>, - row_group: &RowGroupMetaData, - ) -> Result { - if self.projection.len() != chunks.len() { - return Err(ErrorCode::Internal( - "Columns chunk len must be equals projections len.", - )); - } - - let mut columns_array_iter = Vec::with_capacity(self.projection.len()); - - for (index, column_chunk) in chunks.into_iter().enumerate() { - let idx = self.projection[index]; - let field = self.arrow_schema.fields[idx].clone(); - let column_meta = Self::get_parquet_column_metadata(row_group, &field.name)?; - - columns_array_iter.push(Self::to_deserialize( - column_meta, - column_chunk, - row_group.num_rows(), - field, - self.chunk_size, - )?); - } - - let num_row = row_group.num_rows(); - let deserializer = RowGroupDeserializer::new(columns_array_iter, num_row, None); - Ok(DataBlockDeserializer::new(deserializer)) - } - - pub fn create_data_block( - &self, - row_group_iterator: &mut DataBlockDeserializer, - part: &HivePartInfo, - ) -> Result> { - row_group_iterator - .next_block(&self.projected_schema, &self.hive_partition_filler, part) - .map_err(|e| e.add_message(format!(" filename of hive part {}", part.filename))) - } - - pub fn get_all_datablocks( - &self, - mut rowgroup_deserializer: DataBlockDeserializer, - part: &HivePartInfo, - ) -> Result> { - let mut all_blocks = vec![]; - - while let Some(datablock) = self.create_data_block(&mut rowgroup_deserializer, part)? 
{ - all_blocks.push(datablock); - } - - Ok(all_blocks) - } - - pub fn get_output_schema(&self) -> DataSchemaRef { - self.output_schema.clone() - } -} - -pub fn filter_hive_partition_from_partition_keys( - schema: TableSchemaRef, - projections: Vec, - partition_keys: &Option>, -) -> (Vec, Vec) { - match partition_keys { - Some(partition_keys) => { - let mut not_partitions = vec![]; - let mut partition_fields = vec![]; - for i in projections.into_iter() { - let field = schema.field(i); - if !partition_keys.contains(field.name()) { - not_partitions.push(i); - } else { - partition_fields.push(field.clone()); - } - } - (not_partitions, partition_fields) - } - None => (projections, vec![]), - } -} diff --git a/src/query/storages/hive/hive/src/hive_partition.rs b/src/query/storages/hive/hive/src/hive_partition.rs index e9cd1a597d84..b8a615444ca0 100644 --- a/src/query/storages/hive/hive/src/hive_partition.rs +++ b/src/query/storages/hive/hive/src/hive_partition.rs @@ -17,22 +17,20 @@ use std::collections::hash_map::DefaultHasher; use std::collections::HashMap; use std::hash::Hash; use std::hash::Hasher; -use std::ops::Range; use std::sync::Arc; use databend_common_catalog::plan::PartInfo; use databend_common_catalog::plan::PartInfoPtr; use databend_common_exception::ErrorCode; use databend_common_exception::Result; +use databend_common_expression::Scalar; #[derive(serde::Serialize, serde::Deserialize, PartialEq, Eq, Debug, Clone)] pub struct HivePartInfo { // file location, like /usr/hive/warehouse/ssb.db/customer.table/c_region=ASIA/c_nation=CHINA/f00.parquet pub filename: String, // partition values, like 'c_region=ASIA/c_nation=CHINA' - pub partitions: Option, - // only the data in ranges belong to this partition - pub range: Range, + pub partitions: Vec, // file size pub filesize: u64, } @@ -57,24 +55,16 @@ impl PartInfo for HivePartInfo { } impl HivePartInfo { - pub fn create( - filename: String, - partitions: Option, - range: Range, - filesize: u64, - ) -> Arc> { - Arc::new(Box::new(HivePartInfo { + pub fn create(filename: String, partitions: Vec, filesize: u64) -> Self { + HivePartInfo { filename, partitions, - range, filesize, - })) + } } - pub fn get_partition_map(&self) -> HashMap { - self.partitions - .as_ref() - .map_or_else(HashMap::new, |s| parse_hive_partitions(s)) + pub fn into_part_ptr(self) -> PartInfoPtr { + Arc::new(Box::new(self)) } pub fn from_part(info: &PartInfoPtr) -> Result<&HivePartInfo> { @@ -90,7 +80,9 @@ pub fn parse_hive_partitions(partitions: &str) -> HashMap { let parts = partitions.split('/').collect::>(); for part in parts { let kv = part.split('=').collect::>(); - partition_map.insert(kv[0].to_string(), kv[1].to_string()); + if kv.len() == 2 { + partition_map.insert(kv[0].to_string(), kv[1].to_string()); + } } partition_map } diff --git a/src/query/storages/hive/hive/src/hive_partition_filler.rs b/src/query/storages/hive/hive/src/hive_partition_filler.rs index 41e1c51bfacd..dca5045724a7 100644 --- a/src/query/storages/hive/hive/src/hive_partition_filler.rs +++ b/src/query/storages/hive/hive/src/hive_partition_filler.rs @@ -14,14 +14,10 @@ use databend_common_exception::ErrorCode; use databend_common_exception::Result; -use databend_common_expression::types::AnyType; -use databend_common_expression::BlockEntry; -use databend_common_expression::DataBlock; +use databend_common_expression::Scalar; use databend_common_expression::TableField; -use databend_common_expression::TableSchemaRef; -use databend_common_expression::Value; -use 
crate::hive_partition::HivePartInfo; +use crate::hive_partition::parse_hive_partitions; use crate::utils::str_field_to_scalar; #[derive(Debug, Clone)] @@ -30,27 +26,20 @@ pub struct HivePartitionFiller { } impl HivePartitionFiller { - pub fn create(_schema: TableSchemaRef, partition_fields: Vec) -> Self { + pub fn create(partition_fields: Vec) -> Self { HivePartitionFiller { partition_fields } } - fn generate_value( - &self, - _num_rows: usize, - value: String, - field: &TableField, - ) -> Result> { - let value = str_field_to_scalar(&value, &field.data_type().into())?; - Ok(Value::Scalar(value)) - } - - fn extract_partition_values(&self, hive_part: &HivePartInfo) -> Result> { - let partition_map = hive_part.get_partition_map(); + pub fn extract_scalars(&self, locations: &str) -> Result> { + let partition_map = parse_hive_partitions(locations); let mut partition_values = vec![]; for field in self.partition_fields.iter() { match partition_map.get(field.name()) { - Some(v) => partition_values.push(v.to_string()), + Some(v) => { + let value = str_field_to_scalar(v.as_str(), &field.data_type().into())?; + partition_values.push(value); + } None => { return Err(ErrorCode::TableInfoError(format!( "couldn't find hive partition info :{}, hive partition maps:{:?}", @@ -62,29 +51,4 @@ impl HivePartitionFiller { } Ok(partition_values) } - - pub fn fill_data( - &self, - data_block: DataBlock, - part: &HivePartInfo, - origin_num_rows: usize, - ) -> Result { - let data_values = self.extract_partition_values(part)?; - - // create column, create datafield - let mut num_rows = data_block.num_rows(); - if num_rows == 0 { - num_rows = origin_num_rows; - } - - let mut columns = data_block.columns().to_vec(); - - for (i, field) in self.partition_fields.iter().enumerate() { - let value = &data_values[i]; - let column = self.generate_value(num_rows, value.clone(), field)?; - columns.push(BlockEntry::new(field.data_type().into(), column)); - } - - Ok(DataBlock::new(columns, num_rows)) - } } diff --git a/src/query/storages/hive/hive/src/hive_partition_pruner.rs b/src/query/storages/hive/hive/src/hive_partition_pruner.rs deleted file mode 100644 index 4b75dfbb868a..000000000000 --- a/src/query/storages/hive/hive/src/hive_partition_pruner.rs +++ /dev/null @@ -1,99 +0,0 @@ -// Copyright 2021 Datafuse Labs -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -use std::collections::HashMap; -use std::sync::Arc; -use std::vec; - -use databend_common_catalog::table_context::TableContext; -use databend_common_exception::Result; -use databend_common_expression::Expr; -use databend_common_expression::TableSchema; -use databend_storages_common_index::RangeIndex; -use databend_storages_common_table_meta::meta::ColumnStatistics; -use databend_storages_common_table_meta::meta::StatisticsOfColumns; -use log::debug; - -use crate::utils::str_field_to_scalar; - -pub struct HivePartitionPruner { - pub ctx: Arc, - pub filter: Expr, - // pub partitions: Vec, - pub partition_schema: Arc, - pub full_schema: Arc, -} - -impl HivePartitionPruner { - pub fn create( - ctx: Arc, - filter: Expr, - partition_schema: Arc, - full_schema: Arc, - ) -> Self { - HivePartitionPruner { - ctx, - filter, - partition_schema, - full_schema, - } - } - - pub fn get_column_stats(&self, partitions: &Vec) -> Result> { - let mut data = Vec::with_capacity(partitions.len()); - for partition in partitions { - let mut stats = HashMap::new(); - for (index, singe_value) in partition.split('/').enumerate() { - let kv = singe_value.split('=').collect::>(); - let field = self.partition_schema.fields()[index].clone(); - let scalar = str_field_to_scalar(kv[1], &field.data_type().into())?; - let null_count = u64::from(scalar.is_null()); - let column_stats = - ColumnStatistics::new(scalar.clone(), scalar, null_count, 0, None); - stats.insert(index as u32, column_stats); - } - data.push(stats); - } - - Ok(data) - } - - pub fn prune(&self, partitions: Vec) -> Result> { - let range_filter = RangeIndex::try_create( - self.ctx.get_function_context()?, - &self.filter, - self.full_schema.clone(), - StatisticsOfColumns::default(), - )?; - let column_stats = self.get_column_stats(&partitions)?; - let mut filtered_partitions = vec![]; - for (idx, stats) in column_stats.into_iter().enumerate() { - let block_stats = stats - .iter() - .map(|(k, v)| { - let partition_col_name = self.partition_schema.field(*k as usize).name(); - let index = self.full_schema.index_of(partition_col_name).unwrap(); - - (index as u32, v.clone()) - }) - .collect(); - - if range_filter.apply(&block_stats, |_| false)? { - filtered_partitions.push(partitions[idx].clone()); - } - } - debug!("hive pruned partitions: {:?}", filtered_partitions); - Ok(filtered_partitions) - } -} diff --git a/src/query/storages/hive/hive/src/hive_table.rs b/src/query/storages/hive/hive/src/hive_table.rs index 07d30fb070d6..12fe4e01274b 100644 --- a/src/query/storages/hive/hive/src/hive_table.rs +++ b/src/query/storages/hive/hive/src/hive_table.rs @@ -12,18 +12,18 @@ // See the License for the specific language governing permissions and // limitations under the License. 
-use std::collections::HashSet; use std::sync::Arc; use std::time::Instant; use async_recursion::async_recursion; use databend_common_base::base::tokio::sync::Semaphore; use databend_common_catalog::catalog_kind::CATALOG_HIVE; +use databend_common_catalog::partition_columns::get_pushdown_without_partition_columns; use databend_common_catalog::plan::DataSourcePlan; +use databend_common_catalog::plan::ParquetReadOptions; use databend_common_catalog::plan::PartStatistics; use databend_common_catalog::plan::Partitions; use databend_common_catalog::plan::PartitionsShuffleKind; -use databend_common_catalog::plan::Projection; use databend_common_catalog::plan::PushDownInfo; use databend_common_catalog::table::NavigationPoint; use databend_common_catalog::table::Table; @@ -35,10 +35,10 @@ use databend_common_exception::Result; use databend_common_expression::DataBlock; use databend_common_expression::DataSchema; use databend_common_expression::DataSchemaRef; -use databend_common_expression::DataSchemaRefExt; use databend_common_expression::Expr; +use databend_common_expression::FieldIndex; +use databend_common_expression::TableField; use databend_common_expression::TableSchema; -use databend_common_expression::TableSchemaRef; use databend_common_functions::BUILTIN_FUNCTIONS; use databend_common_meta_app::schema::TableInfo; use databend_common_meta_app::schema::UpdateStreamMetaReq; @@ -46,14 +46,14 @@ use databend_common_meta_app::schema::UpsertTableCopiedFileReq; use databend_common_pipeline_core::processors::OutputPort; use databend_common_pipeline_core::processors::ProcessorPtr; use databend_common_pipeline_core::Pipeline; -use databend_common_pipeline_core::SourcePipeBuilder; use databend_common_pipeline_sources::SyncSource; use databend_common_pipeline_sources::SyncSourcer; use databend_common_storage::init_operator; use databend_common_storage::DataOperator; -use databend_storages_common_index::RangeIndex; +use databend_common_storages_parquet::ParquetRSPruner; +use databend_common_storages_parquet::ParquetRSReaderBuilder; +use databend_storages_common_pruner::partition_prunner::PartitionPruner; use databend_storages_common_table_meta::meta::SnapshotId; -use databend_storages_common_table_meta::meta::StatisticsOfColumns; use databend_storages_common_table_meta::table::ChangeType; use futures::TryStreamExt; use log::info; @@ -63,13 +63,11 @@ use opendal::Metakey; use opendal::Operator; use super::hive_catalog::HiveCatalog; -use super::hive_partition_pruner::HivePartitionPruner; use super::hive_table_options::HiveTableOptions; -use crate::filter_hive_partition_from_partition_keys; -use crate::hive_parquet_block_reader::HiveBlockReader; use crate::hive_table_source::HiveTableSource; -use crate::HiveBlockFilter; -use crate::HiveFileSplitter; +use crate::utils::HiveFetchPartitionScalars; +use crate::HivePartInfo; +use crate::HivePartitionFiller; pub const HIVE_TABLE_ENGINE: &str = "hive"; pub const HIVE_DEFAULT_PARTITION: &str = "__HIVE_DEFAULT_PARTITION__"; @@ -96,283 +94,121 @@ impl HiveTable { }) } - fn get_block_filter( - &self, - ctx: Arc, - push_downs: &Option, - ) -> Result> { - let enable_hive_parquet_predict_pushdown = ctx - .get_settings() - .get_enable_hive_parquet_predict_pushdown()?; - - if enable_hive_parquet_predict_pushdown == 0 { - return Ok(Arc::new(HiveBlockFilter::create( - None, - vec![], - self.table_info.schema(), - ))); - } - - let filter_expression = push_downs.as_ref().and_then(|extra| { - extra - .filters - .as_ref() - .map(|filter| 
filter.filter.as_expr(&BUILTIN_FUNCTIONS)) - }); - - let range_filter = match filter_expression { - Some(expr) => Some(RangeIndex::try_create( - ctx.get_function_context()?, - &expr, - self.table_info.schema(), - StatisticsOfColumns::default(), - )?), - _ => None, - }; - - let projection = self.get_projections(push_downs)?; - let mut projection_fields = vec![]; - let schema = self.table_info.schema(); - for i in projection.into_iter() { - let field = schema.field(i); - projection_fields.push(field.clone()); - } - - Ok(Arc::new(HiveBlockFilter::create( - range_filter, - projection_fields, - self.table_info.schema(), - ))) - } - - fn is_prewhere_column_partition_keys( - &self, - schema: TableSchemaRef, - push_downs: &Option, - ) -> Result { - match push_downs { - None => Ok(false), - Some(p) => match &p.prewhere { - None => Ok(false), - Some(prewhere_info) => match &prewhere_info.prewhere_columns { - Projection::Columns(projections) => { - let partition_keys = &self.table_options.partition_keys; - let (not_partitions, _) = filter_hive_partition_from_partition_keys( - schema, - projections.clone(), - partition_keys, - ); - Ok(not_partitions.is_empty()) - } - Projection::InnerColumns(_) => { - Err(ErrorCode::Unimplemented("not support intercolumns")) - } - }, - }, - } + fn partition_fields(&self) -> Vec { + self.schema() + .fields() + .iter() + .filter(|field| { + self.table_options + .partition_keys + .as_ref() + .map(|ks| ks.contains(&field.name)) + .unwrap_or_default() + }) + .cloned() + .collect() } - #[inline] - pub fn do_read2( + fn no_partition_schema(&self) -> Arc { + let non_partition_fields = self + .schema() + .fields() + .iter() + .filter(|field| { + !self + .table_options + .partition_keys + .as_ref() + .map(|ks| ks.contains(&field.name)) + .unwrap_or_default() + }) + .cloned() + .collect(); + Arc::new(TableSchema::new(non_partition_fields)) + } + + pub fn do_read_data( &self, ctx: Arc, plan: &DataSourcePlan, pipeline: &mut Pipeline, ) -> Result<()> { - let push_downs = &plan.push_downs; - let chunk_size = ctx.get_settings().get_hive_parquet_chunk_size()? as usize; - let parts_len = plan.parts.len(); let max_threads = ctx.get_settings().get_max_threads()? 
as usize; let max_threads = std::cmp::min(parts_len, max_threads); + let table_schema = self.no_partition_schema(); - let mut source_builder = SourcePipeBuilder::create(); - let delay_timer = if self.is_simple_select_query(plan) { - // 0, 0, 200, 200, 400,400 - |x: usize| (x / 2).min(10) * 200 - } else { - |_| 0 - }; + let arrow_schema = table_schema.as_ref().into(); + let leaf_fields = Arc::new(table_schema.leaf_fields()); - let output_schema = Arc::new(DataSchema::from(plan.schema())); + let mut read_options = ParquetReadOptions::default(); - let prewhere_all_partitions = - self.is_prewhere_column_partition_keys(self.table_info.schema(), &plan.push_downs)?; - // create prewhere&remaindata block reader - let prewhere_reader = - self.build_prewhere_reader(plan, chunk_size, prewhere_all_partitions)?; - let remain_reader = self.build_remain_reader(plan, chunk_size, prewhere_all_partitions)?; - let prewhere_filter = - self.build_prewhere_filter_executor(plan, prewhere_reader.get_output_schema())?; - - let hive_block_filter = self.get_block_filter(ctx.clone(), push_downs)?; - - let mut src_fields = prewhere_reader.get_output_schema().fields().clone(); - if let Some(reader) = remain_reader.as_ref() { - let remain_field = reader.get_output_schema().fields().clone(); - src_fields.extend_from_slice(&remain_field); + if !ctx.get_settings().get_enable_parquet_page_index()? { + read_options = read_options.with_prune_pages(false); } - let src_schema = DataSchemaRefExt::create(src_fields); - for index in 0..std::cmp::max(1, max_threads) { - let output = OutputPort::create(); - source_builder.add_source( - output.clone(), - HiveTableSource::create( - ctx.clone(), - self.dal.clone(), - output, - prewhere_reader.clone(), - remain_reader.clone(), - prewhere_filter.clone(), - delay_timer(index), - hive_block_filter.clone(), - src_schema.clone(), - output_schema.clone(), - )?, - ); + if !ctx.get_settings().get_enable_parquet_rowgroup_pruning()? { + read_options = read_options.with_prune_row_groups(false); } - pipeline.add_pipe(source_builder.finalize()); - Ok(()) - } - - // simple select query is the sql likes `select * from xx limit 10` or - // `select * from xx where p_date = '20220201' limit 10` where p_date is a partition column; - // we just need to read a few data from table - fn is_simple_select_query(&self, plan: &DataSourcePlan) -> bool { - // couldn't get groupby order by info - if let Some(PushDownInfo { - filters, - limit: Some(lm), - .. - }) = &plan.push_downs - { - if *lm > 100000 { - return false; - } - - // filter out the partition column related expressions - let partition_keys = self.get_partition_key_sets(); - let columns = filters - .as_ref() - .map(|f| { - let expr = f.filter.as_expr(&BUILTIN_FUNCTIONS); - expr.column_refs().keys().cloned().collect::>() - }) - .unwrap_or_default(); - - if columns.difference(&partition_keys).count() == 0 { - return true; - } + if !ctx.get_settings().get_enable_parquet_prewhere()? 
{ + read_options = read_options.with_do_prewhere(false); } - false - } - fn get_partition_key_sets(&self) -> HashSet { - self.table_options + let pruner = ParquetRSPruner::try_create( + ctx.get_function_context()?, + table_schema.clone(), + leaf_fields, + &plan.push_downs, + read_options, + self.table_options + .partition_keys + .clone() + .unwrap_or_default(), + )?; + + let op = self.dal.clone(); + + let partition_keys = self + .table_options .partition_keys .clone() - .unwrap_or_default() - .into_iter() - .collect() - } - - fn get_projections(&self, push_downs: &Option) -> Result> { - if let Some(PushDownInfo { - projection: Some(prj), - .. - }) = push_downs - { - match prj { - Projection::Columns(indices) => Ok(indices.clone()), - Projection::InnerColumns(_) => Err(ErrorCode::Unimplemented( - "does not support projection inner columns", - )), - } + .unwrap_or_default(); + + let partition_field_indexes: Result> = partition_keys + .iter() + .map(|name| self.schema().index_of(name)) + .collect(); + let partition_field_indexes = partition_field_indexes?; + let push_downs = if let Some(ref p) = plan.push_downs { + Some(get_pushdown_without_partition_columns( + p.clone(), + &partition_field_indexes[..], + )?) } else { - let col_ids = (0..self.table_info.schema().fields().len()).collect::>(); - Ok(col_ids) - } - } + None + }; + let mut builder = + ParquetRSReaderBuilder::create(ctx.clone(), op, table_schema, arrow_schema)? + .with_options(read_options) + .with_push_downs(push_downs.as_ref()) + .with_pruner(Some(pruner)) + .with_partition_columns(partition_keys); - // Build the prewhere reader. - fn build_prewhere_reader( - &self, - plan: &DataSourcePlan, - chunk_size: usize, - prewhere_all_partitions: bool, - ) -> Result> { - match ( - prewhere_all_partitions, - PushDownInfo::prewhere_of_push_downs(plan.push_downs.as_ref()), - ) { - (true, _) | (_, None) => { - let projection = PushDownInfo::projection_of_push_downs( - &plan.schema(), - plan.push_downs.as_ref(), - ); - HiveBlockReader::create( - self.dal.clone(), - self.table_info.schema(), - projection, - &self.table_options.partition_keys, - chunk_size, - ) - } - (false, Some(v)) => HiveBlockReader::create( - self.dal.clone(), - self.table_info.schema(), - v.prewhere_columns, - &self.table_options.partition_keys, - chunk_size, - ), - } - } + let parquet_reader = Arc::new(builder.build_full_reader()?); - // Build the prewhere filter executor. - fn build_prewhere_filter_executor( - &self, - plan: &DataSourcePlan, - schema: DataSchemaRef, - ) -> Result>> { - Ok(Arc::new( - PushDownInfo::prewhere_of_push_downs(plan.push_downs.as_ref()).map(|v| { - v.filter - .as_expr(&BUILTIN_FUNCTIONS) - .project_column_ref(|name| schema.index_of(name).unwrap()) - }), - )) - } - - // Build the remain reader. 
- fn build_remain_reader( - &self, - plan: &DataSourcePlan, - chunk_size: usize, - prewhere_all_partitions: bool, - ) -> Result>> { - Ok( - match ( - prewhere_all_partitions, - PushDownInfo::prewhere_of_push_downs(plan.push_downs.as_ref()), - ) { - (true, _) | (_, None) => Arc::new(None), - (false, Some(v)) => { - if v.remain_columns.is_empty() { - Arc::new(None) - } else { - let reader = HiveBlockReader::create( - self.dal.clone(), - self.table_info.schema(), - v.remain_columns, - &self.table_options.partition_keys, - chunk_size, - )?; - Arc::new(Some((*reader).clone())) - } - } + let output_schema = Arc::new(DataSchema::from(plan.schema())); + pipeline.add_source( + |output| { + HiveTableSource::create( + ctx.clone(), + output, + output_schema.clone(), + parquet_reader.clone(), + self.partition_fields(), + ) }, + max_threads.max(1), ) } @@ -415,9 +251,14 @@ impl HiveTable { if let Some(expr) = filter_expression { let partition_schemas = self.get_column_schemas(partition_keys.clone())?; - let partition_pruner = - HivePartitionPruner::create(ctx, expr, partition_schemas, self.table_info.schema()); - partition_names = partition_pruner.prune(partition_names)?; + let partition_pruner = PartitionPruner::try_create( + ctx.get_function_context()?, + expr, + partition_schemas, + self.table_info.schema(), + )?; + partition_names = + partition_pruner.prune::(partition_names)?; } trace!( @@ -479,7 +320,7 @@ impl HiveTable { async fn list_files_from_dirs( &self, dirs: Vec<(String, Option)>, - ) -> Result> { + ) -> Result> { let sem = Arc::new(Semaphore::new(60)); let mut tasks = Vec::with_capacity(dirs.len()); @@ -494,12 +335,9 @@ impl HiveTable { } let mut all_files = vec![]; - for (task, partition) in tasks { + for (task, _) in tasks { let files = task.await.unwrap()?; - for mut file in files { - file.add_partition(partition.clone()); - all_files.push(file); - } + all_files.extend_from_slice(&files); } Ok(all_files) @@ -516,11 +354,14 @@ impl HiveTable { let dirs = self.get_query_locations(ctx.clone(), &push_downs).await?; trace!("{} query locations: {:?}", dirs.len(), dirs); - let all_files = self.list_files_from_dirs(dirs).await?; - trace!("{} hive files: {:?}", all_files.len(), all_files); + let dir_len = dirs.len(); + let filler = HivePartitionFiller::create(self.partition_fields()); + let mut partitions = self.list_files_from_dirs(dirs).await?; + for partition in partitions.iter_mut() { + partition.partitions = filler.extract_scalars(&partition.filename)?; + } - let splitter = HiveFileSplitter::create(128 * 1024 * 1024_u64); - let partitions = splitter.get_splits(all_files); + trace!("{} hive files: {:?}", partitions.len(), partitions); info!( "read partition, partition num:{}, elapsed:{:?}", @@ -528,8 +369,26 @@ impl HiveTable { start.elapsed() ); + let estimated_read_rows: f64 = partitions + .iter() + .map(|s| s.filesize as f64 / (self.schema().num_fields() * 8) as f64) + .sum(); + + let read_bytes = partitions.iter().map(|s| s.filesize as usize).sum(); + let stats = PartStatistics::new_estimated( + None, + estimated_read_rows as _, + read_bytes, + partitions.len(), + dir_len, + ); + let partitions = partitions + .into_iter() + .map(HivePartInfo::into_part_ptr) + .collect(); + Ok(( - Default::default(), + stats, Partitions::create(PartitionsShuffleKind::Seq, partitions), )) } @@ -578,7 +437,7 @@ impl Table for HiveTable { pipeline: &mut Pipeline, _put_cache: bool, ) -> Result<()> { - self.do_read2(ctx, plan, pipeline) + self.do_read_data(ctx, plan, pipeline) } fn commit_insertion( @@ -665,27 
+524,6 @@ impl SyncSource for HiveSource { } } -#[derive(Debug)] -pub struct HiveFileInfo { - pub filename: String, - pub length: u64, - pub partition: Option, -} - -impl HiveFileInfo { - pub fn create(filename: String, length: u64) -> Self { - HiveFileInfo { - filename, - length, - partition: None, - } - } - - pub fn add_partition(&mut self, partition: Option) { - self.partition = partition; - } -} - // convert hdfs path format to opendal path formatted // // there are two rules: @@ -731,7 +569,7 @@ async fn list_files_from_dir( operator: Operator, location: String, sem: Arc, -) -> Result> { +) -> Result> { let (files, dirs) = do_list_files_from_dir(operator.clone(), location, sem.clone()).await?; let mut all_files = files; let mut tasks = Vec::with_capacity(dirs.len()); @@ -759,7 +597,7 @@ async fn do_list_files_from_dir( operator: Operator, location: String, sem: Arc, -) -> Result<(Vec, Vec)> { +) -> Result<(Vec, Vec)> { let _a = sem.acquire().await.unwrap(); let mut m = operator .lister_with(&location) @@ -779,9 +617,9 @@ async fn do_list_files_from_dir( match meta.mode() { EntryMode::FILE => { - let filename = path.to_string(); + let location = path.to_string(); let length = meta.content_length(); - all_files.push(HiveFileInfo::create(filename, length)); + all_files.push(HivePartInfo::create(location, vec![], length)); } EntryMode::DIR => { all_dirs.push(path.to_string()); diff --git a/src/query/storages/hive/hive/src/hive_table_source.rs b/src/query/storages/hive/hive/src/hive_table_source.rs index db18d0180cc5..ae961f73e2d2 100644 --- a/src/query/storages/hive/hive/src/hive_table_source.rs +++ b/src/query/storages/hive/hive/src/hive_table_source.rs @@ -14,255 +14,93 @@ use std::any::Any; use std::sync::Arc; -use std::vec; -use databend_common_base::base::tokio::time::sleep; -use databend_common_base::base::tokio::time::Duration; use databend_common_base::base::Progress; use databend_common_base::base::ProgressValues; use databend_common_base::runtime::profile::Profile; use databend_common_base::runtime::profile::ProfileStatisticsName; -use databend_common_catalog::plan::PartInfoPtr; use databend_common_catalog::table_context::TableContext; use databend_common_exception::ErrorCode; use databend_common_exception::Result; -use databend_common_expression::filter_helper::FilterHelpers; -use databend_common_expression::types::BooleanType; -use databend_common_expression::types::DataType; +use databend_common_expression::BlockEntry; use databend_common_expression::DataBlock; +use databend_common_expression::DataSchema; use databend_common_expression::DataSchemaRef; -use databend_common_expression::Evaluator; -use databend_common_expression::Expr; +use databend_common_expression::FieldIndex; +use databend_common_expression::TableField; use databend_common_expression::Value; -use databend_common_functions::BUILTIN_FUNCTIONS; use databend_common_pipeline_core::processors::Event; use databend_common_pipeline_core::processors::OutputPort; use databend_common_pipeline_core::processors::Processor; use databend_common_pipeline_core::processors::ProcessorPtr; -use log::debug; -use opendal::Operator; +use databend_common_storages_parquet::ParquetFileReader; +use databend_common_storages_parquet::ParquetRSFullReader; +use parquet::arrow::async_reader::ParquetRecordBatchStream; -use crate::hive_parquet_block_reader::DataBlockDeserializer; -use crate::hive_parquet_block_reader::HiveBlockReader; -use crate::HiveBlockFilter; -use crate::HiveBlocks; use crate::HivePartInfo; -struct PreWhereData { - 
data_blocks: Vec, - valids: Vec>, -} - -enum State { - /// Read parquet file meta data - /// IO bound - ReadMeta(Option), - - /// Read prewhere blocks from data groups (without deserialization) - /// IO bound - ReadPrewhereData(HiveBlocks), - - /// Read remain blocks from data groups (without deserialization) - /// IO bound - ReadRemainData(HiveBlocks, PreWhereData), - - /// do prewhere filter on prewhere data, if data are filtered, trans to Generated state with empty datablocks, - /// else trans to ReadRemainData - /// CPU bound - PrewhereFilter(HiveBlocks, DataBlockDeserializer), - - /// Deserialize remain block from the given data groups, concat prewhere and remain data blocks - /// CPU bound - Deserialize(HiveBlocks, DataBlockDeserializer, PreWhereData), - - /// indicates that data blocks are ready, and needs to be consumed - Generated(HiveBlocks, Vec), - Finish, -} +pub type PartitionColumnIndex = usize; pub struct HiveTableSource { - state: State, - ctx: Arc, - dal: Operator, - scan_progress: Arc, - prewhere_block_reader: Arc, - remain_reader: Arc>, - prewhere_filter: Arc>, output: Arc, - delay: usize, - hive_block_filter: Arc, + generated_data: Option, + is_finished: bool, - /// The schema before output. Some fields might be removed when outputting. - source_schema: DataSchemaRef, - /// The final output schema + scan_progress: Arc, + // Used for get partition + ctx: Arc, + + // Used to read parquet file. + parquet_reader: Arc, + + // Used to insert partition_block_entries to data block + // FieldIndex is the index in the output_schema + // PartitionColumnIndex is the index of in partition_fields and partition_block_entries + // order by FieldIndex so we can insert in order + output_partition_columns: Vec<(FieldIndex, PartitionColumnIndex)>, + partition_fields: Vec, + // Used to check schema output_schema: DataSchemaRef, + + // Per partition + stream: Option>, + partition_block_entries: Vec, } impl HiveTableSource { - #[allow(clippy::too_many_arguments)] pub fn create( ctx: Arc, - dal: Operator, output: Arc, - prewhere_block_reader: Arc, - remain_reader: Arc>, - prewhere_filter: Arc>, - delay: usize, - hive_block_filter: Arc, - source_schema: DataSchemaRef, output_schema: DataSchemaRef, + parquet_reader: Arc, + partition_fields: Vec, ) -> Result { + let output_partition_columns = output_schema + .fields() + .iter() + .enumerate() + .filter_map(|(fi, f)| { + partition_fields + .iter() + .position(|p| p.name() == f.name()) + .map(|pi| (fi, pi)) + }) + .collect(); let scan_progress = ctx.get_scan_progress(); Ok(ProcessorPtr::create(Box::new(HiveTableSource { - ctx, - dal, output, - prewhere_block_reader, - remain_reader, - prewhere_filter, - hive_block_filter, scan_progress, - state: State::ReadMeta(None), - delay, - source_schema, + ctx, + parquet_reader, output_schema, + partition_fields, + output_partition_columns, + stream: None, + generated_data: None, + is_finished: false, + partition_block_entries: vec![], }))) } - - fn try_get_partitions(&mut self) { - self.state = self - .ctx - .get_partition() - .map_or(State::Finish, |part_info| State::ReadMeta(Some(part_info))); - } - - fn exec_prewhere_filter( - &self, - filter: &Expr, - data_blocks: &Vec, - ) -> Result<(bool, Vec>)> { - assert_eq!(filter.data_type(), &DataType::Boolean); - - let mut valids = vec![]; - let mut exists = false; - let func_ctx = self.ctx.get_function_context()?; - for datablock in data_blocks { - let evaluator = Evaluator::new(datablock, &func_ctx, &BUILTIN_FUNCTIONS); - let predicates = evaluator - 
.run(filter) - .map_err(|e| e.add_message("eval prewhere filter failed:"))? - .try_downcast::() - .unwrap(); - - // shortcut, if predicates is const boolean (or can be cast to boolean) - if !FilterHelpers::is_all_unset(&predicates) { - exists = true; - } - - valids.push(predicates); - } - - assert_eq!(data_blocks.len(), valids.len()); - - Ok((exists, valids)) - } - - fn do_prewhere_filter( - &mut self, - hive_blocks: HiveBlocks, - rowgroup_deserializer: DataBlockDeserializer, - ) -> Result<()> { - // 1. deserialize chunks to datablocks - let prewhere_datablocks = self - .prewhere_block_reader - .get_all_datablocks(rowgroup_deserializer, &hive_blocks.part)?; - - let progress_values = ProgressValues { - rows: prewhere_datablocks.iter().map(|x| x.num_rows()).sum(), - bytes: prewhere_datablocks.iter().map(|x| x.memory_size()).sum(), - }; - Profile::record_usize_profile(ProfileStatisticsName::ScanBytes, progress_values.bytes); - self.scan_progress.incr(&progress_values); - - if let Some(filter) = self.prewhere_filter.as_ref() { - // 2. do filter - let (exists, valids) = self.exec_prewhere_filter(filter, &prewhere_datablocks)?; - // 3. if all data filter out, try next rowgroup, trans to prewhere data - if !exists { - // all rows in this block are filtered out - // turn to begin the next state cycle. - // Generate a empty block. - self.state = State::Generated(hive_blocks, vec![]); - return Ok(()); - } - // 4. if remain block is non, trans to generated state - if self.remain_reader.is_none() { - let prewhere_datablocks = prewhere_datablocks - .into_iter() - .zip(valids.iter()) - .map(|(datablock, valid)| { - let datablock = DataBlock::filter_boolean_value(datablock, valid).unwrap(); - datablock - .resort(&self.source_schema, &self.output_schema) - .unwrap() - }) - .filter(|x| !x.is_empty()) - .collect(); - - self.state = State::Generated(hive_blocks, prewhere_datablocks); - } else { - // 5. if not all data filter out, and remain block reader is not non, trans to read remain - self.state = State::ReadRemainData(hive_blocks, PreWhereData { - data_blocks: prewhere_datablocks, - valids, - }); - } - } else { - // if no prewhere filter, data should be all fetched in prewhere state - self.state = State::Generated(hive_blocks, prewhere_datablocks); - } - - Ok(()) - } - - fn do_deserialize( - &mut self, - hive_blocks: HiveBlocks, - rowgroup_deserializer: DataBlockDeserializer, - prewhere_data: PreWhereData, - ) -> Result<()> { - let datablocks = if let Some(remain_reader) = self.remain_reader.as_ref() { - // 1. deserialize all remain data block - let remain_datablocks = - remain_reader.get_all_datablocks(rowgroup_deserializer, &hive_blocks.part)?; - // 2. concat prewhere and remain datablock(may be none) - assert_eq!(remain_datablocks.len(), prewhere_data.data_blocks.len()); - - let allblocks = remain_datablocks - .iter() - .zip(prewhere_data.data_blocks.iter()) - .zip(prewhere_data.valids.iter()) - .map(|((r, p), v)| { - // do merge block - assert_eq!(r.num_rows(), p.num_rows()); - let mut a = p.clone(); - for column in r.columns().iter() { - a.add_column(column.clone()); - } - let a = DataBlock::filter_boolean_value(a, v).unwrap(); - a.resort(&self.source_schema, &self.output_schema).unwrap() - }) - .filter(|x| !x.is_empty()) - .collect::>(); - allblocks - } else { - return Err(ErrorCode::Internal("It's a bug. 
No remain reader")); - }; - - // 3 trans to generate state - self.state = State::Generated(hive_blocks, datablocks); - Ok(()) - } } #[async_trait::async_trait] @@ -276,8 +114,9 @@ impl Processor for HiveTableSource { } fn event(&mut self) -> Result { - if matches!(self.state, State::ReadMeta(None)) { - self.try_get_partitions(); + if self.is_finished { + self.output.finish(); + return Ok(Event::Finished); } if self.output.is_finished() { @@ -288,114 +127,108 @@ impl Processor for HiveTableSource { return Ok(Event::NeedConsume); } - if matches!(self.state, State::Generated(_, _)) { - if let State::Generated(mut hive_blocks, mut data_blocks) = - std::mem::replace(&mut self.state, State::Finish) - { - // 1. consume all generated blocks, - if let Some(data_block) = data_blocks.pop() { - self.output.push_data(Ok(data_block)); - // 2. if not all consumed, retain generated state - self.state = State::Generated(hive_blocks, data_blocks); - return Ok(Event::NeedConsume); - } - - // 3. if all consumed, try next rowgroup - hive_blocks.advance(); - match hive_blocks.has_blocks() { - true => { - self.state = State::ReadPrewhereData(hive_blocks); - } - false => { - self.try_get_partitions(); - } - } - } - } - - match self.state { - State::Finish => { - self.output.finish(); - Ok(Event::Finished) - } - State::ReadMeta(_) => Ok(Event::Async), - State::ReadPrewhereData(_) => Ok(Event::Async), - State::ReadRemainData(_, _) => Ok(Event::Async), - State::PrewhereFilter(_, _) => Ok(Event::Sync), - State::Deserialize(_, _, _) => Ok(Event::Sync), - State::Generated(_, _) => Err(ErrorCode::Internal("It's a bug.")), - } - } - - fn process(&mut self) -> Result<()> { - match std::mem::replace(&mut self.state, State::Finish) { - State::PrewhereFilter(hive_blocks, rowgroup_deserializer) => { - self.do_prewhere_filter(hive_blocks, rowgroup_deserializer) - } - State::Deserialize(hive_blocks, rowgroup_deserializer, prewhere_data) => { - self.do_deserialize(hive_blocks, rowgroup_deserializer, prewhere_data) + match self.generated_data.take() { + None => Ok(Event::Async), + Some(data_block) => { + let progress_values = ProgressValues { + rows: data_block.num_rows(), + bytes: data_block.memory_size(), + }; + self.scan_progress.incr(&progress_values); + Profile::record_usize_profile( + ProfileStatisticsName::ScanBytes, + data_block.memory_size(), + ); + self.output.push_data(Ok(data_block)); + Ok(Event::NeedConsume) } - _ => Err(ErrorCode::Internal("It's a bug.")), } } #[async_backtrace::framed] async fn async_process(&mut self) -> Result<()> { - match std::mem::replace(&mut self.state, State::Finish) { - State::ReadMeta(Some(part)) => { - if self.delay > 0 { - sleep(Duration::from_millis(self.delay as u64)).await; - debug!("sleep for {}ms", self.delay); - self.delay = 0; - } - let part = HivePartInfo::from_part(&part)?; - let file_meta = self - .prewhere_block_reader - .read_meta_data(self.dal.clone(), &part.filename, part.filesize) - .await?; - let mut hive_blocks = - HiveBlocks::create(file_meta, part.clone(), self.hive_block_filter.clone()); - - match hive_blocks.prune() { - true => { - self.state = State::ReadPrewhereData(hive_blocks); - } - false => { - self.try_get_partitions(); + if let Some(mut stream) = self.stream.take() { + if let Some(block) = self + .parquet_reader + .read_block_from_stream(&mut stream) + .await? 
+ .map(|b| { + let mut columns = b.columns().to_vec(); + for (fi, pi) in self.output_partition_columns.iter() { + columns.insert(*fi, self.partition_block_entries[*pi].clone()); } - } - Ok(()) - } - State::ReadPrewhereData(hive_blocks) => { - let row_group = hive_blocks.get_current_row_group_meta_data(); - let part = hive_blocks.get_part_info(); - let chunks = self - .prewhere_block_reader - .read_columns_data(row_group, &part) - .await?; - let rowgroup_deserializer = self - .prewhere_block_reader - .create_rowgroup_deserializer(chunks, row_group)?; - self.state = State::PrewhereFilter(hive_blocks, rowgroup_deserializer); - Ok(()) + DataBlock::new(columns, b.num_rows()) + }) + .map(|b| check_block_schema(&self.output_schema, b)) + .transpose()? + { + self.generated_data = Some(block); + self.stream = Some(stream); } + // else: + // If `read_block` returns `None`, it means the stream is finished. + // And we should try to build another stream (in next event loop). + } else if let Some(part) = self.ctx.get_partition() { + let part = HivePartInfo::from_part(&part)?; + let partition_fields = self + .partition_fields + .iter() + .cloned() + .zip(part.partitions.iter().cloned()) + .collect::>(); + self.partition_block_entries = partition_fields + .iter() + .map(|(f, v)| BlockEntry::new(f.data_type().into(), Value::Scalar(v.clone()))) + .collect::>(); + let stream = self + .parquet_reader + .prepare_data_stream(&part.filename, part.filesize, Some(&partition_fields)) + .await?; + self.stream = Some(stream); + } else { + self.is_finished = true; + } - State::ReadRemainData(hive_blocks, prewhere_data) => { - let row_group = hive_blocks.get_current_row_group_meta_data(); - let part = hive_blocks.get_part_info(); + Ok(()) + } +} - if let Some(remain_reader) = self.remain_reader.as_ref() { - let chunks = remain_reader.read_columns_data(row_group, &part).await?; - let rowgroup_deserializer = - remain_reader.create_rowgroup_deserializer(chunks, row_group)?; - self.state = - State::Deserialize(hive_blocks, rowgroup_deserializer, prewhere_data); - Ok(()) - } else { - Err(ErrorCode::Internal("It's a bug. No remain reader")) - } - } - _ => Err(ErrorCode::Internal("It's a bug.")), +fn check_block_schema(schema: &DataSchema, mut block: DataBlock) -> Result { + // Check if the schema of the data block is matched with the schema of the table. + if block.num_columns() != schema.num_fields() { + return Err(ErrorCode::TableSchemaMismatch(format!( + "Data schema mismatched. Data columns length: {}, schema fields length: {}", + block.num_columns(), + schema.num_fields() + ))); + } + + for (col, field) in block.columns_mut().iter_mut().zip(schema.fields().iter()) { + // If the actual data is nullable, the field must be nullbale. + if col.data_type.is_nullable_or_null() && !field.is_nullable() { + return Err(ErrorCode::TableSchemaMismatch(format!( + "Data schema mismatched (col name: {}). Data column is nullable, but schema field is not nullable", + field.name() + ))); + } + // The inner type of the data and field should be the same. + let data_type = col.data_type.remove_nullable(); + let schema_type = field.data_type().remove_nullable(); + if data_type != schema_type { + return Err(ErrorCode::TableSchemaMismatch(format!( + "Data schema mismatched (col name: {}). Data column type is {:?}, but schema field type is {:?}", + field.name(), + col.data_type, + field.data_type() + ))); + } + // If the field is nullable but the actual data is not nullable, + // we should wrap nullable for the data. 
+ if field.is_nullable() && !col.data_type.is_nullable_or_null() { + col.data_type = col.data_type.wrap_nullable(); + col.value = col.value.clone().wrap_nullable(None); } } + + Ok(block) } diff --git a/src/query/storages/hive/hive/src/lib.rs b/src/query/storages/hive/hive/src/lib.rs index 189f90365ea0..9989e1c7bc75 100644 --- a/src/query/storages/hive/hive/src/lib.rs +++ b/src/query/storages/hive/hive/src/lib.rs @@ -17,29 +17,17 @@ #![allow(clippy::diverging_sub_expression)] mod converters; -mod hive_block_filter; -mod hive_blocks; mod hive_catalog; mod hive_database; -mod hive_file_splitter; -mod hive_meta_data_reader; -mod hive_parquet_block_reader; mod hive_partition; mod hive_partition_filler; -mod hive_partition_pruner; mod hive_table; mod hive_table_options; mod hive_table_source; mod utils; -pub use hive_block_filter::HiveBlockFilter; -pub use hive_blocks::HiveBlocks; pub use hive_catalog::HiveCatalog; pub use hive_catalog::HiveCreator; -pub use hive_file_splitter::HiveFileSplitter; -pub use hive_meta_data_reader::MetaDataReader; -pub use hive_parquet_block_reader::filter_hive_partition_from_partition_keys; pub use hive_partition::HivePartInfo; pub use hive_partition_filler::HivePartitionFiller; -pub use hive_table::HiveFileInfo; pub use hive_table::HiveTable; diff --git a/src/query/storages/hive/hive/src/utils.rs b/src/query/storages/hive/hive/src/utils.rs index c176fe165691..8c5eedcaab8f 100644 --- a/src/query/storages/hive/hive/src/utils.rs +++ b/src/query/storages/hive/hive/src/utils.rs @@ -14,13 +14,13 @@ use std::fmt::Debug; -use databend_common_base::base::OrderedFloat; +use databend_common_catalog::partition_columns::str_to_scalar; use databend_common_exception::ErrorCode; use databend_common_exception::Result; -use databend_common_expression::types::number::NumberScalar; use databend_common_expression::types::DataType; -use databend_common_expression::types::NumberDataType; use databend_common_expression::Scalar; +use databend_common_expression::TableField; +use databend_storages_common_pruner::partition_prunner::FetchPartitionScalars; use volo_thrift::MaybeException; use crate::hive_table::HIVE_DEFAULT_PARTITION; @@ -34,53 +34,36 @@ pub(crate) fn str_field_to_scalar(value: &str, data_type: &DataType) -> Result Ok(Scalar::String(value.to_string())), - DataType::Number(num_ty) => match num_ty { - NumberDataType::UInt8 => { - let num = value.parse::().unwrap(); - Ok(Scalar::Number(NumberScalar::UInt8(num))) - } - NumberDataType::UInt16 => { - let num = value.parse::().unwrap(); - Ok(Scalar::Number(NumberScalar::UInt16(num))) - } - NumberDataType::UInt32 => { - let num = value.parse::().unwrap(); - Ok(Scalar::Number(NumberScalar::UInt32(num))) - } - NumberDataType::UInt64 => { - let num = value.parse::().unwrap(); - Ok(Scalar::Number(NumberScalar::UInt64(num))) - } - NumberDataType::Int8 => { - let num = value.parse::().unwrap(); - Ok(Scalar::Number(NumberScalar::Int8(num))) - } - NumberDataType::Int16 => { - let num = value.parse::().unwrap(); - Ok(Scalar::Number(NumberScalar::Int16(num))) - } - NumberDataType::Int32 => { - let num = value.parse::().unwrap(); - Ok(Scalar::Number(NumberScalar::Int32(num))) - } - NumberDataType::Int64 => { - let num = value.parse::().unwrap(); - Ok(Scalar::Number(NumberScalar::Int64(num))) - } - NumberDataType::Float32 => { - let num = value.parse::().unwrap(); - Ok(Scalar::Number(NumberScalar::Float32(OrderedFloat(num)))) - } - NumberDataType::Float64 => { - let num = value.parse::().unwrap(); - 
Ok(Scalar::Number(NumberScalar::Float64(OrderedFloat(num)))) + _ => str_to_scalar(value, data_type), + } +} + +pub struct HiveFetchPartitionScalars; + +impl FetchPartitionScalars for HiveFetchPartitionScalars { + fn eval(value: &String, partition_fields: &[TableField]) -> Result> { + let mut res = Vec::new(); + let v = value.split('/'); + let mut idx = 0; + for singe_value in v { + let kv = singe_value.split('=').collect::>(); + if kv.len() == 2 { + let field = &partition_fields[idx]; + let scalar = str_field_to_scalar(kv[1], &field.data_type().into())?; + res.push(scalar); + idx += 1; } - }, - _ => Err(ErrorCode::Unimplemented(format!( - "generate scalar failed, {:?}", - data_type - ))), + } + if res.len() != partition_fields.len() { + Err(ErrorCode::ParquetFileInvalid(format!( + "Partition values mismatch, expect {}, got {} in {}", + partition_fields.len(), + res.len(), + value + ))) + } else { + Ok(res) + } } } diff --git a/src/query/storages/hive/hive/tests/it/hive_file_splitter.rs b/src/query/storages/hive/hive/tests/it/hive_file_splitter.rs deleted file mode 100644 index cf77bcf40614..000000000000 --- a/src/query/storages/hive/hive/tests/it/hive_file_splitter.rs +++ /dev/null @@ -1,30 +0,0 @@ -// Copyright 2021 Datafuse Labs -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -use databend_common_storages_hive::HiveFileSplitter; - -#[test] -fn test_splitter() { - let splitter = HiveFileSplitter::create(1024); - assert_eq!(splitter.split_length(1), vec![0..2]); - assert_eq!(splitter.split_length(1024), vec![0..1025]); - assert_eq!(splitter.split_length(1100), vec![0..1101]); - assert_eq!(splitter.split_length(1500), vec![0..1024, 1024..1501]); - assert_eq!(splitter.split_length(2048), vec![0..1024, 1024..2049]); - assert_eq!(splitter.split_length(3000), vec![ - 0..1024, - 1024..2048, - 2048..3001 - ]); -} diff --git a/src/query/storages/hive/hive/tests/it/main.rs b/src/query/storages/hive/hive/tests/it/main.rs deleted file mode 100644 index 78f1d926806f..000000000000 --- a/src/query/storages/hive/hive/tests/it/main.rs +++ /dev/null @@ -1,15 +0,0 @@ -// Copyright 2021 Datafuse Labs -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -mod hive_file_splitter; diff --git a/src/query/storages/parquet/Cargo.toml b/src/query/storages/parquet/Cargo.toml index 2fa779fb9bf4..1d0a96c05a54 100644 --- a/src/query/storages/parquet/Cargo.toml +++ b/src/query/storages/parquet/Cargo.toml @@ -29,6 +29,7 @@ databend-common-metrics = { workspace = true } databend-common-pipeline-core = { workspace = true } databend-common-settings = { workspace = true } databend-common-storage = { workspace = true } +databend-storages-common-cache = { workspace = true } databend-storages-common-pruner = { workspace = true } databend-storages-common-stage = { workspace = true } databend-storages-common-table-meta = { workspace = true } diff --git a/src/query/storages/parquet/src/lib.rs b/src/query/storages/parquet/src/lib.rs index 6807e9508473..a7358ae8485b 100644 --- a/src/query/storages/parquet/src/lib.rs +++ b/src/query/storages/parquet/src/lib.rs @@ -30,15 +30,5 @@ mod utils; pub use parquet_part::ParquetFilesPart; pub use parquet_part::ParquetPart; -pub use parquet_rs::transform_record_batch; -pub use parquet_rs::InMemoryRowGroup; -pub use parquet_rs::ParquetFileReader; -pub use parquet_rs::ParquetRSFullReader; -pub use parquet_rs::ParquetRSPruner; -pub use parquet_rs::ParquetRSReaderBuilder; -pub use parquet_rs::ParquetRSRowGroupPart; -pub use parquet_rs::ParquetRSRowGroupReader; -pub use parquet_rs::ParquetRSTable; -pub use parquet_rs::ParquetSource; -pub use parquet_rs::ParquetTableForCopy; +pub use parquet_rs::*; pub use read_settings::ReadSettings; diff --git a/src/query/storages/parquet/src/parquet_rs/meta.rs b/src/query/storages/parquet/src/parquet_rs/meta.rs index e2780c97a0f6..da9231c1cb48 100644 --- a/src/query/storages/parquet/src/parquet_rs/meta.rs +++ b/src/query/storages/parquet/src/parquet_rs/meta.rs @@ -21,6 +21,10 @@ use databend_common_catalog::plan::FullParquetMeta; use databend_common_exception::ErrorCode; use databend_common_exception::Result; use databend_common_expression::TableField; +use databend_storages_common_cache::CacheManager; +use databend_storages_common_cache::InMemoryItemCacheReader; +use databend_storages_common_cache::LoadParams; +use databend_storages_common_cache::Loader; use opendal::Operator; use parquet::file::metadata::ParquetMetaData; use parquet::schema::types::SchemaDescPtr; @@ -28,6 +32,21 @@ use parquet::schema::types::SchemaDescriptor; use crate::parquet_rs::statistics::collect_row_group_stats; +pub async fn read_metadata_async_cached( + path: &str, + operator: &Operator, + file_size: Option, +) -> Result> { + let reader = MetaReader::meta_data_reader(operator.clone()); + let load_params = LoadParams { + location: path.to_owned(), + len_hint: file_size, + ver: 0, + put_cache: true, + }; + reader.read(&load_params).await +} + #[async_backtrace::framed] pub async fn read_metas_in_parallel( op: &Operator, @@ -153,15 +172,14 @@ async fn load_and_check_parquet_meta( expect: &SchemaDescriptor, schema_from: &str, ) -> Result> { - let metadata = - databend_common_storage::parquet_rs::read_metadata_async(file, &op, Some(size)).await?; + let metadata = read_metadata_async_cached(file, &op, Some(size)).await?; check_parquet_schema( expect, metadata.file_metadata().schema_descr(), file, schema_from, )?; - Ok(Arc::new(metadata)) + Ok(metadata) } pub async fn read_parquet_metas_batch( @@ -200,10 +218,7 @@ pub async fn read_parquet_metas_batch_for_copy( ) -> Result>> { let mut metas = Vec::with_capacity(file_infos.len()); for (location, size) in file_infos { - let meta = Arc::new( - 
databend_common_storage::parquet_rs::read_metadata_async(&location, &op, Some(size)) - .await?, - ); + let meta = read_metadata_async_cached(&location, &op, Some(size)).await?; if unlikely(meta.file_metadata().num_rows() == 0) { // Don't collect empty files continue; @@ -230,3 +245,33 @@ fn check_memory_usage(max_memory_usage: u64) -> Result<()> { } Ok(()) } + +pub struct LoaderWrapper(T); +pub type ParquetMetaReader = InMemoryItemCacheReader>; + +pub struct MetaReader; +impl MetaReader { + pub fn meta_data_reader(dal: Operator) -> ParquetMetaReader { + ParquetMetaReader::new( + CacheManager::instance().get_parquet_meta_data_cache(), + LoaderWrapper(dal), + ) + } +} + +#[async_trait::async_trait] +impl Loader for LoaderWrapper { + #[async_backtrace::framed] + async fn load(&self, params: &LoadParams) -> Result { + let size = match params.len_hint { + Some(v) => v, + None => self.0.stat(¶ms.location).await?.content_length(), + }; + databend_common_storage::parquet_rs::read_metadata_async( + ¶ms.location, + &self.0, + Some(size), + ) + .await + } +} diff --git a/src/query/storages/parquet/src/parquet_rs/mod.rs b/src/query/storages/parquet/src/parquet_rs/mod.rs index 459024759223..97e5f259c591 100644 --- a/src/query/storages/parquet/src/parquet_rs/mod.rs +++ b/src/query/storages/parquet/src/parquet_rs/mod.rs @@ -24,6 +24,8 @@ mod meta; mod schema; pub use copy_into_table::ParquetTableForCopy; +pub use meta::read_metadata_async_cached; +pub use meta::read_metas_in_parallel; pub use meta::read_metas_in_parallel_for_copy; pub use meta::read_parquet_metas_batch; pub use parquet_reader::transform_record_batch; @@ -36,3 +38,5 @@ pub use parquet_table::ParquetRSTable; pub use partition::ParquetRSRowGroupPart; pub use pruning::ParquetRSPruner; pub use source::ParquetSource; +pub use statistics::collect_row_group_stats; +pub use statistics::collect_single_row_group_stats; diff --git a/src/query/storages/parquet/src/parquet_rs/pruning.rs b/src/query/storages/parquet/src/parquet_rs/pruning.rs index ec00657e303b..284091287e48 100644 --- a/src/query/storages/parquet/src/parquet_rs/pruning.rs +++ b/src/query/storages/parquet/src/parquet_rs/pruning.rs @@ -85,8 +85,6 @@ impl ParquetRSPruner { partition_columns .iter() .position(|c| c.eq_ignore_ascii_case(&name)) - .unwrap(); - None }) }) .collect::>(); diff --git a/src/query/storages/parquet/src/parquet_rs/statistics/mod.rs b/src/query/storages/parquet/src/parquet_rs/statistics/mod.rs index 6d1678a2032e..9fa2d78fefc3 100644 --- a/src/query/storages/parquet/src/parquet_rs/statistics/mod.rs +++ b/src/query/storages/parquet/src/parquet_rs/statistics/mod.rs @@ -19,3 +19,4 @@ mod utils; pub use page::convert_index_to_column_statistics; pub use row_group::collect_row_group_stats; +pub use row_group::collect_single_row_group_stats; diff --git a/src/query/storages/parquet/src/parquet_rs/statistics/row_group.rs b/src/query/storages/parquet/src/parquet_rs/statistics/row_group.rs index f1bac90e2a35..3199627cf9cd 100644 --- a/src/query/storages/parquet/src/parquet_rs/statistics/row_group.rs +++ b/src/query/storages/parquet/src/parquet_rs/statistics/row_group.rs @@ -41,35 +41,41 @@ pub fn collect_row_group_stats( let mut stats = Vec::with_capacity(rgs.len()); for rg in rgs { assert_eq!(rg.num_columns(), leaf_fields.len()); - let mut stats_of_columns = HashMap::with_capacity(rg.columns().len()); + let stats_of_columns = collect_single_row_group_stats(rg, leaf_fields, columns)?; + stats.push(stats_of_columns); + } + Some(stats) +} - // Each row_group_stat is a `HashMap` 
holding key-value pairs. - // The first element of the pair is the offset in the schema, - // and the second element is the statistics of the column (according to the offset) - if let Some(columns) = columns { - for col_idx in columns.iter() { - let column = rg.column(*col_idx); - let field = &leaf_fields[*col_idx]; - let column_stats = column.statistics().unwrap(); - stats_of_columns.insert( - *col_idx as u32, - convert_column_statistics(column_stats, &field.data_type().remove_nullable())?, - ); - } - } else { - for (col_idx, (column, field)) in - rg.columns().iter().zip(leaf_fields.iter()).enumerate() - { - let column_stats = column.statistics().unwrap(); - stats_of_columns.insert( - col_idx as u32, - convert_column_statistics(column_stats, &field.data_type().remove_nullable())?, - ); - } +/// Note the keys of result is not column id but column offset in schema +pub fn collect_single_row_group_stats( + rg: &RowGroupMetaData, + leaf_fields: &[TableField], + columns: Option<&[usize]>, +) -> Option { + let mut stats_of_columns = HashMap::with_capacity(rg.columns().len()); + // Each row_group_stat is a `HashMap` holding key-value pairs. + // The first element of the pair is the offset in the schema, + // and the second element is the statistics of the column (according to the offset) + if let Some(columns) = columns { + for col_idx in columns.iter() { + let column = rg.column(*col_idx); + let field = &leaf_fields[*col_idx]; + let column_stats = column.statistics().unwrap(); + stats_of_columns.insert( + *col_idx as u32, + convert_column_statistics(column_stats, &field.data_type().remove_nullable())?, + ); + } + } else { + for (idx, (column, field)) in rg.columns().iter().zip(leaf_fields.iter()).enumerate() { + let column_stats = column.statistics().unwrap(); + stats_of_columns.insert( + idx as u32, + convert_column_statistics(column_stats, &field.data_type().remove_nullable())?, + ); } - - stats.push(stats_of_columns); } - Some(stats) + Some(stats_of_columns) } diff --git a/src/query/storages/system/src/caches_table.rs b/src/query/storages/system/src/caches_table.rs index ead0282a6aef..04c411675dfa 100644 --- a/src/query/storages/system/src/caches_table.rs +++ b/src/query/storages/system/src/caches_table.rs @@ -80,7 +80,7 @@ impl SyncSystemTable for CachesTable { let inverted_index_meta_cache = cache_manager.get_inverted_index_meta_cache(); let inverted_index_file_cache = cache_manager.get_inverted_index_file_cache(); let prune_partitions_cache = cache_manager.get_prune_partitions_cache(); - let file_meta_data_cache = cache_manager.get_file_meta_data_cache(); + let parquet_meta_data_cache = cache_manager.get_parquet_meta_data_cache(); let table_data_cache = cache_manager.get_table_data_cache(); let table_column_array_cache = cache_manager.get_table_data_array_cache(); @@ -121,8 +121,8 @@ impl SyncSystemTable for CachesTable { Self::append_row(&prune_partitions_cache, &local_node, &mut columns); } - if let Some(file_meta_data_cache) = file_meta_data_cache { - Self::append_row(&file_meta_data_cache, &local_node, &mut columns); + if let Some(parquet_meta_data_cache) = parquet_meta_data_cache { + Self::append_row(&parquet_meta_data_cache, &local_node, &mut columns); } if let Some(cache) = table_data_cache { diff --git a/tests/sqllogictests/scripts/prepare_iceberg_tpch_data.py b/tests/sqllogictests/scripts/prepare_iceberg_tpch_data.py index 8b9fc86d6b41..4f4462b76178 100644 --- a/tests/sqllogictests/scripts/prepare_iceberg_tpch_data.py +++ b/tests/sqllogictests/scripts/prepare_iceberg_tpch_data.py 
@@ -1,132 +1,159 @@ from pyspark.sql import SparkSession -from pyspark.sql.types import StructType, StructField, IntegerType, DoubleType, StringType, DateType, DecimalType +from pyspark.sql.types import ( + StructType, + StructField, + IntegerType, + DoubleType, + StringType, + DateType, + DecimalType, +) data_path = "tests/sqllogictests/data/tests/suites/0_stateless/13_tpch/data" -spark = SparkSession.builder \ - .appName("CSV to Iceberg REST Catalog") \ - .config("spark.sql.catalog.iceberg", "org.apache.iceberg.spark.SparkCatalog") \ - .config("spark.sql.catalog.iceberg.type", "rest") \ - .config("spark.sql.catalog.iceberg.uri", "http://127.0.0.1:8181") \ - .config("spark.sql.catalog.iceberg.io-impl", "org.apache.iceberg.aws.s3.S3FileIO") \ - .config("spark.sql.catalog.iceberg.warehouse", "s3://iceberg-tpch/") \ - .config("spark.sql.catalog.iceberg.s3.access-key-id", "admin") \ - .config("spark.sql.catalog.iceberg.s3.secret-access-key", "password") \ - .config("spark.sql.catalog.iceberg.s3.path-style-access", "true") \ - .config("spark.sql.catalog.iceberg.s3.endpoint", "http://127.0.0.1:9000") \ - .config("spark.sql.catalog.iceberg.client.region", "us-east-1") \ - .config("spark.jars.packages", - "org.apache.iceberg:iceberg-aws-bundle:1.6.1,org.apache.iceberg:iceberg-spark-runtime-3.5_2.12:1.6.1") \ +spark = ( + SparkSession.builder.appName("CSV to Iceberg REST Catalog") + .config("spark.sql.catalog.iceberg", "org.apache.iceberg.spark.SparkCatalog") + .config("spark.sql.catalog.iceberg.type", "rest") + .config("spark.sql.catalog.iceberg.uri", "http://127.0.0.1:8181") + .config("spark.sql.catalog.iceberg.io-impl", "org.apache.iceberg.aws.s3.S3FileIO") + .config("spark.sql.catalog.iceberg.warehouse", "s3://iceberg-tpch/") + .config("spark.sql.catalog.iceberg.s3.access-key-id", "admin") + .config("spark.sql.catalog.iceberg.s3.secret-access-key", "password") + .config("spark.sql.catalog.iceberg.s3.path-style-access", "true") + .config("spark.sql.catalog.iceberg.s3.endpoint", "http://127.0.0.1:9000") + .config("spark.sql.catalog.iceberg.client.region", "us-east-1") + .config( + "spark.jars.packages", + "org.apache.iceberg:iceberg-aws-bundle:1.6.1,org.apache.iceberg:iceberg-spark-runtime-3.5_2.12:1.6.1", + ) .getOrCreate() +) tables = { "lineitem": ( - StructType([ - StructField("l_orderkey", IntegerType(), True), - StructField("l_partkey", IntegerType(), True), - StructField("l_suppkey", IntegerType(), True), - StructField("l_linenumber", IntegerType(), True), - StructField("l_quantity", DecimalType(15, 2), True), - StructField("l_extendedprice", DecimalType(15, 2), True), - StructField("l_discount", DecimalType(15, 2), True), - StructField("l_tax", DecimalType(15, 2), True), - StructField("l_returnflag", StringType(), True), - StructField("l_linestatus", StringType(), True), - StructField("l_shipdate", DateType(), True), - StructField("l_commitdate", DateType(), True), - StructField("l_receiptdate", DateType(), True), - StructField("l_shipinstruct", StringType(), True), - StructField("l_shipmode", StringType(), True), - StructField("l_comment", StringType(), True) - ]), - f"{data_path}/lineitem.tbl" + StructType( + [ + StructField("l_orderkey", IntegerType(), True), + StructField("l_partkey", IntegerType(), True), + StructField("l_suppkey", IntegerType(), True), + StructField("l_linenumber", IntegerType(), True), + StructField("l_quantity", DecimalType(15, 2), True), + StructField("l_extendedprice", DecimalType(15, 2), True), + StructField("l_discount", DecimalType(15, 2), True), + 
StructField("l_tax", DecimalType(15, 2), True), + StructField("l_returnflag", StringType(), True), + StructField("l_linestatus", StringType(), True), + StructField("l_shipdate", DateType(), True), + StructField("l_commitdate", DateType(), True), + StructField("l_receiptdate", DateType(), True), + StructField("l_shipinstruct", StringType(), True), + StructField("l_shipmode", StringType(), True), + StructField("l_comment", StringType(), True), + ] + ), + f"{data_path}/lineitem.tbl", ), "orders": ( - StructType([ - StructField("o_orderkey", IntegerType(), True), - StructField("o_custkey", IntegerType(), True), - StructField("o_orderstatus", StringType(), True), - StructField("o_totalprice", DecimalType(15, 2), True), - StructField("o_orderdate", DateType(), True), - StructField("o_orderpriority", StringType(), True), - StructField("o_clerk", StringType(), True), - StructField("o_shippriority", IntegerType(), True), - StructField("o_comment", StringType(), True) - ]), - f"{data_path}/orders.tbl" + StructType( + [ + StructField("o_orderkey", IntegerType(), True), + StructField("o_custkey", IntegerType(), True), + StructField("o_orderstatus", StringType(), True), + StructField("o_totalprice", DecimalType(15, 2), True), + StructField("o_orderdate", DateType(), True), + StructField("o_orderpriority", StringType(), True), + StructField("o_clerk", StringType(), True), + StructField("o_shippriority", IntegerType(), True), + StructField("o_comment", StringType(), True), + ] + ), + f"{data_path}/orders.tbl", ), "customer": ( - StructType([ - StructField("c_custkey", IntegerType(), True), - StructField("c_name", StringType(), True), - StructField("c_address", StringType(), True), - StructField("c_nationkey", IntegerType(), True), - StructField("c_phone", StringType(), True), - StructField("c_acctbal", DecimalType(15, 2), True), - StructField("c_mktsegment", StringType(), True), - StructField("c_comment", StringType(), True) - ]), - f"{data_path}/customer.tbl" + StructType( + [ + StructField("c_custkey", IntegerType(), True), + StructField("c_name", StringType(), True), + StructField("c_address", StringType(), True), + StructField("c_nationkey", IntegerType(), True), + StructField("c_phone", StringType(), True), + StructField("c_acctbal", DecimalType(15, 2), True), + StructField("c_mktsegment", StringType(), True), + StructField("c_comment", StringType(), True), + ] + ), + f"{data_path}/customer.tbl", ), "nation": ( - StructType([ - StructField("n_nationkey", IntegerType(), True), - StructField("n_name", StringType(), True), - StructField("n_regionkey", IntegerType(), True), - StructField("n_comment", StringType(), True) - ]), - f"{data_path}/nation.tbl" + StructType( + [ + StructField("n_nationkey", IntegerType(), True), + StructField("n_name", StringType(), True), + StructField("n_regionkey", IntegerType(), True), + StructField("n_comment", StringType(), True), + ] + ), + f"{data_path}/nation.tbl", ), "region": ( - StructType([ - StructField("r_regionkey", IntegerType(), True), - StructField("r_name", StringType(), True), - StructField("r_comment", StringType(), True) - ]), - f"{data_path}/region.tbl" + StructType( + [ + StructField("r_regionkey", IntegerType(), True), + StructField("r_name", StringType(), True), + StructField("r_comment", StringType(), True), + ] + ), + f"{data_path}/region.tbl", ), "part": ( - StructType([ - StructField("p_partkey", IntegerType(), True), - StructField("p_name", StringType(), True), - StructField("p_mfgr", StringType(), True), - StructField("p_brand", StringType(), 
True), - StructField("p_type", StringType(), True), - StructField("p_size", IntegerType(), True), - StructField("p_container", StringType(), True), - StructField("p_retailprice", DecimalType(15, 2), True), - StructField("p_comment", StringType(), True) - ]), - f"{data_path}/part.tbl" + StructType( + [ + StructField("p_partkey", IntegerType(), True), + StructField("p_name", StringType(), True), + StructField("p_mfgr", StringType(), True), + StructField("p_brand", StringType(), True), + StructField("p_type", StringType(), True), + StructField("p_size", IntegerType(), True), + StructField("p_container", StringType(), True), + StructField("p_retailprice", DecimalType(15, 2), True), + StructField("p_comment", StringType(), True), + ] + ), + f"{data_path}/part.tbl", ), "supplier": ( - StructType([ - StructField("s_suppkey", IntegerType(), True), - StructField("s_name", StringType(), True), - StructField("s_address", StringType(), True), - StructField("s_nationkey", IntegerType(), True), - StructField("s_phone", StringType(), True), - StructField("s_acctbal", DecimalType(15, 2), True), - StructField("s_comment", StringType(), True) - ]), - f"{data_path}/supplier.tbl" + StructType( + [ + StructField("s_suppkey", IntegerType(), True), + StructField("s_name", StringType(), True), + StructField("s_address", StringType(), True), + StructField("s_nationkey", IntegerType(), True), + StructField("s_phone", StringType(), True), + StructField("s_acctbal", DecimalType(15, 2), True), + StructField("s_comment", StringType(), True), + ] + ), + f"{data_path}/supplier.tbl", ), "partsupp": ( - StructType([ - StructField("ps_partkey", IntegerType(), True), - StructField("ps_suppkey", IntegerType(), True), - StructField("ps_availqty", IntegerType(), True), - StructField("ps_supplycost", DecimalType(15, 2), True), - StructField("ps_comment", StringType(), True) - ]), - f"{data_path}/partsupp.tbl" - ) + StructType( + [ + StructField("ps_partkey", IntegerType(), True), + StructField("ps_suppkey", IntegerType(), True), + StructField("ps_availqty", IntegerType(), True), + StructField("ps_supplycost", DecimalType(15, 2), True), + StructField("ps_comment", StringType(), True), + ] + ), + f"{data_path}/partsupp.tbl", + ), } for table_name, (schema, file_path) in tables.items(): full_table_name = f"iceberg.tpch.{table_name}" - #spark.sql(f"DROP TABLE IF EXISTS {full_table_name}") + # spark.sql(f"DROP TABLE IF EXISTS {full_table_name}") create_table = f""" CREATE OR REPLACE TABLE {full_table_name} ( diff --git a/tests/suites/1_stateful/12_delta/11_0000_delta_engine_partitioned.result b/tests/suites/1_stateful/12_delta/11_0000_delta_engine_partitioned.result index ca587b0b1f44..167a7a804c2e 100755 --- a/tests/suites/1_stateful/12_delta/11_0000_delta_engine_partitioned.result +++ b/tests/suites/1_stateful/12_delta/11_0000_delta_engine_partitioned.result @@ -33,6 +33,9 @@ 34 44 <<<< +>>>> select count() from test_delta where p0 = 10 and p2 = 12; +2 +<<<< >>>> select c5, p4 from test_delta where c1 - p0 = 11 order by c5; 25 24 <<<< diff --git a/tests/suites/1_stateful/12_delta/11_0000_delta_engine_partitioned.sh b/tests/suites/1_stateful/12_delta/11_0000_delta_engine_partitioned.sh index 91f54f927a16..08cd14f64da2 100755 --- a/tests/suites/1_stateful/12_delta/11_0000_delta_engine_partitioned.sh +++ b/tests/suites/1_stateful/12_delta/11_0000_delta_engine_partitioned.sh @@ -20,6 +20,8 @@ query "select p4 from test_delta where p2 = 12 order by p4;" query "select c1 from test_delta where p4 > 20 order by c1;" query "select p4 from 
test_delta where c1 > 20 order by p4;"
+## check that a filter on partition columns only (p0 and p2) is answered correctly
+query "select count() from test_delta where p0 = 10 and p2 = 12;"
 query "select c5, p4 from test_delta where c1 - p0 = 11 order by c5;"
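
Reviewer note: the partition handling above hinges on parsing Hive-style directory names such as p0=10/p2=12 into typed values, which is what the new HiveFetchPartitionScalars::eval in utils.rs does before handing Scalars to the pruner and to HivePartitionFiller. The following is a minimal, standalone Rust sketch of that idea only; PartitionValue, parse_partition_path and the (&str, bool) field descriptors are hypothetical names for illustration, whereas the real code converts values with str_field_to_scalar into Databend Scalars and reports ErrorCode variants.

// Standalone illustration; not Databend's API.
#[derive(Debug, PartialEq)]
enum PartitionValue {
    Int(i64),
    Str(String),
}

// `partition_fields` pairs each expected partition key with "is this an integer column?".
fn parse_partition_path(
    path: &str,
    partition_fields: &[(&str, bool)],
) -> Result<Vec<PartitionValue>, String> {
    let mut values = Vec::new();
    for (segment, (name, is_int)) in path.split('/').zip(partition_fields) {
        // Each directory segment is expected to look like `key=value`.
        let (key, raw) = segment
            .split_once('=')
            .ok_or_else(|| format!("segment `{segment}` is not `key=value`"))?;
        if key != *name {
            return Err(format!("expected partition key `{name}`, got `{key}`"));
        }
        values.push(if *is_int {
            PartitionValue::Int(raw.parse().map_err(|e| format!("bad int `{raw}`: {e}"))?)
        } else {
            PartitionValue::Str(raw.to_string())
        });
    }
    // Mirror the final count check done by HiveFetchPartitionScalars::eval.
    if values.len() != partition_fields.len() {
        return Err(format!(
            "partition values mismatch, expect {}, got {}",
            partition_fields.len(),
            values.len()
        ));
    }
    Ok(values)
}

fn main() {
    // Matches the layout exercised by the new test query
    // `select count() from test_delta where p0 = 10 and p2 = 12`.
    let fields = [("p0", true), ("p2", true)];
    let parsed = parse_partition_path("p0=10/p2=12", &fields).unwrap();
    assert_eq!(parsed, vec![PartitionValue::Int(10), PartitionValue::Int(12)]);
    println!("{parsed:?}");
}

One behavioural difference worth noting: this sketch rejects a malformed segment immediately, while the actual eval in utils.rs skips segments that do not split into exactly two parts on '=' and only errors afterwards if the number of recovered values does not match the number of partition fields.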