ray-project · hongchaodeng · Oct 17, 2024
@@ -4,7 +4,6 @@
 import pandas as pd
 
 from ray.data import Dataset
-from ray.data._internal.aggregate import Max, Min
 from ray.data.preprocessor import Preprocessor
 from ray.util.annotations import PublicAPI
 
@@ -256,18 +255,41 @@ def __init__(
     def _fit(self, dataset: Dataset) -> Preprocessor:
         self._validate_on_fit()
         stats = {}
-        aggregates = []
         if isinstance(self.bins, dict):
             columns = self.bins.keys()
         else:
             columns = self.columns
-
         for column in columns:
-            aggregates.extend(
-                self._fit_uniform_covert_bin_to_aggregate_if_needed(column)
+            bins = self.bins[column] if isinstance(self.bins, dict) else self.bins
+            if not isinstance(bins, int):
+                raise TypeError(
+                    f"`bins` must be an integer or a dict of integers, got {bins}"
+                )
+
+        def _compute_stats_batch(batch: pd.DataFrame):
+            # Per-column [min, max] of the signed values (no `.abs()`, which
+            # reports the wrong range for columns with negatives). Re-applying
+            # this to the stacked per-batch rows gives the global extrema.
+            return pd.DataFrame(
+                {col: [batch[col].min(), batch[col].max()] for col in columns}
+            )
+
+        # Apply map_batches with pandas format
+        batch_results = dataset.map_batches(
+            _compute_stats_batch, batch_format="pandas"
+        ).to_pandas()
+        # Aggregate results in driver
+        aggregated_results = _compute_stats_batch(batch_results).to_dict()
+
+        aggregate_stats = {
+            key: value
+            for col, row in aggregated_results.items()
+            for key, value in (
+                (f"min({col})", row[0]),
+                (f"max({col})", row[1]),
             )
+        }
 
-        aggregate_stats = dataset.aggregate(*aggregates)
         mins = {}
         maxes = {}
         for key, value in aggregate_stats.items():
@@ -289,15 +311,6 @@ def _fit(self, dataset: Dataset) -> Preprocessor:
     def _validate_on_fit(self):
         self._validate_bins_columns()
 
-    def _fit_uniform_covert_bin_to_aggregate_if_needed(self, column: str):
-        bins = self.bins[column] if isinstance(self.bins, dict) else self.bins
-        if isinstance(bins, int):
-            return (Min(column), Max(column))
-        else:
-            raise TypeError(
-                f"`bins` must be an integer or a dict of integers, got {bins}"
-            )
-
 
 # Copied from
 # https://github.com/pandas-dev/pandas/blob/v1.4.4/pandas/core/reshape/tile.py#L257

@@ -4,7 +4,7 @@
 import pandas as pd
 
 from ray.data import Dataset
-from ray.data._internal.aggregate import AbsMax, Max, Mean, Min, Std
+from ray.data._internal.aggregate import Mean, Std
 from ray.data.preprocessor import Preprocessor
 from ray.util.annotations import PublicAPI
 
@@ -152,8 +152,30 @@ def __init__(self, columns: List[str]):
         self.columns = columns
 
     def _fit(self, dataset: Dataset) -> Preprocessor:
-        aggregates = [Agg(col) for Agg in [Min, Max] for col in self.columns]
-        self.stats_ = dataset.aggregate(*aggregates)
+        """Store each column's signed min and max in ``self.stats_``.
+
+        Keys are ``min(<col>)`` and ``max(<col>)``; returns ``self``.
+        """
+        columns = self.columns
+
+        def _compute_stats_batch(batch: pd.DataFrame):
+            # Rows are [min, max] of the *signed* values; applying `.abs()`
+            # first would turn a column like [-3, 1] into the range 1..3.
+            return pd.DataFrame(
+                {col: [batch[col].min(), batch[col].max()] for col in columns}
+            )
+
+        # Reduce each batch to its per-column extrema inside map_batches.
+        batch_results = dataset.map_batches(
+            _compute_stats_batch, batch_format="pandas"
+        ).to_pandas()
+        # min/max are associative, so the same reducer merges the batch rows.
+        aggregated_results = _compute_stats_batch(batch_results).to_dict()
+        stats = {}
+        for col, row in aggregated_results.items():
+            stats[f"min({col})"] = row[0]
+            stats[f"max({col})"] = row[1]
+        self.stats_ = stats
         return self
 
     def _transform_pandas(self, df: pd.DataFrame):
@@ -232,8 +254,20 @@ def __init__(self, columns: List[str]):
         self.columns = columns
 
     def _fit(self, dataset: Dataset) -> Preprocessor:
-        aggregates = [AbsMax(col) for col in self.columns]
-        self.stats_ = dataset.aggregate(*aggregates)
+        """Store each column's largest absolute value in ``self.stats_``."""
+        cols = self.columns
+
+        def _compute_stats_batch(frame: pd.DataFrame):
+            return pd.DataFrame({name: [frame[name].abs().max()] for name in cols})
+
+        # Reduce every batch to a one-row frame, then gather those partials.
+        partials = dataset.map_batches(_compute_stats_batch, batch_format="pandas")
+        gathered = partials.to_pandas()
+        # max(|x|) is associative, so the same reducer merges the partials.
+        merged = _compute_stats_batch(gathered).to_dict()
+        self.stats_ = {
+            f"abs_max({name})": per_row[0] for name, per_row in merged.items()
+        }
         return self
 
     def _transform_pandas(self, df: pd.DataFrame):