I am using Ray Tune on a Google Colab TPU large instance. With a small dataset it runs fine, but with my full-size dataset (300 MB) I get the fatal error pasted below. The data is stored as a Parquet file on Google Drive and makes its way into tune.run via a functools.partial of my training function and a Pandas dataframe.
I cannot tell whether there is some setting I can configure to allow a larger "message" in the grpc module.

Python 3.7
ray[tune] 1.9.0.

Any insight is appreciated.
Thanks,

/usr/local/lib/python3.7/dist-packages/ray/tune/tune.py in run(run_or_experiment, name, metric, mode, stop, time_budget_s, config, resources_per_trial, num_samples, local_dir, search_alg, scheduler, keep_checkpoints_num, checkpoint_score_attr, checkpoint_freq, checkpoint_at_end, verbose, progress_reporter, log_to_file, trial_name_creator, trial_dirname_creator, sync_config, export_formats, max_failures, fail_fast, restore, server_port, resume, reuse_actors, trial_executor, raise_on_failed_trial, callbacks, max_concurrent_trials, queue_trials, loggers, _remote)
    442                 export_formats=export_formats,
    443                 max_failures=max_failures,
--> 444                 restore=restore)
    445     else:
    446         logger.debug("Ignoring some parameters passed into tune.run.")
/usr/local/lib/python3.7/dist-packages/ray/tune/experiment.py in __init__(self, name, run, stop, time_budget_s, config, resources_per_trial, num_samples, local_dir, sync_config, trial_name_creator, trial_dirname_creator, log_to_file, checkpoint_freq, checkpoint_at_end, keep_checkpoints_num, checkpoint_score_attr, export_formats, max_failures, restore)
    111                     "checkpointable function. You can specify checkpoints "
    112                     "within your trainable function.")
--> 113         self._run_identifier = Experiment.register_if_needed(run)
    114         self.name = name or self._run_identifier
/usr/local/lib/python3.7/dist-packages/ray/tune/experiment.py in register_if_needed(cls, run_object)
    256                     "No name detected on trainable. Using {}.".format(name))
    257             try:
--> 258                 register_trainable(name, run_object)
    259             except (TypeError, PicklingError) as e:
    260                 extra_msg = ("Other options: "
/usr/local/lib/python3.7/dist-packages/ray/tune/registry.py in register_trainable(name, trainable, warn)
     74         raise TypeError("Second argument must be convertable to Trainable",
     75                         trainable)
---> 76     _global_registry.register(TRAINABLE_CLASS, name, trainable)
/usr/local/lib/python3.7/dist-packages/ray/tune/registry.py in register(self, category, key, value)
    150         self._to_flush[(category, key)] = pickle.dumps_debug(value)
    151         if _internal_kv_initialized():
--> 152             self.flush_values()
    154     def contains(self, category, key):
/usr/local/lib/python3.7/dist-packages/ray/tune/registry.py in flush_values(self)
    173         for (category, key), value in self._to_flush.items():
    174             _internal_kv_put(
--> 175                 _make_key(self._prefix, category, key), value, overwrite=True)
    176         self._to_flush.clear()
/usr/local/lib/python3.7/dist-packages/ray/_private/client_mode_hook.py in wrapper(*args, **kwargs)
    103             if func.__name__ != "init" or is_client_mode_enabled_by_default:
    104                 return getattr(ray, func.__name__)(*args, **kwargs)
--> 105         return func(*args, **kwargs)
    107     return wrapper
/usr/local/lib/python3.7/dist-packages/ray/experimental/internal_kv.py in _internal_kv_put(key, value, overwrite, namespace)
     77         overwrite, bool)
     78     return global_gcs_client.internal_kv_put(key, value, overwrite,
---> 79                                              namespace) == 0
/usr/local/lib/python3.7/dist-packages/ray/_private/gcs_utils.py in wrapper(self, *args, **kwargs)
    128         while True:
    129             try:
--> 130                 return f(self, *args, **kwargs)
    131             except grpc.RpcError as e:
    132                 if remaining_retry <= 0:
/usr/local/lib/python3.7/dist-packages/ray/_private/gcs_utils.py in internal_kv_put(self, key, value, overwrite, namespace)
    247         req = gcs_service_pb2.InternalKVPutRequest(
    248             key=key, value=value, overwrite=overwrite)
--> 249         reply = self._kv_stub.InternalKVPut(req)
    250         if reply.status.code == GcsCode.OK:
    251             return reply.added_num
/usr/local/lib/python3.7/dist-packages/grpc/_channel.py in __call__(self, request, timeout, metadata, credentials, wait_for_ready, compression)
    944         state, call, = self._blocking(request, timeout, metadata, credentials,
    945                                       wait_for_ready, compression)
--> 946         return _end_unary_response_blocking(state, call, False, None)
    948     def with_call(self,
/usr/local/lib/python3.7/dist-packages/grpc/_channel.py in _end_unary_response_blocking(state, call, with_call, deadline)
    847             return state.response
    848     else:
--> 849         raise _InactiveRpcError(state)
_InactiveRpcError: <_InactiveRpcError of RPC that terminated with:
	status = StatusCode.RESOURCE_EXHAUSTED
	details = "Received message larger than max (355752448 vs. 104857600)"
	debug_error_string = "{"created":"@1638857019.251890494","description":"Error received from peer ipv4:172.28.0.2:42493","file":"src/core/lib/surface/call.cc","file_line":1063,"grpc_message":"Received message larger than max (355752448 vs. 104857600)","grpc_status":8}"
              

Hi @tenderfoot, it’s hard to tell what is happening without a code example. However, since you mentioned that you’re partial'ing your data into the trainable, maybe try using tune.with_parameters instead: Training (tune.Trainable, tune.report) — Ray v1.9.0

tune.with_parameters is basically the equivalent of functools.partial, with the difference that the arguments are persisted in the Ray object store. This is beneficial for datasets, as only one copy is kept in the object store. With partial, the data gets serialized into the pickled training function (and potentially many times over), which is what leads to the resource exhaustion you are seeing.
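
For illustration, a minimal sketch of the difference (the parquet path, trainable body, and metric are placeholders):

import functools
import pandas as pd
from ray import tune

df = pd.read_parquet("my_data.parquet")  # placeholder path

def train_fn(config, data=None):
    # ... fit a model on `data` using the hyperparameters in `config` ...
    tune.report(num_rows=len(data))  # dummy metric just to have something to report

# Instead of tune.run(functools.partial(train_fn, data=df), ...), which pickles
# `df` into the registered trainable and can exceed the gRPC message limit:
analysis = tune.run(
    tune.with_parameters(train_fn, data=df),   # `df` is put into the object store once
    config={"lr": tune.loguniform(1e-4, 1e-1)},
)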

Let me know if that helps!

Hi there, I have tried passing my dataset reference via tune.with_parameters instead [Training (tune.Trainable, tune.report) — Ray v1.9.0]. However, the program gets stuck with no output.
The "htop" output shows the virtual memory usage of the process is about 80 GB. I am using Ray Tune for knowledge distillation with Hugging Face Transformers; the Transformers pipeline has to produce the features before training starts, and in my case those features are very large.

Do you have any ideas? Thank you.

Hi, I get the same error when using Ray Tune. My code is:

from ray import tune
from ray.tune.suggest.bayesopt import BayesOptSearch
from lightgbm_ray import train, RayParams  # assumed: train() and RayParams come from lightgbm-ray

def train_model(config):
    train_set = ray_lgb_build_dataset(train_data)  # user-defined helper and data
    val_set = ray_lgb_build_dataset(val_data)
    evals_result = {}
    bst = train(
        params=config,
        dtrain=train_set,
        evals_result=evals_result,
        valid_sets=[val_set],
        valid_names=['val'],
        ray_params=ray_params,  # RayParams instance defined elsewhere in the script
    )

bayesopt = BayesOptSearch(metric='l2', mode='min')
hy_space = {
    'max_iter': tune.randint(64, 128),
    'max_depth': tune.randint(4, 9),
}
analysis = tune.run(
    tune.with_parameters(train_model),
    config=hy_space,
    search_alg=bayesopt,
    resources_per_trial=ray_params.get_tune_resources(),
)
python 3.8.5
ray 1.9.1

When I use Ray to train a model with default params it works, but when I tune I get this error. My data has more than a million rows.

Hi all,

I get the same error on a data bricks cluster with the following config

worker type: r5.24xlarge , 96 cores, 786 gb
driver type: r5.24xlarge , 96 cores, 786 gb
max worker : 4

I am running the following code snippet:

# from tune_sklearn import TuneSearchCV
import sys
sys.stdout.fileno = lambda: False  # workaround for notebook environments where stdout has no fileno
# Other imports
import numpy as np
import scipy
import sklearn.metrics
import lightgbm as lgb
from sklearn.datasets import make_classification, load_breast_cancer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import SGDClassifier
from ray import tune
from ray.tune.integration.lightgbm import TuneReportCheckpointCallback
from ray.tune.integration.mlflow import MLflowLoggerCallback
from lightgbm_ray import RayParams  # assumed source of RayParams

X, y = make_classification(n_samples=600000, n_features=120, n_redundant=0, n_classes=2, class_sep=2.5)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.6, random_state=123)
X_test, X_val, y_test, y_val \
    = train_test_split(X_test, y_test, test_size=0.5, random_state=123)

def train_breast_cancer(config):
    train_set = lgb.Dataset(X_train, label=y_train)
    test_set = lgb.Dataset(X_val, label=y_val)
    gbm = lgb.train(
        config,
        train_set,
        valid_sets=[test_set],
        valid_names=["eval"],
        verbose_eval=False,
        callbacks=[
            TuneReportCheckpointCallback({
                "binary_error": "eval-binary_error",
                "binary_logloss": "eval-binary_logloss"
            })
        ])
    preds = gbm.predict(X_test)
    pred_labels = np.rint(preds)
    tune.report(
        mean_accuracy=sklearn.metrics.accuracy_score(y_test, pred_labels),
        done=True)

num_actors = 2
num_cpus_per_actor = 70
ray_params = RayParams(
    num_actors=num_actors, cpus_per_actor=num_cpus_per_actor)

config = {
    "objective": "binary",
    "metric": ["binary_error", "binary_logloss"],
    "verbose": -1,
    "boosting_type": tune.grid_search(["gbdt", "dart"]),
    "num_leaves": tune.randint(10, 1000),
    "learning_rate": tune.loguniform(1e-8, 1e-1)
}

analysis = tune.run(
    tune.with_parameters(train_breast_cancer),
    metric="binary_error",
    mode="min",
    config=config,
    num_samples=40,
    resources_per_trial=ray_params.get_tune_resources())

print("Best hyperparameters found were: ", analysis.best_config)

ERROR:

<_InactiveRpcError of RPC that terminated with:
	status = StatusCode.RESOURCE_EXHAUSTED
	details = "Sent message larger than max (580804126 vs. 536870912)"
	debug_error_string = "{"created":"@1645562954.671849508","description":"Sent message larger than max (580804126 vs. 536870912)","file":"src/core/ext/filters/message_size/message_size_filter.cc","file_line":268,"grpc_status":8}"

I am trying to tune LightGBM parameters on a dataset of 6 million rows. Is there a better way to handle such large datasets?

@xwjiang2010 That worked perfectly. What if I were to read in a Spark dataframe when doing this on a real-world dataset? Do you think I should do the following within train_breast_cancer?

  • Read into a Spark df
  • Convert to a pandas dataframe
  • If I modify the function to convert the Spark df to pandas inside it, I get the following error:

    def train_breast_cancer(config: dict):
        df_2 = df.toPandas()

        def train_x_y(train_gbm_x):
            x_train = train_gbm_x[
                train_gbm_x.columns.difference(
                    ["emp_id", "mstr_dt_x", "mstr_dt_y", "total_attrition"]
                )
            ]
            y_train = train_gbm_x[["total_attrition"]]
            return x_train, y_train

        # Split into train and test set
        X, y = train_x_y(df_2)
        train_x, test_x, train_y, test_y = train_test_split(
            X, y, test_size=0.25)
        # Build input matrices for XGBoost
        train_set = xgb.DMatrix(train_x, label=train_y)
        test_set = xgb.DMatrix(test_x, label=test_y)
        # Train the classifier, using the Tune callback
        xgb.train(
            config,
            train_set,
            evals=[(test_set, "eval")],
            verbose_eval=False,
            callbacks=[TuneReportCheckpointCallback(filename="model.xgb")])
    

    error 1:

    ray.cloudpickle.dumps(<class 'ray.tune.function_runner.wrap_function.<locals>.ImplicitFunc'>) failed.
    To check which non-serializable variables are captured in scope, re-run the ray script with 'RAY_PICKLE_VERBOSE_DEBUG=1'. Other options: 
    -Try reproducing the issue by calling `pickle.dumps(trainable)`. 
    -If the error is typing-related, try removing the type annotations and try again.
    

    If I move the `df_2 = df.toPandas()` outside the function, it gives the following error:

    <_InactiveRpcError of RPC that terminated with:
    	status = StatusCode.INTERNAL
    	details = "Exception serializing request!"
    	debug_error_string = "None"
                  

    Great question. Ray Datasets is designed exactly for that.
    It solves the last-mile problem and serves as a connector between ETL and Ray ML.
    Take a look here: Datasets: Distributed Data Loading and Compute — Ray v1.10.0
    It offers easy conversion from a Spark DF.
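
    For illustration, a rough sketch of that hand-off (spark, "my_table", and the trainable's data argument are placeholders; depending on the Ray version, from_spark may require the RayDP integration):

    import ray.data

    spark_df = spark.table("my_table")      # placeholder Spark dataframe
    ds = ray.data.from_spark(spark_df)      # pull the Spark DF into a Ray Dataset
    pandas_df = ds.to_pandas()              # or keep it as a Dataset and iterate over blocks

    # The result can then be handed to the trainable once via the object store, e.g.:
    # tune.run(tune.with_parameters(train_breast_cancer, data=pandas_df), ...)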

    Hi All,

    I had a similar issue when using tune.run. Ray has already been updated to 1.12 and my data is only 850 MB. Any help will be appreciated. :slight_smile:

    Code snippet

    def data_store(self):
        x_train, y_train, x_test, y_test = \
            ray.put(self.x_train), ray.put(self.y_train), \
            ray.put(self.x_test), ray.put(self.y_test)
        return x_train, y_train, x_test, y_test

    def train_xgb_cls_model_single(self, tuning_param=None):
        model = self.make_xgb_cls_model(tuning_param)
        ray_x_train, ray_y_train, ray_x_test, ray_y_test = self.data_store()
        eval_set = [(ray.get(ray_x_train), ray.get(ray_y_train)), (ray.get(ray_x_test), ray.get(ray_y_test))]
        model_params = model.get_params()
        cv_train = xgb.DMatrix(ray.get(ray_x_train), label=ray.get(ray_y_train), missing=None, weight=None,
                               silent=False,
                               feature_names=ray.get(ray_x_train).columns, feature_types=None, nthread=-1)
        xgb_cv_error = xgb.cv(dtrain=cv_train, params=model_params, folds=5,
                              num_boost_round=model_params['n_estimators'], early_stopping_rounds=30, metrics="logloss",
                              as_pandas=True, seed=101, maximize=False, verbose_eval=3, shuffle=False)
        model.fit(ray.get(ray_x_train), ray.get(ray_y_train), eval_metric="logloss",
                  early_stopping_rounds=30, eval_set=eval_set,
                  verbose=True)
        model_predictions = model.predict(ray.get(ray_x_test))
        logloss = log_loss(ray.get(ray_y_test), model_predictions)
        tune.report(logloss=logloss, done=True)
        return model

    def hyper_tune_model(self):
        mod_params = {
            "n_estimators": tune.randint(20, 512),
            "max_depth": tune.randint(3, 15),
            "min_child_weight": tune.randint(1, 7),
            "subsample": tune.uniform(0.3, 1.0),
            "learning_rate": tune.loguniform(0.000001, 1.0),
            "reg_alpha": tune.uniform(0.1, 5.0),
            "reg_lambda": tune.uniform(0.1, 5.0),
            "num_parallel_tree": tune.randint(20, 500),
            "gamma": tune.uniform(0.1, 10.0),
            "max_bin": tune.randint(10, 512),
            "colsample_bytree": tune.loguniform(0.5, 1.0),
            "colsample_bylevel": tune.loguniform(0.5, 1.0),
            "max_delta_step": tune.randint(0, 5),
        }
        optuna_opt = OptunaSearch(
            metric="logloss",
            seed=101,
            mode="min")
        analysis = tune.run(
            tune.with_parameters(self.train_xgb_cls_model_single),
            reuse_actors=True,
            metric="logloss",
            checkpoint_at_end=True,
            max_concurrent_trials=None,
            mode="min",
            resources_per_trial={"cpu": 64, "gpu": 3},
            config=mod_params,
            search_alg=optuna_opt,
            verbose=1,
            resume="AUTO",
            num_samples=1000)
        best_result, best_params = analysis.best_dataframe, analysis.best_config
        return best_params, best_result
    

    Traceback

    Traceback (most recent call last):
      File "/home/tigertimwu/anaconda3/lib/python3.9/site-packages/ray/tune/experiment.py", line 164, in __init__
        self._run_identifier = Experiment.register_if_needed(run)
      File "/home/tigertimwu/anaconda3/lib/python3.9/site-packages/ray/tune/experiment.py", line 353, in register_if_needed
        register_trainable(name, run_object)
      File "/home/tigertimwu/anaconda3/lib/python3.9/site-packages/ray/tune/registry.py", line 84, in register_trainable
        _global_registry.register(TRAINABLE_CLASS, name, trainable)
      File "/home/tigertimwu/anaconda3/lib/python3.9/site-packages/ray/tune/registry.py", line 168, in register
        self.flush_values()
      File "/home/tigertimwu/anaconda3/lib/python3.9/site-packages/ray/tune/registry.py", line 190, in flush_values
        _internal_kv_put(
      File "/home/tigertimwu/anaconda3/lib/python3.9/site-packages/ray/_private/client_mode_hook.py", line 105, in wrapper
        return func(*args, **kwargs)
      File "/home/tigertimwu/anaconda3/lib/python3.9/site-packages/ray/experimental/internal_kv.py", line 88, in _internal_kv_put
        return global_gcs_client.internal_kv_put(key, value, overwrite, namespace) == 0
      File "/home/tigertimwu/anaconda3/lib/python3.9/site-packages/ray/_private/gcs_utils.py", line 104, in wrapper
        return f(self, *args, **kwargs)
      File "/home/tigertimwu/anaconda3/lib/python3.9/site-packages/ray/_private/gcs_utils.py", line 195, in internal_kv_put
        reply = self._kv_stub.InternalKVPut(req)
      File "/home/tigertimwu/anaconda3/lib/python3.9/site-packages/grpc/_channel.py", line 946, in __call__
        return _end_unary_response_blocking(state, call, False, None)
      File "/home/tigertimwu/anaconda3/lib/python3.9/site-packages/grpc/_channel.py", line 849, in _end_unary_response_blocking
        raise _InactiveRpcError(state)
    grpc._channel._InactiveRpcError: <_InactiveRpcError of RPC that terminated with:
    	status = StatusCode.RESOURCE_EXHAUSTED
    	details = "Sent message larger than max (1789998348 vs. 536870912)"
    	debug_error_string = "{"created":"@1651587413.094226455","description":"Sent message larger than max (1789998348 vs. 536870912)","file":"src/core/ext/filters/message_size/message_size_filter.cc","file_line":264,"grpc_status":8}"
    During handling of the above exception, another exception occurred:
    Traceback (most recent call last):
      File "/home/tigertimwu/anaconda3/lib/python3.9/site-packages/IPython/core/interactiveshell.py", line 3444, in run_code
        exec(code_obj, self.user_global_ns, self.user_ns)
      File "<ipython-input-5-febb8a874c56>", line 1, in <module>
        full = XGBoostModel(**params1 | bb).hyper_tune_model()
      File "<ipython-input-3-b6b4eff61339>", line 128, in hyper_tune_model
        analysis = tune.run(
      File "/home/tigertimwu/anaconda3/lib/python3.9/site-packages/ray/tune/tune.py", line 470, in run
        experiments[i] = Experiment(
      File "/home/tigertimwu/anaconda3/lib/python3.9/site-packages/ray/tune/experiment.py", line 167, in __init__
        raise TuneError(
    ray.tune.error.TuneError: The Trainable/training function is too large for grpc resource limit. Check that its definition is not implicitly capturing a large array or other object in scope. Tip: use tune.with_parameters() to put large objects in the Ray object store. 
    Original exception: Traceback (most recent call last):
      File "/home/tigertimwu/anaconda3/lib/python3.9/site-packages/ray/tune/experiment.py", line 164, in __init__
        self._run_identifier = Experiment.register_if_needed(run)
      File "/home/tigertimwu/anaconda3/lib/python3.9/site-packages/ray/tune/experiment.py", line 353, in register_if_needed
        register_trainable(name, run_object)
      File "/home/tigertimwu/anaconda3/lib/python3.9/site-packages/ray/tune/registry.py", line 84, in register_trainable
        _global_registry.register(TRAINABLE_CLASS, name, trainable)
      File "/home/tigertimwu/anaconda3/lib/python3.9/site-packages/ray/tune/registry.py", line 168, in register
        self.flush_values()
      File "/home/tigertimwu/anaconda3/lib/python3.9/site-packages/ray/tune/registry.py", line 190, in flush_values
        _internal_kv_put(
      File "/home/tigertimwu/anaconda3/lib/python3.9/site-packages/ray/_private/client_mode_hook.py", line 105, in wrapper
        return func(*args, **kwargs)
      File "/home/tigertimwu/anaconda3/lib/python3.9/site-packages/ray/experimental/internal_kv.py", line 88, in _internal_kv_put
        return global_gcs_client.internal_kv_put(key, value, overwrite, namespace) == 0
      File "/home/tigertimwu/anaconda3/lib/python3.9/site-packages/ray/_private/gcs_utils.py", line 104, in wrapper
        return f(self, *args, **kwargs)
      File "/home/tigertimwu/anaconda3/lib/python3.9/site-packages/ray/_private/gcs_utils.py", line 195, in internal_kv_put
        reply = self._kv_stub.InternalKVPut(req)
      File "/home/tigertimwu/anaconda3/lib/python3.9/site-packages/grpc/_channel.py", line 946, in __call__
        return _end_unary_response_blocking(state, call, False, None)
      File "/home/tigertimwu/anaconda3/lib/python3.9/site-packages/grpc/_channel.py", line 849, in _end_unary_response_blocking
        raise _InactiveRpcError(state)
    grpc._channel._InactiveRpcError: <_InactiveRpcError of RPC that terminated with:
    	status = StatusCode.RESOURCE_EXHAUSTED
    	details = "Sent message larger than max (1789998348 vs. 536870912)"
    	debug_error_string = "{"created":"@1651587413.094226455","description":"Sent message larger than max (1789998348 vs. 536870912)","file":"src/core/ext/filters/message_size/message_size_filter.cc","file_line":264,"grpc_status":8}"
                  

    The reason is that self.x_train etc. are probably big, and they are captured implicitly in your train_xgb_cls_model_single.
    Could you restructure the code so that they are passed into the train function instead?
    You can try

    def train_func(config, data):
        ...

    and then

    tune.with_parameters(train_func, data=data)  # this will automatically put `data` into the object store without you having to use `ray.put` explicitly
                  

    Thanks @xwjiang2010.

    I have changed the code in train_xgb_cls_model_single by adding a parameter called "data", which is a list containing x_train, y_train, x_test and y_test, but it seems that I am still getting the same error from tune.run. Am I doing something wrong?

    Code snippet

    def train_xgb_cls_model_single(self, data, tuning_param=None, cv=False, cv_folds_t=None):
        model = self.make_xgb_cls_model(tuning_param)
        #Fetch train test data from data, which is a list
        x_train, y_train, x_test, y_test = data[0], data[1], data[2], data[3]
        eval_set = [(x_train, y_train), (x_test, y_test)]
        model_params = model.get_params()
        cv_train = xgb.DMatrix(x_train, label=y_train, missing=None, weight=None, silent=False,
                               feature_names=x_train.columns, feature_types=None, nthread=-1)
        xgb_cv_error = xgb.cv(dtrain=cv_train, params=model_params, folds=5,
                              num_boost_round=model_params['n_estimators'], early_stopping_rounds=30, metrics="logloss",
                              as_pandas=True, seed=101, maximize=False, verbose_eval=3, shuffle=False)
        model.fit(x_train, y_train, eval_metric="logloss",
                  early_stopping_rounds=30, eval_set=eval_set,
                  verbose=True)
        model_predictions = model.predict(x_test)
        logloss = log_loss(y_test, model_predictions)
        tune.report(logloss=logloss, done=True)
        return model
    def hyper_tune_model(self):
        mod_params = {
            "n_estimators": tune.randint(20, 512),
            "max_depth": tune.randint(3, 15),
            "min_child_weight": tune.randint(1, 7),
            "subsample": tune.uniform(0.3, 1.0),
            "learning_rate": tune.loguniform(0.000001, 1.0),
            "reg_alpha": tune.uniform(0.1, 5.0),
            "reg_lambda": tune.uniform(0.1, 5.0),
            "num_parallel_tree":tune.randint(20, 500),
            "gamma" :tune.uniform(0.1, 10.0),
            "max_bin":tune.randint(10, 512),
            "colsample_bytree":tune.loguniform(0.5, 1.0),
            "colsample_bylevel": tune.loguniform(0.5, 1.0),
            "max_delta_step":tune.randint(0, 5),
        optuna_opt = OptunaSearch(
            metric="logloss",
            seed=101,
            mode="min")
        analysis = tune.run(
            tune.with_parameters(self.train_xgb_cls_model_single,data=[self.x_train,self.y_train,self.x_test,self.y_test]),
            reuse_actors=True,
            metric="logloss",
            checkpoint_at_end=True,
            max_concurrent_trials=None,
            mode="min",
            resources_per_trial={"cpu":64, "gpu": 3},
            config=mod_params,
            search_alg = optuna_opt,
            verbose=1,
            resume="AUTO",
            num_samples=1000)
        best_result, best_params = analysis.best_dataframe, analysis.best_config
        return best_params, best_result
    

    Traceback

    Traceback (most recent call last):
      File "/home/tigertimwu/anaconda3/lib/python3.9/site-packages/ray/tune/experiment.py", line 164, in __init__
        self._run_identifier = Experiment.register_if_needed(run)
      File "/home/tigertimwu/anaconda3/lib/python3.9/site-packages/ray/tune/experiment.py", line 353, in register_if_needed
        register_trainable(name, run_object)
      File "/home/tigertimwu/anaconda3/lib/python3.9/site-packages/ray/tune/registry.py", line 84, in register_trainable
        _global_registry.register(TRAINABLE_CLASS, name, trainable)
      File "/home/tigertimwu/anaconda3/lib/python3.9/site-packages/ray/tune/registry.py", line 168, in register
        self.flush_values()
      File "/home/tigertimwu/anaconda3/lib/python3.9/site-packages/ray/tune/registry.py", line 190, in flush_values
        _internal_kv_put(
      File "/home/tigertimwu/anaconda3/lib/python3.9/site-packages/ray/_private/client_mode_hook.py", line 105, in wrapper
        return func(*args, **kwargs)
      File "/home/tigertimwu/anaconda3/lib/python3.9/site-packages/ray/experimental/internal_kv.py", line 88, in _internal_kv_put
        return global_gcs_client.internal_kv_put(key, value, overwrite, namespace) == 0
      File "/home/tigertimwu/anaconda3/lib/python3.9/site-packages/ray/_private/gcs_utils.py", line 104, in wrapper
        return f(self, *args, **kwargs)
      File "/home/tigertimwu/anaconda3/lib/python3.9/site-packages/ray/_private/gcs_utils.py", line 195, in internal_kv_put
        reply = self._kv_stub.InternalKVPut(req)
      File "/home/tigertimwu/anaconda3/lib/python3.9/site-packages/grpc/_channel.py", line 946, in __call__
        return _end_unary_response_blocking(state, call, False, None)
      File "/home/tigertimwu/anaconda3/lib/python3.9/site-packages/grpc/_channel.py", line 849, in _end_unary_response_blocking
        raise _InactiveRpcError(state)
    grpc._channel._InactiveRpcError: <_InactiveRpcError of RPC that terminated with:
    	status = StatusCode.RESOURCE_EXHAUSTED
    	details = "Sent message larger than max (2091700267 vs. 1073741824)"
    	debug_error_string = "{"created":"@1651625808.218316975","description":"Sent message larger than max (2091700267 vs. 1073741824)","file":"src/core/ext/filters/message_size/message_size_filter.cc","file_line":264,"grpc_status":8}"
    During handling of the above exception, another exception occurred:
    Traceback (most recent call last):
      File "/home/tigertimwu/anaconda3/lib/python3.9/site-packages/IPython/core/interactiveshell.py", line 3444, in run_code
        exec(code_obj, self.user_global_ns, self.user_ns)
      File "<ipython-input-5-febb8a874c56>", line 1, in <module>
        full = XGBoostModel(**params1 | bb).hyper_tune_model()
      File "<ipython-input-4-5fa792b9aef4>", line 124, in hyper_tune_model
        analysis = tune.run(
      File "/home/tigertimwu/anaconda3/lib/python3.9/site-packages/ray/tune/tune.py", line 470, in run
        experiments[i] = Experiment(
      File "/home/tigertimwu/anaconda3/lib/python3.9/site-packages/ray/tune/experiment.py", line 167, in __init__
        raise TuneError(
    ray.tune.error.TuneError: The Trainable/training function is too large for grpc resource limit. Check that its definition is not implicitly capturing a large array or other object in scope. Tip: use tune.with_parameters() to put large objects in the Ray object store. 
    Original exception: Traceback (most recent call last):
      File "/home/tigertimwu/anaconda3/lib/python3.9/site-packages/ray/tune/experiment.py", line 164, in __init__
        self._run_identifier = Experiment.register_if_needed(run)
      File "/home/tigertimwu/anaconda3/lib/python3.9/site-packages/ray/tune/experiment.py", line 353, in register_if_needed
        register_trainable(name, run_object)
      File "/home/tigertimwu/anaconda3/lib/python3.9/site-packages/ray/tune/registry.py", line 84, in register_trainable
        _global_registry.register(TRAINABLE_CLASS, name, trainable)
      File "/home/tigertimwu/anaconda3/lib/python3.9/site-packages/ray/tune/registry.py", line 168, in register
        self.flush_values()
      File "/home/tigertimwu/anaconda3/lib/python3.9/site-packages/ray/tune/registry.py", line 190, in flush_values
        _internal_kv_put(
      File "/home/tigertimwu/anaconda3/lib/python3.9/site-packages/ray/_private/client_mode_hook.py", line 105, in wrapper
        return func(*args, **kwargs)
      File "/home/tigertimwu/anaconda3/lib/python3.9/site-packages/ray/experimental/internal_kv.py", line 88, in _internal_kv_put
        return global_gcs_client.internal_kv_put(key, value, overwrite, namespace) == 0
      File "/home/tigertimwu/anaconda3/lib/python3.9/site-packages/ray/_private/gcs_utils.py", line 104, in wrapper
        return f(self, *args, **kwargs)
      File "/home/tigertimwu/anaconda3/lib/python3.9/site-packages/ray/_private/gcs_utils.py", line 195, in internal_kv_put
        reply = self._kv_stub.InternalKVPut(req)
      File "/home/tigertimwu/anaconda3/lib/python3.9/site-packages/grpc/_channel.py", line 946, in __call__
        return _end_unary_response_blocking(state, call, False, None)
      File "/home/tigertimwu/anaconda3/lib/python3.9/site-packages/grpc/_channel.py", line 849, in _end_unary_response_blocking
        raise _InactiveRpcError(state)
    grpc._channel._InactiveRpcError: <_InactiveRpcError of RPC that terminated with:
    	status = StatusCode.RESOURCE_EXHAUSTED
    	details = "Sent message larger than max (2091700267 vs. 1073741824)"
    	debug_error_string = "{"created":"@1651625808.218316975","description":"Sent message larger than max (2091700267 vs. 1073741824)","file":"src/core/ext/filters/message_size/message_size_filter.cc","file_line":264,"grpc_status":8}"
                  


    Update: I ran the code again and I think I got the same error as naadvar.

    error

    Traceback (most recent call last):
      File "/home/tigertimwu/anaconda3/lib/python3.9/site-packages/IPython/core/interactiveshell.py", line 3444, in run_code
        exec(code_obj, self.user_global_ns, self.user_ns)
      File "<ipython-input-2-44929bcce122>", line 1434, in <module>
        full = XGBoostModel(**params1 | bb).create_tuned_base_model()
      File "<ipython-input-2-44929bcce122>", line 825, in create_tuned_base_model
        best_params, best_result = self.hyper_tune_model()
      File "<ipython-input-2-44929bcce122>", line 804, in hyper_tune_model
        analysis = tune.run(
      File "/home/tigertimwu/anaconda3/lib/python3.9/site-packages/ray/tune/tune.py", line 470, in run
        experiments[i] = Experiment(
      File "/home/tigertimwu/anaconda3/lib/python3.9/site-packages/ray/tune/experiment.py", line 176, in __init__
        raise e
      File "/home/tigertimwu/anaconda3/lib/python3.9/site-packages/ray/tune/experiment.py", line 164, in __init__
        self._run_identifier = Experiment.register_if_needed(run)
      File "/home/tigertimwu/anaconda3/lib/python3.9/site-packages/ray/tune/experiment.py", line 353, in register_if_needed
        register_trainable(name, run_object)
      File "/home/tigertimwu/anaconda3/lib/python3.9/site-packages/ray/tune/registry.py", line 84, in register_trainable
        _global_registry.register(TRAINABLE_CLASS, name, trainable)
      File "/home/tigertimwu/anaconda3/lib/python3.9/site-packages/ray/tune/registry.py", line 168, in register
        self.flush_values()
      File "/home/tigertimwu/anaconda3/lib/python3.9/site-packages/ray/tune/registry.py", line 190, in flush_values
        _internal_kv_put(
      File "/home/tigertimwu/anaconda3/lib/python3.9/site-packages/ray/_private/client_mode_hook.py", line 105, in wrapper
        return func(*args, **kwargs)
      File "/home/tigertimwu/anaconda3/lib/python3.9/site-packages/ray/experimental/internal_kv.py", line 88, in _internal_kv_put
        return global_gcs_client.internal_kv_put(key, value, overwrite, namespace) == 0
      File "/home/tigertimwu/anaconda3/lib/python3.9/site-packages/ray/_private/gcs_utils.py", line 104, in wrapper
        return f(self, *args, **kwargs)
      File "/home/tigertimwu/anaconda3/lib/python3.9/site-packages/ray/_private/gcs_utils.py", line 195, in internal_kv_put
        reply = self._kv_stub.InternalKVPut(req)
      File "/home/tigertimwu/anaconda3/lib/python3.9/site-packages/grpc/_channel.py", line 944, in __call__
        state, call, = self._blocking(request, timeout, metadata, credentials,
      File "/home/tigertimwu/anaconda3/lib/python3.9/site-packages/grpc/_channel.py", line 924, in _blocking
        raise rendezvous  # pylint: disable-msg=raising-bad-type
    grpc._channel._InactiveRpcError: <_InactiveRpcError of RPC that terminated with:
    	status = StatusCode.INTERNAL
    	details = "Exception serializing request!"
    	debug_error_string = "None"
                  

    @tigertimwu Thanks for trying it out.

    I think the problem lies in the fact that your train_func is a method of a class instead of a standalone function.

    Consider the following examples:

    ====== using a standalone function ======
    Compare:

    data = np.random.rand(10000, 10000)
    def func(config, data):
    	print(data.size)
    tune.run(tune.with_parameters(func, data=data))       # this works fine
    
    data = np.random.rand(10000, 10000)
    def func(config):
    	print(data.size)
    tune.run(func)    # not working as `data` is captured implicitly
    

    ======== using a class method ==========

    class MyTuneJob:
    	def __init__(self):
    		self.data = np.random.rand(10000, 10000)
    	def func(self, config, data):
    		print(data.size)
    	def hyper_tune_model(self):
    		tune.run(tune.with_parameters(self.func, data=self.data))     # failing as `self` is still captured.
    
    class MyTuneJob:
    	def __init__(self):
    		self.data = np.random.rand(10000, 10000)
    	def func(self, config):
    		print(self.data.size)
    	def hyper_tune_model(self):
    		tune.run(self.func)   # failing not surprisingly...
    

    Long story short, you should try moving the training function out of the class.
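
    For the snippets earlier in the thread, a rough sketch of that restructuring might look like this (class and attribute names mirror the earlier code and are otherwise placeholders):

    import numpy as np
    from ray import tune

    def train_func(config, data):
        x_train, y_train, x_test, y_test = data
        # ... build and fit the model using the hyperparameters in `config` ...
        tune.report(logloss=0.0)  # placeholder metric

    class MyTuneJob:
        def __init__(self):
            # stand-ins for x_train, y_train, x_test, y_test
            self.data = [np.random.rand(1000, 10)] * 4

        def hyper_tune_model(self):
            # `train_func` lives at module level, so pickling it does not drag
            # `self` (and all the attached data) along with it.
            return tune.run(
                tune.with_parameters(train_func, data=self.data),
                config={"max_depth": tune.randint(3, 15)})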

    Hi all, I tried to pass the data into my tuner file, but I still receive two errors:

    _InactiveRpcError: <_InactiveRpcError of RPC that terminated with:
    status = StatusCode.RESOURCE_EXHAUSTED
    details = "Sent message larger than max (845011379 vs. 536870912)"
    debug_error_string = "UNKNOWN:Error received from peer 10.178.134.118:60685 {created_time:"2023-03-30T17:58:36.265450971-04:00", grpc_status:8, grpc_message:"Sent message larger than max (845011379 vs. 536870912)"}"

    During handling of the above exception, another exception occurred:

    TuneError: The Trainable/training function is too large for grpc resource limit. Check that its definition is not implicitly capturing a large array or other object in scope. Tip: use tune.with_parameters() to put large objects in the Ray object store.
    Original exception: Traceback (most recent call last):

    I tried deleting all the information other than the model training, and I still got the message-size error above. Can we increase its upper bound? Thanks a lot.

    Interesting! I just resolved this problem.

    My situation is: I call another helper function in my train function, and that helper uses some large data. Although I passed the data via the arguments of train(), I still needed to consider how the data gets into my helper function. If I move the helper function inside my training function, I can run the tuner.
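
    A minimal sketch of that pattern, with invented names, just to illustrate the point:

    import numpy as np
    from ray import tune

    big_table = np.random.rand(5000, 5000)   # stand-in for the large data

    # Problematic variant: a module-level helper that closes over `big_table`
    # gets pickled together with the trainable and blows past the gRPC limit.
    # def preprocess(x):
    #     return x @ big_table

    def train_fn(config, data):
        def preprocess(x):           # helper defined inside the trainable:
            return x @ data          # it only sees what was passed in explicitly
        score = float(preprocess(np.ones(data.shape[0])).mean())
        tune.report(score=score)

    tune.run(tune.with_parameters(train_fn, data=big_table),
             config={"lr": tune.loguniform(1e-4, 1e-1)})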

    Hi, I also had the same error, and I wonder: is it possible to use tune.with_parameters just because I have problems with the dataset size, not necessarily because I want to tune hyperparameters?

    For a quick PoC, if it works with a large dataset then it should also work with a smaller dataset using tune.with_parameters, so I tried to apply it to the Fashion-MNIST dataset. Based on this, I modified the code (truncated a bit):

    import argparse
    from typing import Dict
    from ray.air import session
    import torch
    from torch import nn
    from torch.utils.data import DataLoader
    from torchvision import datasets
    from torchvision.transforms import ToTensor
    import ray.train as train
    from ray.train.torch import TorchTrainer
    from ray.air.config import ScalingConfig
    from ray.tune import Tuner
    from ray import tune
    # Download training data from open datasets.
    training_data = datasets.FashionMNIST(
        root="~/data",
        train=True,
        download=True,
        transform=ToTensor(),
    )
    testing_data = datasets.FashionMNIST(...
       # exactly same as docs
    # Define model
    class NeuralNetwork(nn.Module):
        # exactly same as docs
    def train_epoch(dataloader, model, loss_fn, optimizer):
        # exactly same as docs
    def validate_epoch(dataloader, model, loss_fn):
        # exactly same as docs
    def train_func_tune(config, train_data, test_data):
        batch_size = config["batch_size"]
        lr = config["lr"]
        epochs = config["epochs"]
        worker_batch_size = batch_size // session.get_world_size()
        # Create data loaders.
        train_dataloader = DataLoader(train_data, batch_size=worker_batch_size)
        test_dataloader = DataLoader(test_data, batch_size=worker_batch_size)
        train_dataloader = train.torch.prepare_data_loader(train_dataloader)
        test_dataloader = train.torch.prepare_data_loader(test_dataloader)
        # Create model.
        model = NeuralNetwork()
        model = train.torch.prepare_model(model)
        loss_fn = nn.CrossEntropyLoss()
        optimizer = torch.optim.SGD(model.parameters(), lr=lr)
        for _ in range(epochs):
            train_epoch(train_dataloader, model, loss_fn, optimizer)
            loss = validate_epoch(test_dataloader, model, loss_fn)
            session.report(dict(loss=loss))
    if __name__ == "__main__":
        tuner = Tuner(
            tune.with_parameters(train_func_tune,
                config=config,
                train_data=training_data, test_data=test_data))
        results = tuner.fit()
        results
    

    Then it only says the trials did not complete; it doesn't specify which part is causing it.

    [Screenshot of the Tune output showing the incomplete trials]

    Any help is appreciated.

    Update: following a suggestion, I cd'd into the ray_results dir and found this error.txt. Why does it complain about my config getting multiple values?

    Failure # 1 (occurred at 2023-04-10_10-46-11)
    ray::ImplicitFunc.train() (pid=863, ip=10.42.59.67, repr=train_func_tune)
      File "/opt/conda/lib/python3.8/site-packages/ray/tune/trainable/trainable.py", line 368, in train
        raise skipped from exception_cause(skipped)
      File "/opt/conda/lib/python3.8/site-packages/ray/tune/trainable/function_trainable.py", line 337, in entrypoint
        return self._trainable_func(
      File "/opt/conda/lib/python3.8/site-packages/ray/tune/trainable/function_trainable.py", line 654, in _trainable_func
        output = fn()
      File "/opt/conda/lib/python3.8/site-packages/ray/tune/trainable/util.py", line 406, in _inner
        return inner(config, checkpoint_dir=None)
      File "/opt/conda/lib/python3.8/site-packages/ray/tune/trainable/util.py", line 398, in inner
        return trainable(config, **fn_kwargs)
    TypeError: train_func_tune() got multiple values for argument 'config'
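
    If I read that traceback right, `config` is being passed twice: Tune already injects it as the first positional argument of the trainable, and it is passed again through tune.with_parameters(..., config=config). A sketch of how the call could look instead, with the search space handed to the Tuner via param_space (the values here are placeholders, not a verified fix):

    from ray import tune
    from ray.tune import Tuner

    tuner = Tuner(
        tune.with_parameters(
            train_func_tune,                # Tune supplies `config` by itself
            train_data=training_data,
            test_data=testing_data),
        param_space={"lr": tune.loguniform(1e-4, 1e-1),
                     "batch_size": 64,
                     "epochs": 2},
    )
    results = tuner.fit()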