I am using Ray Tune on a Google Colab TPU large instance. With a small dataset it runs, but with my full-size dataset (300 MB) I get the fatal error pasted below. The data is stored as a Parquet file on Google Drive and makes its way into tune.run via functools.partial applied to my training function with a pandas DataFrame.
I cannot tell if there is some setting that I can configure to allow a larger “message” in the grpc module.
Python 3.7
ray[tune] 1.9.0.
Any insight is appreciated.
Thanks,
/usr/local/lib/python3.7/dist-packages/ray/tune/tune.py in run(run_or_experiment, name, metric, mode, stop, time_budget_s, config, resources_per_trial, num_samples, local_dir, search_alg, scheduler, keep_checkpoints_num, checkpoint_score_attr, checkpoint_freq, checkpoint_at_end, verbose, progress_reporter, log_to_file, trial_name_creator, trial_dirname_creator, sync_config, export_formats, max_failures, fail_fast, restore, server_port, resume, reuse_actors, trial_executor, raise_on_failed_trial, callbacks, max_concurrent_trials, queue_trials, loggers, _remote)
442 export_formats=export_formats,
443 max_failures=max_failures,
--> 444 restore=restore)
445 else:
446 logger.debug("Ignoring some parameters passed into tune.run.")
/usr/local/lib/python3.7/dist-packages/ray/tune/experiment.py in __init__(self, name, run, stop, time_budget_s, config, resources_per_trial, num_samples, local_dir, sync_config, trial_name_creator, trial_dirname_creator, log_to_file, checkpoint_freq, checkpoint_at_end, keep_checkpoints_num, checkpoint_score_attr, export_formats, max_failures, restore)
111 "checkpointable function. You can specify checkpoints "
112 "within your trainable function.")
--> 113 self._run_identifier = Experiment.register_if_needed(run)
114 self.name = name or self._run_identifier
/usr/local/lib/python3.7/dist-packages/ray/tune/experiment.py in register_if_needed(cls, run_object)
256 "No name detected on trainable. Using {}.".format(name))
257 try:
--> 258 register_trainable(name, run_object)
259 except (TypeError, PicklingError) as e:
260 extra_msg = ("Other options: "
/usr/local/lib/python3.7/dist-packages/ray/tune/registry.py in register_trainable(name, trainable, warn)
74 raise TypeError("Second argument must be convertable to Trainable",
75 trainable)
---> 76 _global_registry.register(TRAINABLE_CLASS, name, trainable)
/usr/local/lib/python3.7/dist-packages/ray/tune/registry.py in register(self, category, key, value)
150 self._to_flush[(category, key)] = pickle.dumps_debug(value)
151 if _internal_kv_initialized():
--> 152 self.flush_values()
154 def contains(self, category, key):
/usr/local/lib/python3.7/dist-packages/ray/tune/registry.py in flush_values(self)
173 for (category, key), value in self._to_flush.items():
174 _internal_kv_put(
--> 175 _make_key(self._prefix, category, key), value, overwrite=True)
176 self._to_flush.clear()
/usr/local/lib/python3.7/dist-packages/ray/_private/client_mode_hook.py in wrapper(*args, **kwargs)
103 if func.__name__ != "init" or is_client_mode_enabled_by_default:
104 return getattr(ray, func.__name__)(*args, **kwargs)
--> 105 return func(*args, **kwargs)
107 return wrapper
/usr/local/lib/python3.7/dist-packages/ray/experimental/internal_kv.py in _internal_kv_put(key, value, overwrite, namespace)
77 overwrite, bool)
78 return global_gcs_client.internal_kv_put(key, value, overwrite,
---> 79 namespace) == 0
/usr/local/lib/python3.7/dist-packages/ray/_private/gcs_utils.py in wrapper(self, *args, **kwargs)
128 while True:
129 try:
--> 130 return f(self, *args, **kwargs)
131 except grpc.RpcError as e:
132 if remaining_retry <= 0:
/usr/local/lib/python3.7/dist-packages/ray/_private/gcs_utils.py in internal_kv_put(self, key, value, overwrite, namespace)
247 req = gcs_service_pb2.InternalKVPutRequest(
248 key=key, value=value, overwrite=overwrite)
--> 249 reply = self._kv_stub.InternalKVPut(req)
250 if reply.status.code == GcsCode.OK:
251 return reply.added_num
/usr/local/lib/python3.7/dist-packages/grpc/_channel.py in __call__(self, request, timeout, metadata, credentials, wait_for_ready, compression)
944 state, call, = self._blocking(request, timeout, metadata, credentials,
945 wait_for_ready, compression)
--> 946 return _end_unary_response_blocking(state, call, False, None)
948 def with_call(self,
/usr/local/lib/python3.7/dist-packages/grpc/_channel.py in _end_unary_response_blocking(state, call, with_call, deadline)
847 return state.response
848 else:
--> 849 raise _InactiveRpcError(state)
_InactiveRpcError: <_InactiveRpcError of RPC that terminated with:
status = StatusCode.RESOURCE_EXHAUSTED
details = "Received message larger than max (355752448 vs. 104857600)"
debug_error_string = "{"created":"@1638857019.251890494","description":"Error received from peer ipv4:172.28.0.2:42493","file":"src/core/lib/surface/call.cc","file_line":1063,"grpc_message":"Received message larger than max (355752448 vs. 104857600)","grpc_status":8}"
Hi @tenderfoot, it’s hard to tell what is happening without a code example. However, since you mentioned that you’re partial'ing your data into the trainable, maybe try using tune.with_parameters instead: Training (tune.Trainable, tune.report) — Ray v1.9.0
tune.with_parameters is essentially the equivalent of functools.partial, with the difference that the arguments are persisted in the Ray object store. This is beneficial for datasets, as only one copy is stored in the object store; with partial you will likely run into the problem that your data is serialized a large number of times, leading to resource exhaustion.
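For illustration, a minimal sketch of the difference (train_fn and df are placeholders for your training function and DataFrame):

from functools import partial
from ray import tune

def train_fn(config, data):
    ...  # your training logic

# functools.partial: `df` is pickled into the trainable and shipped with it,
# so the full DataFrame is re-serialized over and over.
tune.run(partial(train_fn, data=df), config=config)

# tune.with_parameters: `df` goes into the Ray object store once, and each
# trial just fetches a reference to it.
tune.run(tune.with_parameters(train_fn, data=df), config=config)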
Let me know if that helps!
Hi there, I have tried passing my dataset ref via tune.with_parameters instead: [Training (tune.Trainable, tune.report) — Ray v1.9.0]. However, the program gets stuck without any output.
"htop" tells me the virtual memory usage of the process is about 80 GB. I used Ray Tune to do knowledge distillation with Hugging Face Transformers; Transformers has to produce the features before training starts, and those are super large in my case.
# from tune_sklearn import TuneSearchCV
from ray import tune

import sys
sys.stdout.fileno = lambda: False  # workaround for environments where stdout has no fileno()

# Other imports
import numpy as np
import scipy
import sklearn.metrics
from sklearn.datasets import make_classification, load_breast_cancer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import SGDClassifier
import lightgbm as lgb
from ray.tune.integration.lightgbm import TuneReportCheckpointCallback
from ray.tune.integration.mlflow import MLflowLoggerCallback
from lightgbm_ray import RayParams  # assumption: this import was omitted in the original post

X, y = make_classification(n_samples=600000, n_features=120, n_redundant=0,
                           n_classes=2, class_sep=2.5)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.6, random_state=123)
X_test, X_val, y_test, y_val = train_test_split(X_test, y_test, test_size=0.5, random_state=123)

def train_breast_cancer(config):
    # Note: X_train etc. are module-level globals captured implicitly here.
    train_set = lgb.Dataset(X_train, label=y_train)
    test_set = lgb.Dataset(X_val, label=y_val)
    gbm = lgb.train(
        config,
        train_set,
        valid_sets=[test_set],
        valid_names=["eval"],
        verbose_eval=False,
        callbacks=[
            TuneReportCheckpointCallback({
                "binary_error": "eval-binary_error",
                "binary_logloss": "eval-binary_logloss",
            })
        ])
    preds = gbm.predict(X_test)
    pred_labels = np.rint(preds)
    tune.report(
        mean_accuracy=sklearn.metrics.accuracy_score(y_test, pred_labels),
        done=True)

num_actors = 2
num_cpus_per_actor = 70
ray_params = RayParams(
    num_actors=num_actors, cpus_per_actor=num_cpus_per_actor)

config = {
    "objective": "binary",
    "metric": ["binary_error", "binary_logloss"],
    "verbose": -1,
    "boosting_type": tune.grid_search(["gbdt", "dart"]),
    "num_leaves": tune.randint(10, 1000),
    "learning_rate": tune.loguniform(1e-8, 1e-1),
}

analysis = tune.run(
    tune.with_parameters(train_breast_cancer),
    metric="binary_error",
    mode="min",
    config=config,
    num_samples=40,
    resources_per_trial=ray_params.get_tune_resources())

print("Best hyperparameters found were: ", analysis.best_config)
ERROR:
<_InactiveRpcError of RPC that terminated with:
status = StatusCode.RESOURCE_EXHAUSTED
details = "Sent message larger than max (580804126 vs. 536870912)"
debug_error_string = "{"created":"@1645562954.671849508","description":"Sent message larger than max (580804126 vs. 536870912)","file":"src/core/ext/filters/message_size/message_size_filter.cc","file_line":268,"grpc_status":8}"
I am trying to tune LightGBM parameters on a dataset of 6 million rows. Is there a better way to handle such large datasets?
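One pattern that matches the advice earlier in the thread is to pass the arrays into the trainable explicitly through tune.with_parameters, instead of letting the function capture the module-level globals. A minimal sketch against the script above:

def train_breast_cancer(config, X_train, y_train, X_val, y_val, X_test, y_test):
    # same body as above, but the data now arrives as arguments fetched from
    # the Ray object store rather than being pickled into the trainable itself
    ...

analysis = tune.run(
    tune.with_parameters(
        train_breast_cancer,
        X_train=X_train, y_train=y_train,
        X_val=X_val, y_val=y_val,
        X_test=X_test, y_test=y_test),
    metric="binary_error",
    mode="min",
    config=config,
    num_samples=40,
    resources_per_trial=ray_params.get_tune_resources())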
@xwjiang2010 That worked perfectly. What if I were to read in a Spark DataFrame when doing this on a real-world dataset? Do you think I should do the following within train_breast_cancer?
Read into a Spark DataFrame
Convert to a pandas DataFrame
If I modify the function to convert the Spark DataFrame to pandas inside it, I get the following error:
import xgboost as xgb
from sklearn.model_selection import train_test_split
from ray.tune.integration.xgboost import TuneReportCheckpointCallback  # assumption: imports omitted in the original post

def train_breast_cancer(config: dict):
    # `df` is a Spark DataFrame captured implicitly from the enclosing scope
    df_2 = df.toPandas()

    def train_x_y(train_gbm_x):
        x_train = train_gbm_x[
            train_gbm_x.columns.difference(
                ["emp_id", "mstr_dt_x", "mstr_dt_y", "total_attrition"])]
        y_train = train_gbm_x[["total_attrition"]]
        return x_train, y_train

    # Split into train and test set
    X, y = train_x_y(df_2)
    train_x, test_x, train_y, test_y = train_test_split(X, y, test_size=0.25)
    # Build input matrices for XGBoost
    train_set = xgb.DMatrix(train_x, label=train_y)
    test_set = xgb.DMatrix(test_x, label=test_y)
    # Train the classifier, using the Tune callback
    xgb.train(
        config,
        train_set,
        evals=[(test_set, "eval")],
        verbose_eval=False,
        callbacks=[TuneReportCheckpointCallback(filename="model.xgb")])
Error 1:
ray.cloudpickle.dumps(<class 'ray.tune.function_runner.wrap_function.<locals>.ImplicitFunc'>) failed.
To check which non-serializable variables are captured in scope, re-run the ray script with 'RAY_PICKLE_VERBOSE_DEBUG=1'. Other options:
-Try reproducing the issue by calling `pickle.dumps(trainable)`.
-If the error is typing-related, try removing the type annotations and try again.
If I move `df_2 = df.toPandas()` outside the function, then it gives the following error:
<_InactiveRpcError of RPC that terminated with:
status = StatusCode.INTERNAL
details = "Exception serializing request!"
debug_error_string = "None"
Great question. Ray Datasets is designed exactly for that.
It solves the last-mile problem and serves as a connector between ETL and Ray ML.
Take a look here: Datasets: Distributed Data Loading and Compute — Ray v1.10.0
It offers easy conversion from a Spark DataFrame.
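A minimal sketch, assuming RayDP is installed and that spark_df, train_fn, and config are placeholders for your Spark DataFrame, trainable, and search space:

import ray
from ray import tune

# One conversion into the object store; no per-trial re-serialization.
ds = ray.data.from_spark(spark_df)

# Hand the Ray Dataset to the trainable the same way as before.
analysis = tune.run(tune.with_parameters(train_fn, data=ds), config=config)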
Hi all,
I had a similar issue when using tune.run. Ray has already been updated to 1.12 and my data is only 850 MB. Any help will be appreciated.
Traceback (most recent call last):
File "/home/tigertimwu/anaconda3/lib/python3.9/site-packages/ray/tune/experiment.py", line 164, in __init__
self._run_identifier = Experiment.register_if_needed(run)
File "/home/tigertimwu/anaconda3/lib/python3.9/site-packages/ray/tune/experiment.py", line 353, in register_if_needed
register_trainable(name, run_object)
File "/home/tigertimwu/anaconda3/lib/python3.9/site-packages/ray/tune/registry.py", line 84, in register_trainable
_global_registry.register(TRAINABLE_CLASS, name, trainable)
File "/home/tigertimwu/anaconda3/lib/python3.9/site-packages/ray/tune/registry.py", line 168, in register
self.flush_values()
File "/home/tigertimwu/anaconda3/lib/python3.9/site-packages/ray/tune/registry.py", line 190, in flush_values
_internal_kv_put(
File "/home/tigertimwu/anaconda3/lib/python3.9/site-packages/ray/_private/client_mode_hook.py", line 105, in wrapper
return func(*args, **kwargs)
File "/home/tigertimwu/anaconda3/lib/python3.9/site-packages/ray/experimental/internal_kv.py", line 88, in _internal_kv_put
return global_gcs_client.internal_kv_put(key, value, overwrite, namespace) == 0
File "/home/tigertimwu/anaconda3/lib/python3.9/site-packages/ray/_private/gcs_utils.py", line 104, in wrapper
return f(self, *args, **kwargs)
File "/home/tigertimwu/anaconda3/lib/python3.9/site-packages/ray/_private/gcs_utils.py", line 195, in internal_kv_put
reply = self._kv_stub.InternalKVPut(req)
File "/home/tigertimwu/anaconda3/lib/python3.9/site-packages/grpc/_channel.py", line 946, in __call__
return _end_unary_response_blocking(state, call, False, None)
File "/home/tigertimwu/anaconda3/lib/python3.9/site-packages/grpc/_channel.py", line 849, in _end_unary_response_blocking
raise _InactiveRpcError(state)
grpc._channel._InactiveRpcError: <_InactiveRpcError of RPC that terminated with:
status = StatusCode.RESOURCE_EXHAUSTED
details = "Sent message larger than max (1789998348 vs. 536870912)"
debug_error_string = "{"created":"@1651587413.094226455","description":"Sent message larger than max (1789998348 vs. 536870912)","file":"src/core/ext/filters/message_size/message_size_filter.cc","file_line":264,"grpc_status":8}"
During handling of the above exception, another exception occurred:
Traceback (most recent call last):
File "/home/tigertimwu/anaconda3/lib/python3.9/site-packages/IPython/core/interactiveshell.py", line 3444, in run_code
exec(code_obj, self.user_global_ns, self.user_ns)
File "<ipython-input-5-febb8a874c56>", line 1, in <module>
full = XGBoostModel(**params1 | bb).hyper_tune_model()
File "<ipython-input-3-b6b4eff61339>", line 128, in hyper_tune_model
analysis = tune.run(
File "/home/tigertimwu/anaconda3/lib/python3.9/site-packages/ray/tune/tune.py", line 470, in run
experiments[i] = Experiment(
File "/home/tigertimwu/anaconda3/lib/python3.9/site-packages/ray/tune/experiment.py", line 167, in __init__
raise TuneError(
ray.tune.error.TuneError: The Trainable/training function is too large for grpc resource limit. Check that its definition is not implicitly capturing a large array or other object in scope. Tip: use tune.with_parameters() to put large objects in the Ray object store.
Original exception: Traceback (most recent call last):
File "/home/tigertimwu/anaconda3/lib/python3.9/site-packages/ray/tune/experiment.py", line 164, in __init__
self._run_identifier = Experiment.register_if_needed(run)
File "/home/tigertimwu/anaconda3/lib/python3.9/site-packages/ray/tune/experiment.py", line 353, in register_if_needed
register_trainable(name, run_object)
File "/home/tigertimwu/anaconda3/lib/python3.9/site-packages/ray/tune/registry.py", line 84, in register_trainable
_global_registry.register(TRAINABLE_CLASS, name, trainable)
File "/home/tigertimwu/anaconda3/lib/python3.9/site-packages/ray/tune/registry.py", line 168, in register
self.flush_values()
File "/home/tigertimwu/anaconda3/lib/python3.9/site-packages/ray/tune/registry.py", line 190, in flush_values
_internal_kv_put(
File "/home/tigertimwu/anaconda3/lib/python3.9/site-packages/ray/_private/client_mode_hook.py", line 105, in wrapper
return func(*args, **kwargs)
File "/home/tigertimwu/anaconda3/lib/python3.9/site-packages/ray/experimental/internal_kv.py", line 88, in _internal_kv_put
return global_gcs_client.internal_kv_put(key, value, overwrite, namespace) == 0
File "/home/tigertimwu/anaconda3/lib/python3.9/site-packages/ray/_private/gcs_utils.py", line 104, in wrapper
return f(self, *args, **kwargs)
File "/home/tigertimwu/anaconda3/lib/python3.9/site-packages/ray/_private/gcs_utils.py", line 195, in internal_kv_put
reply = self._kv_stub.InternalKVPut(req)
File "/home/tigertimwu/anaconda3/lib/python3.9/site-packages/grpc/_channel.py", line 946, in __call__
return _end_unary_response_blocking(state, call, False, None)
File "/home/tigertimwu/anaconda3/lib/python3.9/site-packages/grpc/_channel.py", line 849, in _end_unary_response_blocking
raise _InactiveRpcError(state)
grpc._channel._InactiveRpcError: <_InactiveRpcError of RPC that terminated with:
status = StatusCode.RESOURCE_EXHAUSTED
details = "Sent message larger than max (1789998348 vs. 536870912)"
debug_error_string = "{"created":"@1651587413.094226455","description":"Sent message larger than max (1789998348 vs. 536870912)","file":"src/core/ext/filters/message_size/message_size_filter.cc","file_line":264,"grpc_status":8}"
The reason is that self.x_train etc. are probably big, and they are captured implicitly in your train_xgb_cls_model_single.
Could you restructure the code so that they are contained in the train function itself?
You can try

def train_func(config, data):
    ...

and then

tune.run(tune.with_parameters(train_func, data=data))  # this will automatically put `data` into the object store without you having to use `ray.put` explicitly
I have changed the code in train_xgb_cls_model_single by adding a parameter called "data", which is a list containing x_train, y_train, x_test and y_test, but it seems that I am still getting the same error from tune.run. Am I doing something wrong?
Traceback (most recent call last):
File "/home/tigertimwu/anaconda3/lib/python3.9/site-packages/ray/tune/experiment.py", line 164, in __init__
self._run_identifier = Experiment.register_if_needed(run)
File "/home/tigertimwu/anaconda3/lib/python3.9/site-packages/ray/tune/experiment.py", line 353, in register_if_needed
register_trainable(name, run_object)
File "/home/tigertimwu/anaconda3/lib/python3.9/site-packages/ray/tune/registry.py", line 84, in register_trainable
_global_registry.register(TRAINABLE_CLASS, name, trainable)
File "/home/tigertimwu/anaconda3/lib/python3.9/site-packages/ray/tune/registry.py", line 168, in register
self.flush_values()
File "/home/tigertimwu/anaconda3/lib/python3.9/site-packages/ray/tune/registry.py", line 190, in flush_values
_internal_kv_put(
File "/home/tigertimwu/anaconda3/lib/python3.9/site-packages/ray/_private/client_mode_hook.py", line 105, in wrapper
return func(*args, **kwargs)
File "/home/tigertimwu/anaconda3/lib/python3.9/site-packages/ray/experimental/internal_kv.py", line 88, in _internal_kv_put
return global_gcs_client.internal_kv_put(key, value, overwrite, namespace) == 0
File "/home/tigertimwu/anaconda3/lib/python3.9/site-packages/ray/_private/gcs_utils.py", line 104, in wrapper
return f(self, *args, **kwargs)
File "/home/tigertimwu/anaconda3/lib/python3.9/site-packages/ray/_private/gcs_utils.py", line 195, in internal_kv_put
reply = self._kv_stub.InternalKVPut(req)
File "/home/tigertimwu/anaconda3/lib/python3.9/site-packages/grpc/_channel.py", line 946, in __call__
return _end_unary_response_blocking(state, call, False, None)
File "/home/tigertimwu/anaconda3/lib/python3.9/site-packages/grpc/_channel.py", line 849, in _end_unary_response_blocking
raise _InactiveRpcError(state)
grpc._channel._InactiveRpcError: <_InactiveRpcError of RPC that terminated with:
status = StatusCode.RESOURCE_EXHAUSTED
details = "Sent message larger than max (2091700267 vs. 1073741824)"
debug_error_string = "{"created":"@1651625808.218316975","description":"Sent message larger than max (2091700267 vs. 1073741824)","file":"src/core/ext/filters/message_size/message_size_filter.cc","file_line":264,"grpc_status":8}"
During handling of the above exception, another exception occurred:
Traceback (most recent call last):
File "/home/tigertimwu/anaconda3/lib/python3.9/site-packages/IPython/core/interactiveshell.py", line 3444, in run_code
exec(code_obj, self.user_global_ns, self.user_ns)
File "<ipython-input-5-febb8a874c56>", line 1, in <module>
full = XGBoostModel(**params1 | bb).hyper_tune_model()
File "<ipython-input-4-5fa792b9aef4>", line 124, in hyper_tune_model
analysis = tune.run(
File "/home/tigertimwu/anaconda3/lib/python3.9/site-packages/ray/tune/tune.py", line 470, in run
experiments[i] = Experiment(
File "/home/tigertimwu/anaconda3/lib/python3.9/site-packages/ray/tune/experiment.py", line 167, in __init__
raise TuneError(
ray.tune.error.TuneError: The Trainable/training function is too large for grpc resource limit. Check that its definition is not implicitly capturing a large array or other object in scope. Tip: use tune.with_parameters() to put large objects in the Ray object store.
Original exception: Traceback (most recent call last):
File "/home/tigertimwu/anaconda3/lib/python3.9/site-packages/ray/tune/experiment.py", line 164, in __init__
self._run_identifier = Experiment.register_if_needed(run)
File "/home/tigertimwu/anaconda3/lib/python3.9/site-packages/ray/tune/experiment.py", line 353, in register_if_needed
register_trainable(name, run_object)
File "/home/tigertimwu/anaconda3/lib/python3.9/site-packages/ray/tune/registry.py", line 84, in register_trainable
_global_registry.register(TRAINABLE_CLASS, name, trainable)
File "/home/tigertimwu/anaconda3/lib/python3.9/site-packages/ray/tune/registry.py", line 168, in register
self.flush_values()
File "/home/tigertimwu/anaconda3/lib/python3.9/site-packages/ray/tune/registry.py", line 190, in flush_values
_internal_kv_put(
File "/home/tigertimwu/anaconda3/lib/python3.9/site-packages/ray/_private/client_mode_hook.py", line 105, in wrapper
return func(*args, **kwargs)
File "/home/tigertimwu/anaconda3/lib/python3.9/site-packages/ray/experimental/internal_kv.py", line 88, in _internal_kv_put
return global_gcs_client.internal_kv_put(key, value, overwrite, namespace) == 0
File "/home/tigertimwu/anaconda3/lib/python3.9/site-packages/ray/_private/gcs_utils.py", line 104, in wrapper
return f(self, *args, **kwargs)
File "/home/tigertimwu/anaconda3/lib/python3.9/site-packages/ray/_private/gcs_utils.py", line 195, in internal_kv_put
reply = self._kv_stub.InternalKVPut(req)
File "/home/tigertimwu/anaconda3/lib/python3.9/site-packages/grpc/_channel.py", line 946, in __call__
return _end_unary_response_blocking(state, call, False, None)
File "/home/tigertimwu/anaconda3/lib/python3.9/site-packages/grpc/_channel.py", line 849, in _end_unary_response_blocking
raise _InactiveRpcError(state)
grpc._channel._InactiveRpcError: <_InactiveRpcError of RPC that terminated with:
status = StatusCode.RESOURCE_EXHAUSTED
details = "Sent message larger than max (2091700267 vs. 1073741824)"
debug_error_string = "{"created":"@1651625808.218316975","description":"Sent message larger than max (2091700267 vs. 1073741824)","file":"src/core/ext/filters/message_size/message_size_filter.cc","file_line":264,"grpc_status":8}"
Update: I ran the code again and I think I got the same error as naadvar got.
Error:
Traceback (most recent call last):
File "/home/tigertimwu/anaconda3/lib/python3.9/site-packages/IPython/core/interactiveshell.py", line 3444, in run_code
exec(code_obj, self.user_global_ns, self.user_ns)
File "<ipython-input-2-44929bcce122>", line 1434, in <module>
full = XGBoostModel(**params1 | bb).create_tuned_base_model()
File "<ipython-input-2-44929bcce122>", line 825, in create_tuned_base_model
best_params, best_result = self.hyper_tune_model()
File "<ipython-input-2-44929bcce122>", line 804, in hyper_tune_model
analysis = tune.run(
File "/home/tigertimwu/anaconda3/lib/python3.9/site-packages/ray/tune/tune.py", line 470, in run
experiments[i] = Experiment(
File "/home/tigertimwu/anaconda3/lib/python3.9/site-packages/ray/tune/experiment.py", line 176, in __init__
raise e
File "/home/tigertimwu/anaconda3/lib/python3.9/site-packages/ray/tune/experiment.py", line 164, in __init__
self._run_identifier = Experiment.register_if_needed(run)
File "/home/tigertimwu/anaconda3/lib/python3.9/site-packages/ray/tune/experiment.py", line 353, in register_if_needed
register_trainable(name, run_object)
File "/home/tigertimwu/anaconda3/lib/python3.9/site-packages/ray/tune/registry.py", line 84, in register_trainable
_global_registry.register(TRAINABLE_CLASS, name, trainable)
File "/home/tigertimwu/anaconda3/lib/python3.9/site-packages/ray/tune/registry.py", line 168, in register
self.flush_values()
File "/home/tigertimwu/anaconda3/lib/python3.9/site-packages/ray/tune/registry.py", line 190, in flush_values
_internal_kv_put(
File "/home/tigertimwu/anaconda3/lib/python3.9/site-packages/ray/_private/client_mode_hook.py", line 105, in wrapper
return func(*args, **kwargs)
File "/home/tigertimwu/anaconda3/lib/python3.9/site-packages/ray/experimental/internal_kv.py", line 88, in _internal_kv_put
return global_gcs_client.internal_kv_put(key, value, overwrite, namespace) == 0
File "/home/tigertimwu/anaconda3/lib/python3.9/site-packages/ray/_private/gcs_utils.py", line 104, in wrapper
return f(self, *args, **kwargs)
File "/home/tigertimwu/anaconda3/lib/python3.9/site-packages/ray/_private/gcs_utils.py", line 195, in internal_kv_put
reply = self._kv_stub.InternalKVPut(req)
File "/home/tigertimwu/anaconda3/lib/python3.9/site-packages/grpc/_channel.py", line 944, in __call__
state, call, = self._blocking(request, timeout, metadata, credentials,
File "/home/tigertimwu/anaconda3/lib/python3.9/site-packages/grpc/_channel.py", line 924, in _blocking
raise rendezvous # pylint: disable-msg=raising-bad-type
grpc._channel._InactiveRpcError: <_InactiveRpcError of RPC that terminated with:
status = StatusCode.INTERNAL
details = "Exception serializing request!"
debug_error_string = "None"
I think the problem lies in the fact that your train_func is a method of a class instead of a standalone function.
Consider the following examples:
====== using a standalone function ======
Compare:

data = np.random.rand(10000, 10000)

def func(config, data):
    print(data.size)

tune.run(tune.with_parameters(func, data=data))  # this works fine

with:

data = np.random.rand(10000, 10000)

def func(config):
    print(data.size)

tune.run(func)  # not working, as `data` is captured implicitly

======== using a class method ==========

class MyTuneJob:
    def __init__(self):
        self.data = np.random.rand(10000, 10000)

    def func(self, config, data):
        print(data.size)

    def hyper_tune_model(self):
        tune.run(tune.with_parameters(self.func, data=self.data))  # failing, as `self` is still captured

class MyTuneJob:
    def __init__(self):
        self.data = np.random.rand(10000, 10000)

    def func(self, config):
        print(self.data.size)

    def hyper_tune_model(self):
        tune.run(self.func)  # failing, not surprisingly

Long story short, you should try moving the training function out of the class.
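For completeness, a sketch of one workable restructuring (illustrative names; the data stays on the class but the trainable becomes a module-level function):

import numpy as np
from ray import tune

def func(config, data):  # standalone trainable: no `self` to capture
    print(data.size)

class MyTuneJob:
    def __init__(self):
        self.data = np.random.rand(10000, 10000)

    def hyper_tune_model(self):
        # Only `self.data` is shipped, via the object store; the pickled
        # trainable itself stays small.
        tune.run(tune.with_parameters(func, data=self.data))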
Hi all, I tried to pass the data into my tuner file, but I still receive two errors:
_InactiveRpcError: <_InactiveRpcError of RPC that terminated with:
status = StatusCode.RESOURCE_EXHAUSTED
details = "Sent message larger than max (845011379 vs. 536870912)"
debug_error_string = "UNKNOWN:Error received from peer 10.178.134.118:60685 {created_time:"2023-03-30T17:58:36.265450971-04:00", grpc_status:8, grpc_message:"Sent message larger than max (845011379 vs. 536870912)"}"
During handling of the above exception, another exception occurred:
TuneError: The Trainable/training function is too large for grpc resource limit. Check that its definition is not implicitly capturing a large array or other object in scope. Tip: use tune.with_parameters() to put large objects in the Ray object store.
Original exception: Traceback (most recent call last):
I tried deleting everything other than the model training, and I still got the message above about the size limit. Can we increase its upper bound? Thanks a lot.
Interesting! I just addressed this problem.
My situation was this: I call another helper function in my train function, and that helper uses some large data. Although I had passed the data via the arguments of train(), the helper function was still capturing large objects from its enclosing scope. After moving the helper function inside my training function, I can run the tuner.
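A minimal sketch of that situation (names are illustrative):

import numpy as np
from ray import tune

big_table = np.random.rand(10000, 10000)  # large module-level object

# Problematic: the helper lives at module scope and closes over `big_table`,
# so pickling `train` drags the helper and the array along with it.
def lookup(i):
    return big_table[i]

def train(config, data):
    print(lookup(0))

# Workaround described above: define the helper inside the training function
# so it only touches the `data` that arrives via tune.with_parameters.
def train(config, data):
    def lookup(i):
        return data[i]
    print(lookup(0))

tune.run(tune.with_parameters(train, data=big_table))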
Hi, I also had the same error, and I wonder: is it possible to use tune.with_parameters just because I had problems with the dataset size, not necessarily because I want to tune hyperparameters?
As a quick PoC (if it works with a large dataset, it should also work with a smaller one using tune.with_parameters), I tried to apply it to the Fashion-MNIST dataset. Based on this, I modified the code (truncated a bit):
import argparse
from typing import Dict

import torch
from torch import nn
from torch.utils.data import DataLoader
from torchvision import datasets
from torchvision.transforms import ToTensor

import ray.train as train
from ray.air import session
from ray.air.config import ScalingConfig
from ray.train.torch import TorchTrainer
from ray import tune
from ray.tune import Tuner

# Download training data from open datasets.
training_data = datasets.FashionMNIST(
    root="~/data",
    train=True,
    download=True,
    transform=ToTensor(),
)
testing_data = datasets.FashionMNIST(
    root="~/data",
    train=False,
    download=True,
    transform=ToTensor(),
)  # exactly same as docs

# Define model
class NeuralNetwork(nn.Module):
    ...  # exactly same as docs

def train_epoch(dataloader, model, loss_fn, optimizer):
    ...  # exactly same as docs

def validate_epoch(dataloader, model, loss_fn):
    ...  # exactly same as docs

def train_func_tune(config, train_data, test_data):
    batch_size = config["batch_size"]
    lr = config["lr"]
    epochs = config["epochs"]
    worker_batch_size = batch_size // session.get_world_size()

    # Create data loaders.
    train_dataloader = DataLoader(train_data, batch_size=worker_batch_size)
    test_dataloader = DataLoader(test_data, batch_size=worker_batch_size)
    train_dataloader = train.torch.prepare_data_loader(train_dataloader)
    test_dataloader = train.torch.prepare_data_loader(test_dataloader)

    # Create model.
    model = NeuralNetwork()
    model = train.torch.prepare_model(model)

    loss_fn = nn.CrossEntropyLoss()
    optimizer = torch.optim.SGD(model.parameters(), lr=lr)

    for _ in range(epochs):
        train_epoch(train_dataloader, model, loss_fn, optimizer)
        loss = validate_epoch(test_dataloader, model, loss_fn)
        session.report(dict(loss=loss))

if __name__ == "__main__":
    tuner = Tuner(
        tune.with_parameters(
            train_func_tune,
            config=config,  # `config` defined elsewhere (not shown); see the error below
            train_data=training_data,
            test_data=testing_data,
        )
    )
    results = tuner.fit()
    results
Then it only says the trials did not complete; it doesn't specify which part is causing it.
Any help is appreciated.
Update: following some suggestions, I cd'd into the ray_results dir and found this error.txt. Why does it complain about my config getting multiple values?
Failure # 1 (occurred at 2023-04-10_10-46-11)
ray::ImplicitFunc.train() (pid=863, ip=10.42.59.67, repr=train_func_tune)
File "/opt/conda/lib/python3.8/site-packages/ray/tune/trainable/trainable.py", line 368, in train
raise skipped from exception_cause(skipped)
File "/opt/conda/lib/python3.8/site-packages/ray/tune/trainable/function_trainable.py", line 337, in entrypoint
return self._trainable_func(
File "/opt/conda/lib/python3.8/site-packages/ray/tune/trainable/function_trainable.py", line 654, in _trainable_func
output = fn()
File "/opt/conda/lib/python3.8/site-packages/ray/tune/trainable/util.py", line 406, in _inner
return inner(config, checkpoint_dir=None)
File "/opt/conda/lib/python3.8/site-packages/ray/tune/trainable/util.py", line 398, in inner
return trainable(config, **fn_kwargs)
TypeError: train_func_tune() got multiple values for argument 'config'
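For what it's worth, the TypeError suggests that config should not be passed through tune.with_parameters, since Tune already supplies the config dict as the trainable's first positional argument. A sketch of a restructured call (the search-space values are illustrative, not from the original post):

from ray import tune
from ray.tune import Tuner

config = {"lr": tune.loguniform(1e-4, 1e-1), "batch_size": 64, "epochs": 2}

tuner = Tuner(
    tune.with_parameters(
        train_func_tune,
        train_data=training_data,
        test_data=testing_data),
    param_space=config,  # Tune injects this into `train_func_tune` as `config`
)
results = tuner.fit()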