I am trying to tune the hyperparameters of my PyTorch model using Optuna, but every time I run the optimizer it fails with the following error.
[W 2024-02-05 17:19:26,007] Trial 2 failed with parameters: {'hidden_state': 64, 'droup_out_prec': 0.18615371906093597, 'num_epochs': 14, 'encoder_lr': 0.021112576066074633, 'decoder_lr': 0.0006833950215216012, 'learning_rate': 1.9257784640609453e-05, 'control_factor_ce': 0.03524950489764759, 'control_factor_kl': 0.13410725114961825, 'batch_size': 256} because of the following error: RuntimeError('one of the variables needed for gradient computation has been modified by an inplace operation: [torch.LongTensor []] is at version 2; expected version 1 instead. Hint: the backtrace further above shows the operation that failed to compute its gradient. The variable in question was changed in there or anywhere later. Good luck!').
Traceback (most recent call last):
File "/local_disk0/.ephemeral_nfs/cluster_libraries/python/lib/python3.10/site-packages/optuna/study/_optimize.py", line 200, in _run_trial
value_or_values = func(trial)
File "/root/.ipykernel/38222/command-4071806974828746-3917727534", line 149, in __call__
output = train_model(model = model_to_train,
File "/Workspace/IMDNA/PWSA0000375_IM_US_FORECASTING/Challenger Model/HCP Forecasting/Kisqali/utils/train_model.py", line 386, in train_model
raise e
File "/Workspace/IMDNA/PWSA0000375_IM_US_FORECASTING/Challenger Model/HCP Forecasting/Kisqali/utils/train_model.py", line 132, in train_model
loss.backward()
File "/local_disk0/.ephemeral_nfs/cluster_libraries/python/lib/python3.10/site-packages/torch/_tensor.py", line 522, in backward
torch.autograd.backward(
File "/local_disk0/.ephemeral_nfs/cluster_libraries/python/lib/python3.10/site-packages/torch/autograd/__init__.py", line 266, in backward
Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass
RuntimeError: one of the variables needed for gradient computation has been modified by an inplace operation: [torch.LongTensor []] is at version 2; expected version 1 instead. Hint: the backtrace further above shows the operation that failed to compute its gradient. The variable in question was changed in there or anywhere later. Good luck!
[W 2024-02-05 17:19:26,018] Trial 2 failed with value None.
backword success: 0
Traceback (most recent call last):
File "/Workspace/IMDNA/PWSA0000375_IM_US_FORECASTING/Challenger Model/HCP Forecasting/Kisqali/utils/train_model.py", line 132, in train_model
loss.backward()
File "/local_disk0/.ephemeral_nfs/cluster_libraries/python/lib/python3.10/site-packages/torch/_tensor.py", line 522, in backward
torch.autograd.backward(
File "/local_disk0/.ephemeral_nfs/cluster_libraries/python/lib/python3.10/site-packages/torch/autograd/__init__.py", line 266, in backward
Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass
RuntimeError: one of the variables needed for gradient computation has been modified by an inplace operation: [torch.LongTensor []] is at version 2; expected version 1 instead. Hint: the backtrace further above shows the operation that failed to compute its gradient. The variable in question was changed in there or anywhere later. Good luck!
The model architecture is:
class EncoderLSTM(nn.Module):
def __init__(self,feature_num, hidden_size_lstm, num_layers_lstm,bias,has_channel,bidirectional = False):
super(EncoderLSTM, self).__init__()
self.hidden_size_lstm = hidden_size_lstm
self.num_layers_lstm = num_layers_lstm
self.feature_num = feature_num
self.bias = bias
self.has_channel = has_channel
self.bidirectional = bidirectional
self.lstm1 = nn.LSTM(input_size = self.feature_num, hidden_size = self.hidden_size_lstm, num_layers = self.num_layers_lstm, batch_first=True,bidirectional = False,bias=self.bias)
self.fc_encoder = nn.Linear(self.hidden_size_lstm, self.hidden_size_lstm)
self.fc_encoder.time_distributed = True
def forward(self, x):
if self.has_channel:
x=x.view(x.size(0),x.size(2),x.size(3))
h0 = torch.zeros(self.num_layers_lstm, x.size(0), self.hidden_size_lstm).to(device=x.device) # Hidden state
c0 = torch.zeros(self.num_layers_lstm, x.size(0), self.hidden_size_lstm).to(device=x.device) # Cell state
out, (hn, cn) = self.lstm1(x, (h0, c0))
out = self.fc_encoder(out)
return out, (hn, cn)
class DecoderLSTM(nn.Module):
def __init__(self,feature_num, hidden_size_lstm, num_layers_lstm,bias,output_size,droup_out_prec = 0.2):
super(DecoderLSTM, self).__init__()
self.hidden_size_lstm = hidden_size_lstm
self.num_layers_lstm = num_layers_lstm
self.feature_num = feature_num
self.bias = bias
self.output_size = output_size
self.droup_out_prec = droup_out_prec
self.decoder_net = nn.LSTM(input_size = self.hidden_size_lstm, hidden_size = self.hidden_size_lstm, num_layers = self.num_layers_lstm, batch_first=True,bidirectional = False,bias=self.bias)
self.fc_decoder_1 = nn.Linear(self.hidden_size_lstm, int(self.hidden_size_lstm/2))
self.fc_decoder_1.time_distributed = True
self.fc_decoder_2 = nn.Linear(int(self.hidden_size_lstm/2), self.output_size)
self.fc_decoder_2.time_distributed = True
self.relu_decoder_1= nn.ReLU(inplace=False)
self.dropout_decoder_1 = nn.Dropout(self.droup_out_prec)
self.relu_decoder_2= nn.ReLU(inplace=False)
def forward(self, out, hn,cn,MAX_TIMESTEP = 4,target_tensor = None,return_state = False):
out_decoder_list = []
for time in range(MAX_TIMESTEP):
# print(time)
# print("encoder after fc")
# print(output.size())
out, (hn, cn) = self.decoder_net(out, (hn, cn))
# print("Seq2Seq_decoder")
# print(output.size())
out_reg = torch.squeeze(hn, 0).clone()
out_reg = self.fc_decoder_1(out_reg)
out_reg = self.relu_decoder_1(out_reg)
out_reg = self.dropout_decoder_1(out_reg)
out_reg = self.fc_decoder_2(out_reg)
out_decoder_list.append(out_reg)
if target_tensor is not None:
output = target_tensor[time]
out_decoder_list = torch.cat(out_decoder_list, dim=1)
if return_state:
return out_decoder_list,(hn, cn)
else:
return out_decoder_list
class seq2seqModel_indipendent(nn.Module):
def __init__(self,encoder = None,decoder = None, training = True):
super(seq2seqModel_indipendent, self).__init__()
self.encoder = encoder
self.decoder = decoder
self.training = training
def __call__(self,x,MAX_TIMESTEP = 4,return_state = False):
if self.training:
self.encoder.train()
self.decoder.train()
else:
self.encoder.eval()
self.decoder.eval()
# if self.encoder:
out, (hn, cn) = self.encoder(x)
out = self.decoder(out = out, hn = hn,cn = cn,MAX_TIMESTEP = MAX_TIMESTEP,return_state = return_state)
return out
def train(self):
self.training = True
def eval(self):
self.training = False
def state_dict(self):
return (self.encoder.state_dict(),self.decoder.state_dict())
And the training loop looks like this:
for epoch in range(num_epochs):
count = 0
# print('epoch:',epoch)
if verbos == 3 or verbos == 2:
print('*'*100)
print(f'Running epoch: {epoch}')
if train:
if not model.training:
model.train()
temp_train = []
for i, (inputs, labels_reg) in enumerate(dataloader_train):
inputs = inputs.to(device)
labels_reg = labels_reg.to(device).reshape(-1,4)
# optimizer.zero_grad()
out_reg = model(inputs)
loss = criterion_train(out_reg, labels_reg)
# loss.backward(retain_graph=True)
# print('iter:',i)
loss.backward()
if (i - count) > 0:
print('iter: ',i)
print('epoch:',epoch)
count += 1
if grad_clip is not None:
torch.nn.utils.clip_grad_value_(model.parameters(), grad_clip)
# optimizer.step()
for opt in optimizer:
opt.step()
for opt in optimizer:
opt.zero_grad()
# for opt in optimizer:
# opt.zero_grad()
loss_list_train.append(loss.item())
temp_train.append(loss.item())
if verbos == 3:
print('Epoch [{}/{}], Step [{}/{}], Loss: {:.4f}'
.format(epoch, num_epochs, i, len(dataset_train)//batch_size, loss.item()))
loss_list_train_epoch.append(sum(temp_train)/len(temp_train))
if scheduler is not None and verbos == 3:
for schedule,opt in zip(scheduler,optimizer):
before_lr = opt.param_groups[0]["lr"]
schedule.step()
after_lr = opt.param_groups[0]["lr"]
print("Epoch %d: lr %.6f -> %.6f" % (epoch, before_lr, after_lr))
The loss function used is:
class CE_prob_KL_loss_function(nn.Module):
def __init__(self,main_loss = MeanAbsoluteError(),control_factor_ce = 1e-2,control_factor_kl = 1e-2,check = False):
super().__init__()
self.main_loss = main_loss
self.control_factor_ce = control_factor_ce
self.control_factor_kl = control_factor_kl
self.check = check
def forward(self,predicted, original,MAX_TIMESTEP = 4):
orig_mean = torch.mean(original, dim=1).view(-1,1)
orig_std = torch.std(original, dim=1).view(-1,1)
# Setting stdev to 1 for dist that have less than 1 stdev in orignal
orig_std = torch.where(orig_std <1, 1, orig_std)
# orig_std_temp = orig_std + 2
orig_std_temp = torch.add(orig_std, 2)
p = torch.distributions.Normal(orig_mean,orig_std_temp)
log_loss = 0
for time in range(MAX_TIMESTEP):
# We compute the probablity of predictions in the orignal distribution
# below line prints the log of probablity of predicted[:,time]
# print('log prob of p:',p.log_prob(predicted[:,time]))
pz_mean = torch.exp(p.log_prob(predicted[:,time]))
# below line prints the probablity of predicted[:,time]
# print('pz_mean: ',pz_mean)
# We compute the probablity of actuals in the orignal distribution
qz_mean = torch.exp(p.log_prob(original[:,time]))
# print('pz_mean:', pz_mean)
# print('qz_mean:',qz_mean)
#We multiply -1 to the probablity to the left of mean as distributions are semetric and mean +- stdev will have the same prob. This will help distinguish between these probs
original = torch.where(original[:,time]>=orig_mean, pz_mean, torch.mul(torch.tensor(-1),pz_mean))
predicted = torch.where(predicted[:,time]>=orig_mean, qz_mean, torch.mul(torch.tensor(-1),qz_mean))
# We compute the diffrence in the probablity of predicted and orignal in the distribution defined by orignal
mk = torch.abs(pz_mean-qz_mean)
# We cut off the values to 1 as we are dealing with probablities
mk = torch.where(mk>1, 1, mk)
#This is where we compute cross entropy.This step means across all batches for a timestep
log_loss += torch.mul( torch.mean(torch.log(1-mk)), -1)
loss_acc = torch.mean(log_loss)
if self.check:
print("loss_acc: ",loss_acc)
var_lable = torch.var(original, dim= 1).mean()
var_pred = torch.var(predicted, dim= 1).mean()
mean_lable = torch.mean(original, dim=1).mean()
mean_pred = torch.mean(predicted, dim=1 ).mean()
# Introduced to avoide division by 0 when var_pred is 0
var_pred = torch.add(var_pred,1e-8)
# used earlier
kl_temp = (torch.pow(mean_lable - mean_pred, 2) / var_pred + var_lable / var_pred - 1.0 - torch.log(var_lable) + torch.log(var_pred))#.mean()
# kl_temp = torch.add(torch.sub(torch.sub(torch.add(torch.div(torch.pow(torch.sub(mean_lable,mean_pred), 2),var_pred) , torch.div(var_lable,var_pred)),1),torch.log(var_lable)),torch.log(var_pred))
# print("kl_temp not defined")
# self.kl_loss( input = torch.log())
# KL_loss = 0.5 * torch.sum(kl_temp)
KL_loss = torch.mul(torch.sum(kl_temp), 0.5)
if self.main_loss is None:
return loss_acc
else:
if self.check:
print("ce_loss: ",(self.control_factor_ce*loss_acc)) #
print("kl_loss: ",(self.control_factor_kl*KL_loss) ) #
# print(predicted[0].size())
if self.check:
print("main_loss: ",self.main_loss(predicted, original))
return self.main_loss(predicted, original) + torch.mul(self.control_factor_ce,loss_acc) + torch.mul(self.control_factor_kl,KL_loss)
Update
Replace

log_loss += torch.mul(...)

with

log_loss = log_loss + torch.mul(...)

They compute the same thing, but in different ways. The original line uses the in-place operator +=, which interferes with autograd's gradient tracking and causes torch to raise the error. The modified line builds a new tensor instead of mutating the existing one, so it avoids that problem.
The error is saying that a tensor needed for gradient computation is being modified incorrectly somewhere. The error condition occurs in train_model() when you call loss.backward(), so check how you compute loss and avoid in-place operations on the tensors involved; based on the information in the message, that may resolve the problem. It would also help to see more code, as the part that is actually erroring does not seem to be included in your original post.