move optimizer state from gpu to cpu, the training The loss value first decreases and then increases

28 views Asked by At

I have 8 nvidia gpu, each 80G, I am now train the 70B llama model, but cannot load model and its optimizer states into the gpu, so I move the optimizer states from gpu to cpu, only move the needed states to gpu when update the parameters, like below:

@torch.no_grad()
def step(self, closure: Callable = None):
    """
    Performs a single optimization step.

    Arguments:
        closure (`Callable`, *optional*): A closure that reevaluates the model and returns the loss.
    """
    loss = None
    if closure is not None:
        loss = closure()

    for group in self.param_groups:
        for p in group["params"]:
            if p.grad is None:
                continue
            grad = p.grad
            if grad.is_sparse:
                raise RuntimeError("Adam does not support sparse gradients, please consider SparseAdam instead")

            state = self.state[p]

            # State initialization
            if len(state) == 0:
                state["step"] = 0
                # Exponential moving average of gradient values
                state["exp_avg"] = torch.zeros_like(p)
                # Exponential moving average of squared gradient values
                state["exp_avg_sq"] = torch.zeros_like(p)

            exp_avg, exp_avg_sq = state["exp_avg"], state["exp_avg_sq"]
            #modified1: only move the states corresponde to the updating parameters to gpu
            if exp_avg.device != p.device:
                exp_avg = exp_avg.to(p.device)
                exp_avg = exp_avg.to(p.dtype)
            if exp_avg_sq.device != p.device:
                exp_avg_sq = exp_avg_sq.to(p.device)
                exp_avg_sq = exp_avg_sq.to(p.dtype)
            beta1, beta2 = group["betas"]

            state["step"] += 1

            # Decay the first and second moment running average coefficient
            # In-place operations to update the averages at the same time
            exp_avg.mul_(beta1).add_(grad, alpha=(1.0 - beta1))
            exp_avg_sq.mul_(beta2).addcmul_(grad, grad, value=1.0 - beta2)
            denom = exp_avg_sq.sqrt().add_(group["eps"])

            step_size = group["lr"]
            if group["correct_bias"]:  # No bias correction for Bert
                bias_correction1 = 1.0 - beta1 ** state["step"]
                bias_correction2 = 1.0 - beta2 ** state["step"]
                step_size = step_size * math.sqrt(bias_correction2) / bias_correction1

            p.addcdiv_(exp_avg, denom, value=-step_size)
            if group["weight_decay"] > 0.0:
                p.add_(p, alpha=(-group["lr"] * group["weight_decay"]))
            #modified2: when updated, the optimizer state will move to cpu to free gpu
            state["exp_avg"] = exp_avg.to('cpu')
            state["exp_avg_sq"] = exp_avg_sq.to('cpu')
    return loss

I have marked the modified lines above as #modified:, the logic is simple, bug the training loss was confusing me:

{'loss': 1.4473, 'learning_rate': 1.9993007047883988e-05, 'epoch': 0.05}
{'loss': 1.3078, 'learning_rate': 1.9972037971811802e-05, 'epoch': 0.09}
{'loss': 1.2186, 'learning_rate': 1.9937122098932428e-05, 'epoch': 0.14}
{'loss': 0.9871, 'learning_rate': 1.9888308262251286e-05, 'epoch': 0.19}
{'loss': 0.9528, 'learning_rate': 1.9825664732332886e-05, 'epoch': 0.23}
{'loss': 0.8264, 'learning_rate': 1.9749279121818235e-05, 'epoch': 0.28}
{'loss': 0.8139, 'learning_rate': 1.9659258262890683e-05, 'epoch': 0.33}
{'loss': 0.802, 'learning_rate': 1.955572805786141e-05, 'epoch': 0.38}
{'loss': 0.7674, 'learning_rate': 1.9438833303083677e-05, 'epoch': 0.42}
{'loss': 0.8438, 'learning_rate': 1.9308737486442045e-05, 'epoch': 0.47}
{'loss': 0.8202, 'learning_rate': 1.9165622558699763e-05, 'epoch': 0.52}
{'loss': 0.8353, 'learning_rate': 1.900968867902419e-05, 'epoch': 0.56}
{'loss': 0.8675, 'learning_rate': 1.8841153935046098e-05, 'epoch': 0.61}
{'loss': 0.8664, 'learning_rate': 1.866025403784439e-05, 'epoch': 0.66}
{'loss': 0.9016, 'learning_rate': 1.8467241992282842e-05, 'epoch': 0.7}
{'loss': 0.8926, 'learning_rate': 1.826238774315995e-05, 'epoch': 0.75}
{'loss': 0.7756, 'learning_rate': 1.8045977797666685e-05, 'epoch': 0.8}
{'loss': 0.6959, 'learning_rate': 1.78183148246803e-05, 'epoch': 0.84}
{'loss': 1.2584, 'learning_rate': 1.757971723145453e-05, 'epoch': 0.89}

the loss is shake a lot, the optimizer I modified is transformers.AdamW, can some one tell me the reason why the loss is so unstable? Is my modification wrong somewhere?

0

There are 0 answers