From: François Fleuret
Date: Tue, 28 Mar 2023 16:03:34 +0000 (+0200)
Subject: Update
X-Git-Url: https://ant.fleuret.org/cgi-bin/gitweb/gitweb.cgi?a=commitdiff_plain;h=49bfa9f885bb100f0a262dcfd4ce7d10f75319d0;p=beaver.git

Update
---

Review notes (free text; ignored by patch tools, not part of the commit message):

* Layout: this copy of the patch had been collapsed onto a few long lines.
  The one-record-per-line layout below was rebuilt so that every hunk
  matches the line counts in its "@@" header. Leading indentation and the
  position of blank lines were reconstructed and should be confirmed
  against the repository before the patch is applied.

* Fix: LearningRateScheduler.set_state iterated over "state.item()".
  get_state returns vars(self), a dict, and dict has no "item" method, so
  the call raised AttributeError. It now reads "state.items()". Because of
  this one-line change the resulting file no longer corresponds to the
  post-image blob f395d22 named on the "index" line.

* NOTE(review): StepWiseScheduler does not override update(), so
  nb_finished_epochs is only ever assigned 0 and learning_rate() returns
  schedule[0] for every epoch in this patch.

* NOTE(review): when args.learning_rate_schedule == "auto" the branch is
  "pass" and learning_rate_scheduler is never assigned in this patch,
  while learning_rate_scheduler.reset() is called unconditionally later.

* NOTE(review): the main loop calls reset() (nb_finished_epochs = 0) and
  then iterates from nb_epochs_finished; confirm this is intended when
  nb_epochs_finished is not 0.

diff --git a/beaver.py b/beaver.py
index 074e137..f395d22 100755
--- a/beaver.py
+++ b/beaver.py
@@ -233,7 +233,7 @@ def oneshot_trace_loss(mazes, output, policies, height, width):
     return (output - targets).abs().sum() / masks.sum()
 
 
-def oneshot(gpt, task):
+def oneshot(gpt, learning_rate_scheduler, task):
     t = gpt.training
     gpt.eval()
 
@@ -261,8 +261,10 @@ def oneshot(gpt, task):
         nn.Linear(args.dim_model, dim_out),
     ).to(device)
 
+    learning_rate_scheduler.reset()
+
     for n_epoch in range(args.nb_epochs):
-        learning_rate = learning_rate_schedule[n_epoch]
+        learning_rate = learning_rate_scheduler.learning_rate()
         optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)
 
         acc_train_loss, nb_train_samples = 0, 0
@@ -280,6 +282,8 @@ def oneshot(gpt, task):
             loss.backward()
             optimizer.step()
 
+        learning_rate_scheduler.update(n_epoch + 1, acc_train_loss)
+
         acc_test_loss, nb_test_samples = 0, 0
         for mazes, policies in task.policy_batches(split="test"):
             output_gpt = eval_mygpt(
@@ -337,6 +341,39 @@ def oneshot(gpt, task):
 ######################################################################
 
 
+class LearningRateScheduler:
+    def learning_rate(self):
+        pass
+
+    def update(self, nb_finished_epochs, loss):
+        pass
+
+    def reset(self):
+        pass
+
+    def get_state(self):
+        return vars(self)
+
+    def set_state(self, state):
+        for k, v in state.items():
+            setattr(self, k, v)
+
+
+class StepWiseScheduler(LearningRateScheduler):
+    def __init__(self, schedule):
+        self.nb_finished_epochs = 0
+        self.schedule = schedule
+
+    def learning_rate(self):
+        return self.schedule[self.nb_finished_epochs]
+
+    def reset(self):
+        self.nb_finished_epochs = 0
+
+
+######################################################################
+
+
 class Task:
     def batches(self, split="train", nb_to_use=-1, desc=None):
         pass
@@ -551,6 +588,36 @@ log_string(f"nb_parameters {nb_parameters} ({int(nb_parameters/1e6)}M)")
 
 ######################################################################
 
+if args.learning_rate_schedule == "auto":
+    pass
+
+elif args.learning_rate_schedule == "cos":
+    schedule = {}
+    for n_epoch in range(args.nb_epochs):
+        u = n_epoch / args.nb_epochs * math.pi
+        schedule[n_epoch] = args.learning_rate * 0.5 * (1 + math.cos(u))
+    learning_rate_scheduler = StepWiseScheduler(schedule)
+    log_string(f"learning_rate_schedule {schedule}")
+
+else:
+    u = {
+        int(k): float(v)
+        for k, v in [
+            tuple(x.split(":")) for x in args.learning_rate_schedule.split(",")
+        ]
+    }
+
+    schedule = {}
+    learning_rate = args.learning_rate
+    for n_epoch in range(args.nb_epochs):
+        if n_epoch in u:
+            learning_rate = u[n_epoch]
+        schedule[n_epoch] = learning_rate
+    learning_rate_scheduler = StepWiseScheduler(schedule)
+    log_string(f"learning_rate_schedule {schedule}")
+
+######################################################################
+
 nb_epochs_finished = 0
 
 if args.no_checkpoint:
@@ -586,30 +653,6 @@ train_set_perplexity = math.exp(entropy)
 
 ##############################
 
-if args.learning_rate_schedule == "cos":
-    learning_rate_schedule = {}
-    for n_epoch in range(args.nb_epochs):
-        u = n_epoch / args.nb_epochs * math.pi
-        learning_rate_schedule[n_epoch] = args.learning_rate * 0.5 * (1 + math.cos(u))
-else:
-    u = {
-        int(k): float(v)
-        for k, v in [
-            tuple(x.split(":")) for x in args.learning_rate_schedule.split(",")
-        ]
-    }
-
-    learning_rate_schedule = {}
-    learning_rate = args.learning_rate
-    for n_epoch in range(args.nb_epochs):
-        if n_epoch in u:
-            learning_rate = u[n_epoch]
-        learning_rate_schedule[n_epoch] = learning_rate
-
-log_string(f"learning_rate_schedule {learning_rate_schedule}")
-
-##############################
-
 if nb_epochs_finished >= args.nb_epochs:
     n_epoch = nb_epochs_finished
     train_perplexity = compute_perplexity(
@@ -627,8 +670,10 @@ if nb_epochs_finished >= args.nb_epochs:
 
 ##############################
 
+learning_rate_scheduler.reset()
+
 for n_epoch in range(nb_epochs_finished, args.nb_epochs):
-    learning_rate = learning_rate_schedule[n_epoch]
+    learning_rate = learning_rate_scheduler.learning_rate()
 
     log_string(f"learning_rate {learning_rate}")
 
@@ -660,6 +705,8 @@ for n_epoch in range(nb_epochs_finished, args.nb_epochs):
         loss.backward()
         optimizer.step()
 
+    learning_rate_scheduler.update(n_epoch + 1, acc_train_loss)
+
     train_perplexity = math.exp(min(100, acc_train_loss / nb_train_samples))
     test_perplexity = compute_perplexity(
         model, task, prompt_len=task.height * task.width, split="test"
@@ -687,6 +734,6 @@ for n_epoch in range(nb_epochs_finished, args.nb_epochs):
 ######################################################################
 
 if args.oneshot:
-    oneshot(model, task)
+    oneshot(model, learning_rate_scheduler, task)
 
 ######################################################################