Update.

author François Fleuret <francois@fleuret.org>

Sat, 13 Jan 2024 18:39:23 +0000 (19:39 +0100)

committer François Fleuret <francois@fleuret.org>

Sat, 13 Jan 2024 18:39:23 +0000 (19:39 +0100)
author François Fleuret <francois@fleuret.org>
Sat, 13 Jan 2024 18:39:23 +0000 (19:39 +0100)
committer François Fleuret <francois@fleuret.org>
Sat, 13 Jan 2024 18:39:23 +0000 (19:39 +0100)
diff --git a/fridge b/fridge

index dcaac19..194c4e6 100644 (file)
--- a/fridge
+++ b/fridge
@@ -166,3 +166,14 @@ def insert_flash_back(rec_V, V, rec_K, K, t0, t1, CL, proba):
                  + (1 - mask) * self.rec_K[:, :, t0:t1]
              )
  
+
+######################################################################
+
+2024 Jan 13 13:38:31 (from mygpt.py)
+
+        g= F.sigmoid(self.b_G)
+        a=1-g
+
+        print(f"\n\nSANITY {a**T}\n")
+        exit(0)
+
diff --git a/main.py b/main.py

index 969b47f..c22ae57 100755 (executable)
--- a/main.py
+++ b/main.py
@@ -202,9 +202,11 @@ parser.add_argument("--mixing_deterministic_start", action="store_true", default
  
  ######################################################################
  
-args = parser.parse_args()
+# args = parser.parse_args()
  
-assert args.picocvlr_prune_properties in {"none", "train+eval", "eval"}
+args, sup_args = parser.parse_known_args()
+
+sup_args = dict([x.removeprefix("--").split("=") for x in sup_args])
  
  if args.result_dir is None:
      args.result_dir = f"results_{args.task}_{args.model}"
@@ -432,6 +434,8 @@ except FileExistsError:
          print(f"result directory {args.result_dir} already exists")
          exit(1)
  
+loss_file = open(os.path.join(args.result_dir, "loss.dat"), "a")
+
  log_file = open(os.path.join(args.result_dir, args.log_filename), "a")
  
  if args.seed >= 0:
@@ -461,13 +465,16 @@ with os.popen("sha256sum *.py") as f:
          log_string(f"sha256sum {l.strip()}")
  
  now = time.strftime("%Y%m%d-%H%M%S", time.localtime())
-os.system(f"tar zcvf {args.result_dir}/src-{now}.tgz *.py *.sh")
+os.system(f"tar --ignore-failed-read zcvf {args.result_dir}/src-{now}.tgz *.py *.sh")
  
  log_string(f"argv {' '.join(sys.argv)}")
  
  for n in vars(args):
      log_string(f"args.{n} {getattr(args, n)}")
  
+for n in vars(sup_args):
+    log_string(f"sup_args.{n} {getattr(sup_args, n)}")
+
  
  ######################################################################
  
@@ -505,6 +512,9 @@ def get_lr(n_epoch, it):
  ######################################################################
  
  
+assert args.picocvlr_prune_properties in {"none", "train+eval", "eval"}
+
+
  def picoclvr_pruner_horizontal_green(p):
      return not ("green" in p and ("left" in p or "right" in p))
  
@@ -730,6 +740,8 @@ model = mygpt.MyGPT(
      causal=True,
      dropout=args.dropout,
      attention_layer=args.attention,
+    logger=log_string,
+    **sup_args,
  )
  
  model.to(device)
@@ -838,6 +850,8 @@ time_pred_result = datetime.datetime.now()
  
  it = 0
  
+n_batch = 0
+
  for n_epoch in range(nb_epochs_finished, nb_epochs):
      if args.optim == "sgd":
          optimizer = torch.optim.SGD(model.parameters(), lr=args.learning_rate)
@@ -879,6 +893,12 @@ for n_epoch in range(nb_epochs_finished, nb_epochs):
          total_loss.backward()
          optimizer.step()
  
+        grad_norm = sum([p.grad.pow(2).sum() for p in model.parameters()]).sqrt()
+
+        loss_file.write(f"{n_epoch} {n_batch} {loss.item()} {grad_norm.item()}\n")
+
+        n_batch += 1
+
      with torch.autograd.no_grad():
          model.eval()
  
diff --git a/mygpt.py b/mygpt.py

index a62cf49..7c9991f 100755 (executable)
--- a/mygpt.py
+++ b/mygpt.py
@@ -190,6 +190,8 @@ class DumbRec(nn.Module):
          nb_lines,
          attention_dropout=0.0,
          len_max=1e5,
+        logger=print,
+        **kwargs,
      ):
          super().__init__()
  
@@ -319,6 +321,8 @@ class KVRec(nn.Module):
          nb_lines,
          attention_dropout=0.0,
          len_max=1e5,
+        logger=print,
+        **kwargs,
      ):
          super().__init__()
  
@@ -471,6 +475,8 @@ class Caterpillar(nn.Module):
          caterpillar_height,
          attention_dropout=0.0,
          len_max=1e5,
+        logger=print,
+        **kwargs,
      ):
          super().__init__()
  
@@ -487,12 +493,14 @@ class Caterpillar(nn.Module):
  
          self.proba_gate_dropout = 0.0
  
+        default_b_G = kwargs.get("default_b_G")
+        if default_b_G is None:
+            default_b_G = -math.log(caterpillar_height - 1)
+
+        logger(f"default_b_G {default_b_G}")
+
          self.w_G = randw(nb_heads, caterpillar_height, dim_model)
-        self.b_G = nn.Parameter(
-            torch.full(
-                (nb_heads, caterpillar_height), -math.log(caterpillar_height - 1)
-            )
-        )
+        self.b_G = nn.Parameter(torch.full((nb_heads, caterpillar_height), default_b_G))
  
          self.w_K = randw(nb_heads, dim_qk, dim_model)
          self.w_V = randw(nb_heads, dim_v, dim_model)
@@ -565,15 +573,20 @@ class Caterpillar(nn.Module):
              torch.einsum("ntc,hrc->nhrt", X, self.w_G) + self.b_G[None, :, :, None]
          ).sigmoid()
  
+        # Clip the gating to avoid values greater than 1 when several
+        # heads hit the same row
+
+        G = G / G.sum(1, keepdim=True).clamp(min=1)
+
          ######################################################################
          # Roll the gating indexes
  
-        warnings.warn("rotating barrel", RuntimeWarning)
+        # warnings.warn("rotating barrel", RuntimeWarning)
  
-        r_barrel = torch.arange(R, device=G.device)[None, None, :, None]
-        t_barrel = torch.arange(t1 - t0, device=G.device)[None, None, None, :]
-        r_barrel = (r_barrel + (t_barrel + t0) // L) % R
-        G = G.gather(dim=2, index=r_barrel.expand_as(G))
+        # r_barrel = torch.arange(R, device=G.device)[None, None, :, None]
+        # t_barrel = torch.arange(t1 - t0, device=G.device)[None, None, None, :]
+        # r_barrel = (r_barrel + (t_barrel + t0) // L) % R
+        # G = G.gather(dim=2, index=r_barrel.expand_as(G))
  
          ######################################################################
          # The "flashbacks"
@@ -611,11 +624,6 @@ class Caterpillar(nn.Module):
  
          # We prepare the arguments for the parallel scan
  
-        # Clip the gating to avoid values greater than 1 when several
-        # heads hit the same row
-
-        G = G / G.sum(1, keepdim=True).clamp(min=1)
-
          A = 1 - G.sum(1)
  
          # warnings.warn("harmonic recurrence", RuntimeWarning)
@@ -709,6 +717,8 @@ class QKVAttention(nn.Module):
          nb_heads=1,
          causal=False,
          attention_dropout=0.0,
+        logger=print,
+        **kwargs,
      ):
          super().__init__()
  
@@ -800,6 +810,8 @@ class MyGPT(nn.Module):
          dropout=0.0,
          len_max=1e5,
          attention_layer="kvrec",
+        logger=print,
+        **kwargs,
      ):
          super().__init__()
  
@@ -836,6 +848,8 @@ class MyGPT(nn.Module):
                      nb_heads=nb_heads,
                      causal=causal,
                      attention_dropout=dropout,
+                    logger=logger,
+                    **kwargs,
                  )
              elif attention_layer == "dumbrec":
                  return DumbRec(
@@ -845,6 +859,8 @@ class MyGPT(nn.Module):
                      nb_heads=nb_heads,
                      nb_lines=nb_lines,
                      attention_dropout=dropout,
+                    logger=logger,
+                    **kwargs,
                  )
              elif attention_layer == "kvrec":
                  return KVRec(
@@ -854,6 +870,8 @@ class MyGPT(nn.Module):
                      nb_heads=nb_heads,
                      nb_lines=nb_lines,
                      attention_dropout=dropout,
+                    logger=logger,
+                    **kwargs,
                  )
              elif attention_layer == "caterpillar":
                  return Caterpillar(
@@ -864,6 +882,8 @@ class MyGPT(nn.Module):
                      caterpillar_length=self.caterpillar_length,
                      caterpillar_height=self.caterpillar_height,
                      attention_dropout=dropout,
+                    logger=logger,
+                    **kwargs,
                  )
              else:
                  raise ValueError(f"Unknown attention type {attention_layer}.")
author	François Fleuret <francois@fleuret.org>
	Sat, 13 Jan 2024 18:39:23 +0000 (19:39 +0100)
committer	François Fleuret <francois@fleuret.org>
	Sat, 13 Jan 2024 18:39:23 +0000 (19:39 +0100)
fridge		patch \| blob \| history
main.py		patch \| blob \| history
mygpt.py		patch \| blob \| history