From b4f682582c00be65a444b17f4710f3f4cf124e83 Mon Sep 17 00:00:00 2001
From: Sloane Simmons <sloanes.k@gmail.com>
Date: Tue, 22 Oct 2019 01:49:27 -0500
Subject: [PATCH 1/8] Start CRF layer work

Start adding a new CRF layer for sequence tagging / prediction. Viterbi
decoding and a negative log likelihood (NLL) loss value are planned.
---
 src/nn/layers/crf.nim               | 141 ++++++++++++++++++++++++++++
 src/nn/nn.nim                       |   4 +-
 src/nn_primitives/nn_primitives.nim |   6 +-
 src/nn_primitives/nnp_crf.nim       |  28 ++++++
 4 files changed, 175 insertions(+), 4 deletions(-)
 create mode 100644 src/nn/layers/crf.nim
 create mode 100644 src/nn_primitives/nnp_crf.nim

diff --git a/src/nn/layers/crf.nim b/src/nn/layers/crf.nim
new file mode 100644
index 000000000..8b68f794e
--- /dev/null
+++ b/src/nn/layers/crf.nim
@@ -0,0 +1,141 @@
+# Copyright 2017 the Arraymancer contributors
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+import strformat
+import options
+
+import  ../../tensor/tensor,
+        ../../nn_primitives/nn_primitives,
+        ../../autograd/autograd
+
+
+type Idx* = SomeInteger or SomeOrdinal
+
+type CRFGate* [TT; Idx] {.final.} = ref object of Gate[TT]
+  ## CRF (Linear) Gate for sequence prediction.
+  transitions: Variable[TT]
+  num_tags: Idx
+
+  # Special values for 
+  bos_tag: Idx
+  eos_tag: Idx
+
+proc crf_forward[TT, Idx](
+  result: var Variable[TT],
+  input: Variable[TT],
+  mask: Variable[TT],
+  transitions: Variable[TT],
+  tags: Tensor[Idx],
+  num_tags: int,
+  reduce = false
+) =
+  ## Compute the negative log likelihood for each input sequence. 
+  ## If `reduce` is true, return 
+  var gate: CRFGate[TT, Idx]
+  new gate
+
+  gate.transitions = transitions
+  gate.num_tags = num_tags
+
+  gate.bos_tag = Idx(num_tags)
+  gate.eos_tag = Idx(num_tags + 1)
+
+  let 
+    timesteps = input.value.shape[0]
+    batch_size = input.value.shape[1]
+    hidden_dim = input.value.shape[2]
+
+  #[crf_forward(
+    input.value,
+
+  )]#
+
+proc crf_viterbi*[TT]() = discard
+
+proc crf*[TT](
+  input: Variable[TT],
+  mask: Variable[TT],
+  transitions: Variable[TT],
+  tags: Option[Tensor[Idx]],
+  num_tags: int,
+  reduce: bool = false
+): Variable[TT] =
+  ## Input:
+  ##   - An `x` Variable of shape [timesteps, batch_size, hidden_size]
+  ##   - A `mask` Variable of shape [timesteps, batch_size] with is_grad_needed
+  ##     set to 0.
+  ##   - A `transitions` matrix of size (num_tags + 2, num_tags + 2)
+  ##     The extra tags are for BOS / EOS tags.
+  ##   - A `tags` tensor of shape [timesteps, batch_size, num_tags + 2] - only needed if
+  ##     doing training.  If not training, then this can be nil.
+  ## 
+  ## Return:
+  ##   - Negative log likelihood Tensor [batch_size, ]
+  ##   - Logits for tag prediction of shape [batch_size, sequence_length, num_tags]
+  when compileOption("boundChecks"):
+    doAssert input.value.shape.len == 3, fmt"Expected input variable of rank 3, got shape of {input.value.shape}"
+    doAssert mask.value.shape[0..1] == input.value.shape[0..1], fmt"Mask and input shapes do not match:" &
+      fmt"got {mask.value.shape[0..2]} and {input.value.shape[0..2]}"
+    doAssert transitions.value.shape == [num_tags + 2, num_tags + 2], "Expected transitions matrix shape to " &
+      fmt"match ({num_tags+2}, {num_tags+2}), got {transitions.value.shape}"
+  
+  assert mask.requires_grad == false, "Mask should not need a gradient"
+
+  new result
+  result.context = input.context
+
+  let doing_training = input.is_grad_needed() or transitions.is_grad_needed()
+
+  if doing_training:
+    if tags.isNone:
+      raise newException(ValueError, "Tags must be non-nil when training")
+  else:
+    let tags_tensor = tags.get()
+    result.crf_forward(input, mask, transitions, tags_tensor, num_tags)
+
+
+when isMainModule:
+  import unittest
+
+  let ctx = newContext Tensor[float32]
+
+  let (timesteps, batch_size, hidden_dim) = (8, 30, 10)
+
+  let
+    input = ctx.variable(
+      randomTensor[float32](timesteps, batch_size, hidden_dim, max=1.1),
+      requires_grad = true
+    )
+
+    mask = ctx.variable(ones[float32](timesteps, batch_size))
+
+    num_tags: int = 5
+
+    transitions = ctx.variable(
+      (randomTensor(num_tags + 2, num_tags + 2, max=2.0'f32) .- 1.0'f32),
+      requires_grad = false
+    )
+  
+  suite "Basic CRF tests":
+
+    test "When pass in some(Tensor[int]) can call CRF":
+      var tags = option(randomTensor(timesteps, batch_size, max=num_tags))
+      let output = crf(input, mask, transitions, tags, num_tags)
+      assert output.value.shape == [batch_size, ],
+        fmt"Got output shape {output.value.shape}"
+    
+    test "When pass in none(Tensor[int]) get ValueError":
+      expect ValueError:
+        let output2 = crf(input, mask, transitions, none(Tensor[int]), num_tags)
diff --git a/src/nn/nn.nim b/src/nn/nn.nim
index 60c66949c..2a3ff875c 100644
--- a/src/nn/nn.nim
+++ b/src/nn/nn.nim
@@ -13,14 +13,14 @@
 # limitations under the License.
 
 import  ./activation/[sigmoid, relu, tanh],
-        ./layers/[linear, conv2D, maxpool2D, gru, embedding],
+        ./layers/[linear, conv2D, maxpool2D, gru, embedding, crf],
         ./loss/cross_entropy_losses,
         ./loss/mean_square_error_loss,
         ./optimizers/optimizers,
         ./init
 
 export  sigmoid, relu, tanh,
-        linear, conv2D, maxpool2d, gru, embedding,
+        linear, conv2D, maxpool2d, gru, embedding, crf,
         cross_entropy_losses, mean_square_error_loss,
         optimizers,
         init
diff --git a/src/nn_primitives/nn_primitives.nim b/src/nn_primitives/nn_primitives.nim
index 0fc357390..19ea17b6c 100644
--- a/src/nn_primitives/nn_primitives.nim
+++ b/src/nn_primitives/nn_primitives.nim
@@ -21,7 +21,8 @@ import  ./nnp_activation,
         ./nnp_softmax,
         ./nnp_numerical_gradient,
         ./nnp_gru,
-        ./nnp_embedding.nim
+        ./nnp_embedding,
+        ./nnp_crf
 
 export  nnp_activation,
         nnp_convolution,
@@ -32,7 +33,8 @@ export  nnp_activation,
         nnp_softmax,
         nnp_numerical_gradient,
         nnp_gru,
-        nnp_embedding
+        nnp_embedding,
+        nnp_crf
 
 import private/p_nnp_types
 export Size2D
diff --git a/src/nn_primitives/nnp_crf.nim b/src/nn_primitives/nnp_crf.nim
new file mode 100644
index 000000000..422f39583
--- /dev/null
+++ b/src/nn_primitives/nnp_crf.nim
@@ -0,0 +1,28 @@
+# Copyright 2017 the Arraymancer contributors
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import ../tensor/tensor,
+        math
+
+type Idx = SomeInteger or SomeOrdinal
+
+proc crf_forward*[T: SomeFloat](
+  result: var Tensor[T],
+  input: Tensor[T],
+  mask: Tensor[T],
+  transitions: Tensor[T],
+  tags: Tensor[Idx]
+) =
+  ##
+  discard

From 177846d35c8a1f3d8ba714b46a26fe1bf6df24c2 Mon Sep 17 00:00:00 2001
From: Sloane Simmons <sloanes.k@gmail.com>
Date: Tue, 22 Oct 2019 03:02:06 -0500
Subject: [PATCH 2/8] Add initializer function for transitions matrix

The transitions matrix needs some care to create properly, so add an
initializer function that scales a Xavier-uniform matrix by a range value
and disallows Any -> BOS and EOS -> Any transitions.
---
 src/nn/layers/crf.nim         | 41 ++++++++++++++++++++++++++++++-----
 src/nn_primitives/nnp_crf.nim |  2 +-
 2 files changed, 36 insertions(+), 7 deletions(-)

diff --git a/src/nn/layers/crf.nim b/src/nn/layers/crf.nim
index 8b68f794e..ccabb43f5 100644
--- a/src/nn/layers/crf.nim
+++ b/src/nn/layers/crf.nim
@@ -18,10 +18,11 @@ import options
 
 import  ../../tensor/tensor,
         ../../nn_primitives/nn_primitives,
+        ../../nn/init,
         ../../autograd/autograd
 
 
-type Idx* = SomeInteger or SomeOrdinal
+type Idx* = SomeInteger
 
 type CRFGate* [TT; Idx] {.final.} = ref object of Gate[TT]
   ## CRF (Linear) Gate for sequence prediction.
@@ -32,6 +33,30 @@ type CRFGate* [TT; Idx] {.final.} = ref object of Gate[TT]
   bos_tag: Idx
   eos_tag: Idx
 
+
+proc init_transitions_matrix*[T: SomeFloat](num_tags: Idx, range_val: T = T(0.1)): Tensor[T] =
+  ## Create emissions matrix within bounds [-range, range], uniformly
+  ## distributed.  The special transitions from [any, start] and [end, any] are
+  ## set to be an arbitrarily low value to prevent prohibited transitions.
+  ##
+  ## Input:
+  ##   The `num_tags` indicating how many real (non-special) tag values there are.
+  ##   The `range_val` giving the scale to initialize transition values.
+  ##
+  ## Returns 
+  ##   The initialized transitions matrix of shape [num_tags + 2, num_tags + 2]
+
+  # TODO: In future, allow for rules prohibiting / mandating certain transitions.
+  let (bos_tag, eos_tag) = (num_tags, num_tags + 1)  # unpack: BOS index, EOS index
+  result = xavier_uniform(num_tags + 2, num_tags + 2, T) * range_val
+
+  # Scale for a disallowed transition relative to the range value
+  const disallowed_transition_scale = 100_000
+
+  result[_, bos_tag] = disallowed_transition_scale * -1.0 * abs(range_val)
+  result[eos_tag, _] = disallowed_transition_scale * -1.0 * abs(range_val)
+
+
 proc crf_forward[TT, Idx](
   result: var Variable[TT],
   input: Variable[TT],
@@ -56,11 +81,15 @@ proc crf_forward[TT, Idx](
     timesteps = input.value.shape[0]
     batch_size = input.value.shape[1]
     hidden_dim = input.value.shape[2]
-
-  #[crf_forward(
+  
+  crf_forward(
+    result.value,
     input.value,
-
-  )]#
+    mask.value,
+    transitions.value,
+    tags,
+    reduce = reduce
+  )
 
 proc crf_viterbi*[TT]() = discard
 
@@ -81,7 +110,7 @@ proc crf*[TT](
   ##   - A `tags` tensor of shape [timesteps, batch_size, num_tags + 2] - only needed if
   ##     doing training.  If not training, then this can be nil.
   ## 
-  ## Return:
+  ## Returns:
   ##   - Negative log likelihood Tensor [batch_size, ]
   ##   - Logits for tag prediction of shape [batch_size, sequence_length, num_tags]
   when compileOption("boundChecks"):
diff --git a/src/nn_primitives/nnp_crf.nim b/src/nn_primitives/nnp_crf.nim
index 422f39583..ea1a0dfaf 100644
--- a/src/nn_primitives/nnp_crf.nim
+++ b/src/nn_primitives/nnp_crf.nim
@@ -15,7 +15,7 @@
 import ../tensor/tensor,
         math
 
-type Idx = SomeInteger or SomeOrdinal
+type Idx = SomeInteger
 
 proc crf_forward*[T: SomeFloat](
   result: var Tensor[T],

From c0c8ccd354956633fda7264a2f67421b37679d4a Mon Sep 17 00:00:00 2001
From: Sloane Simmons <sloanes.k@gmail.com>
Date: Wed, 23 Oct 2019 10:09:23 -0500
Subject: [PATCH 3/8] Start CRF internal functions

Following other implementations, will do scores + log partition function
for the forward pass (getting NLL).
---
 src/nn_primitives/nnp_crf.nim | 32 ++++++++++++++++++++++++++++++--
 1 file changed, 30 insertions(+), 2 deletions(-)

diff --git a/src/nn_primitives/nnp_crf.nim b/src/nn_primitives/nnp_crf.nim
index ea1a0dfaf..6bbfac47c 100644
--- a/src/nn_primitives/nnp_crf.nim
+++ b/src/nn_primitives/nnp_crf.nim
@@ -15,14 +15,42 @@
 import ../tensor/tensor,
         math
 
+# Needed for the partition function
+from private/p_logsumexp import logsumexp
+
+
 type Idx = SomeInteger
 
+
+proc compute_scores[T](
+  input: Tensor[T],
+  mask: Tensor[T],
+  transitions: Tensor[T],
+  tags: Tensor[Idx]
+): Tensor[T] =
+  ## Computes the un-normalized log probabilities (combination of emissions and
+  ## transition scores at each timestep).
+  ##
+  ## Returns:
+  ##  - A Tensor[T] of non-normalized emission scores of shape [batch_size, ]
+  discard
+
+
+proc compute_log_partition_function[T](): Tensor[T] =
+  ## Compute the partition function by using the forward algorithm to avoid
+  ## explicitly calculating probabilities for all possible sequence
+  ## configurations.
+  discard
+
+
 proc crf_forward*[T: SomeFloat](
   result: var Tensor[T],
   input: Tensor[T],
   mask: Tensor[T],
   transitions: Tensor[T],
-  tags: Tensor[Idx]
+  tags: Tensor[Idx],
+  reduce: bool
 ) =
-  ##
+  ## Computes the log likelihood of input given transitions (emissions) matrix.
+  ## Loss should be *negative* log likelihood.
   discard

From 7f8a272b08914d94891e6d9a317dcdfbb0920fc6 Mon Sep 17 00:00:00 2001
From: Sloane Simmons <sloanes.k@gmail.com>
Date: Wed, 23 Oct 2019 23:52:18 -0500
Subject: [PATCH 4/8] Add index_select with passed-in storage

Writes into the tensor passed in by the caller. If its shape does not match,
it is reshaped when the element count already fits and reallocated otherwise.
This should help when index_select is called repeatedly with selections of
the same size.

An example is selecting batch_size entries at each time step of the CRF
emissions.
---
 src/tensor/shapeshifting.nim | 20 ++++++++++++++++++++
 1 file changed, 20 insertions(+)

diff --git a/src/tensor/shapeshifting.nim b/src/tensor/shapeshifting.nim
index 5b826b25a..36caa68ba 100644
--- a/src/tensor/shapeshifting.nim
+++ b/src/tensor/shapeshifting.nim
@@ -318,3 +318,23 @@ func index_select*[T; Idx: byte or char or SomeNumber](t: Tensor[T], axis: int,
     var r_slice = result.atAxisIndex(axis, i)
     var t_slice = t.atAxisIndex(axis, int(index))
     r_slice.copyFrom(t_slice)
+
+proc index_select*[T; Idx: byte or char or SomeNumber](t: Tensor[T], axis: int, indices: Tensor[Idx], result: var Tensor[T]) =
+  ## Same as the `index_select` function, but use a preallocated tensor for
+  ## output.
+  doAssert indices.shape.len == 1
+
+  var select_shape = t.shape
+  select_shape[axis] = indices.shape[0]
+
+  if (select_shape != result.shape):
+    ## FIXME: Better way of resizing the result when necessary
+    if (select_shape.size == result.shape):
+      result.reshape(select_shape)
+    else:
+      result = newTensorUninit(select_shape)
+  
+  for i, index in enumerate(indices):
+    var r_slice = result.atAxisIndex(axis, i)
+    var t_slice = t.atAxisIndex(axis, int(index))
+    r_slice.copyFrom(t_slice)

From 692c7d35eca63b5ca87d8eb52f8de7a0024779fd Mon Sep 17 00:00:00 2001
From: Sloane Simmons <sloanes.k@gmail.com>
Date: Thu, 24 Oct 2019 00:06:04 -0500
Subject: [PATCH 5/8] Formatting and some API changes

Ran 'nimpretty' to clean up formatting / long lines, and passed more
information to the nnp_crf functions.
---
 src/nn/layers/crf.nim | 82 +++++++++++++++++++++++++------------------
 1 file changed, 47 insertions(+), 35 deletions(-)

diff --git a/src/nn/layers/crf.nim b/src/nn/layers/crf.nim
index ccabb43f5..851e5b1e4 100644
--- a/src/nn/layers/crf.nim
+++ b/src/nn/layers/crf.nim
@@ -16,7 +16,7 @@
 import strformat
 import options
 
-import  ../../tensor/tensor,
+import ../../tensor/tensor,
         ../../nn_primitives/nn_primitives,
         ../../nn/init,
         ../../autograd/autograd
@@ -24,17 +24,20 @@ import  ../../tensor/tensor,
 
 type Idx* = SomeInteger
 
-type CRFGate* [TT; Idx] {.final.} = ref object of Gate[TT]
+type CRFGate*[TT; Idx] {.final.} = ref object of Gate[TT]
   ## CRF (Linear) Gate for sequence prediction.
   transitions: Variable[TT]
   num_tags: Idx
 
-  # Special values for 
+  # Special values for state transitions
   bos_tag: Idx
   eos_tag: Idx
 
+  dims: tuple[timesteps, batch_size, hidden_dim: Idx]
 
-proc init_transitions_matrix*[T: SomeFloat](num_tags: Idx, range_val: T = T(0.1)): Tensor[T] =
+
+proc init_transitions_matrix*[T: SomeFloat](num_tags: Idx; range_val: T = T(
+    0.1)): Tensor[T] =
   ## Create emissions matrix within bounds [-range, range], uniformly
   ## distributed.  The special transitions from [any, start] and [end, any] are
   ## set to be an arbitrarily low value to prevent prohibited transitions.
@@ -43,7 +46,7 @@ proc init_transitions_matrix*[T: SomeFloat](num_tags: Idx, range_val: T = T(0.1)
   ##   The `num_tags` indicating how many real (non-special) tag values there are.
   ##   The `range_val` giving the scale to initialize transition values.
   ##
-  ## Returns 
+  ## Returns
   ##   The initialized transitions matrix of shape [num_tags + 2, num_tags + 2]
 
   # TODO: In future, allow for rules prohibiting / mandating certain transitions.
@@ -58,16 +61,16 @@ proc init_transitions_matrix*[T: SomeFloat](num_tags: Idx, range_val: T = T(0.1)
 
 
 proc crf_forward[TT, Idx](
-  result: var Variable[TT],
-  input: Variable[TT],
-  mask: Variable[TT],
-  transitions: Variable[TT],
-  tags: Tensor[Idx],
-  num_tags: int,
+  result: var Variable[TT];
+  input: Variable[TT];
+  mask: Variable[TT];
+  transitions: Variable[TT];
+  tags: Tensor[Idx];
+  num_tags: int;
   reduce = false
 ) =
-  ## Compute the negative log likelihood for each input sequence. 
-  ## If `reduce` is true, return 
+  ## Compute the negative log likelihood for each input sequence.
+  ## If `reduce` is true, return
   var gate: CRFGate[TT, Idx]
   new gate
 
@@ -77,49 +80,58 @@ proc crf_forward[TT, Idx](
   gate.bos_tag = Idx(num_tags)
   gate.eos_tag = Idx(num_tags + 1)
 
-  let 
+  let
     timesteps = input.value.shape[0]
     batch_size = input.value.shape[1]
     hidden_dim = input.value.shape[2]
-  
+
+  gate.dims = (timesteps: timesteps, batch_size: batch_size,
+               hidden_dim: hidden_dim)
+
   crf_forward(
     result.value,
     input.value,
     mask.value,
     transitions.value,
     tags,
-    reduce = reduce
+    gate.dims.timesteps, gate.dims.batch_size, gate.dims.hidden_dim,
+    gate.bos_tag, gate.eos_tag
   )
 
 proc crf_viterbi*[TT]() = discard
 
 proc crf*[TT](
-  input: Variable[TT],
-  mask: Variable[TT],
-  transitions: Variable[TT],
-  tags: Option[Tensor[Idx]],
-  num_tags: int,
+  input: Variable[TT];
+  mask: Variable[TT];
+  transitions: Variable[TT];
+  tags: Option[Tensor[Idx]];
+  num_tags: int;
   reduce: bool = false
 ): Variable[TT] =
   ## Input:
-  ##   - An `x` Variable of shape [timesteps, batch_size, hidden_size]
+  ##   - An `x` Variable of shape [timesteps, batch_size, num_tags]
   ##   - A `mask` Variable of shape [timesteps, batch_size] with is_grad_needed
   ##     set to 0.
   ##   - A `transitions` matrix of size (num_tags + 2, num_tags + 2)
   ##     The extra tags are for BOS / EOS tags.
-  ##   - A `tags` tensor of shape [timesteps, batch_size, num_tags + 2] - only needed if
+  ##   - A `tags` tensor of shape [timesteps, batch_size] - only needed if
   ##     doing training.  If not training, then this can be nil.
-  ## 
+  ##
   ## Returns:
   ##   - Negative log likelihood Tensor [batch_size, ]
   ##   - Logits for tag prediction of shape [batch_size, sequence_length, num_tags]
   when compileOption("boundChecks"):
-    doAssert input.value.shape.len == 3, fmt"Expected input variable of rank 3, got shape of {input.value.shape}"
-    doAssert mask.value.shape[0..1] == input.value.shape[0..1], fmt"Mask and input shapes do not match:" &
-      fmt"got {mask.value.shape[0..2]} and {input.value.shape[0..2]}"
-    doAssert transitions.value.shape == [num_tags + 2, num_tags + 2], "Expected transitions matrix shape to " &
-      fmt"match ({num_tags+2}, {num_tags+2}), got {transitions.value.shape}"
-  
+    doAssert input.value.shape.len == 3, fmt"Expected input variable of rank 3" &
+      ", got shape of {input.value.shape}"
+    doAssert input.value.shape[2] == num_tags, fmt"Expected input variable to" &
+      " emit {num_tags}, emitted {input.value.shape[2]}"
+    doAssert mask.value.shape[0..1] == input.value.shape[0..1],
+        fmt"Mask and input shapes do not match:" &
+        fmt"got {mask.value.shape[0..2]} and {input.value.shape[0..2]}"
+    doAssert transitions.value.shape == [num_tags + 2, num_tags + 2],
+        "Expected transitions matrix shape to " &
+        fmt"match ({num_tags+2}, {num_tags+2}), got {transitions.value.shape}"
+
   assert mask.requires_grad == false, "Mask should not need a gradient"
 
   new result
@@ -144,7 +156,7 @@ when isMainModule:
 
   let
     input = ctx.variable(
-      randomTensor[float32](timesteps, batch_size, hidden_dim, max=1.1),
+      randomTensor[float32](timesteps, batch_size, hidden_dim, max = 1.1),
       requires_grad = true
     )
 
@@ -153,18 +165,18 @@ when isMainModule:
     num_tags: int = 5
 
     transitions = ctx.variable(
-      (randomTensor(num_tags + 2, num_tags + 2, max=2.0'f32) .- 1.0'f32),
+      (randomTensor(num_tags + 2, num_tags + 2, max = 2.0'f32) .- 1.0'f32),
       requires_grad = false
     )
-  
+
   suite "Basic CRF tests":
 
     test "When pass in some(Tensor[int]) can call CRF":
-      var tags = option(randomTensor(timesteps, batch_size, max=num_tags))
+      var tags = option(randomTensor(timesteps, batch_size, max = num_tags))
       let output = crf(input, mask, transitions, tags, num_tags)
       assert output.value.shape == [batch_size, ],
         fmt"Got output shape {output.value.shape}"
-    
+
     test "When pass in none(Tensor[int]) get ValueError":
       expect ValueError:
         let output2 = crf(input, mask, transitions, none(Tensor[int]), num_tags)

From 1c597a49f6d45f9d1c9e3489ad12bc32d005e210 Mon Sep 17 00:00:00 2001
From: Sloane Simmons <sloanes.k@gmail.com>
Date: Thu, 24 Oct 2019 01:57:17 -0500
Subject: [PATCH 6/8] Bug fixes

---
 src/nn/layers/crf.nim        | 23 ++++++++++++-----------
 src/tensor/shapeshifting.nim |  6 +++---
 2 files changed, 15 insertions(+), 14 deletions(-)

diff --git a/src/nn/layers/crf.nim b/src/nn/layers/crf.nim
index 851e5b1e4..ad9ee8ed1 100644
--- a/src/nn/layers/crf.nim
+++ b/src/nn/layers/crf.nim
@@ -87,7 +87,7 @@ proc crf_forward[TT, Idx](
 
   gate.dims = (timesteps: timesteps, batch_size: batch_size,
                hidden_dim: hidden_dim)
-
+  
   crf_forward(
     result.value,
     input.value,
@@ -122,9 +122,9 @@ proc crf*[TT](
   ##   - Logits for tag prediction of shape [batch_size, sequence_length, num_tags]
   when compileOption("boundChecks"):
     doAssert input.value.shape.len == 3, fmt"Expected input variable of rank 3" &
-      ", got shape of {input.value.shape}"
+      fmt", got shape of {input.value.shape}"
     doAssert input.value.shape[2] == num_tags, fmt"Expected input variable to" &
-      " emit {num_tags}, emitted {input.value.shape[2]}"
+      fmt" emit {num_tags}, emitted {input.value.shape[2]}"
     doAssert mask.value.shape[0..1] == input.value.shape[0..1],
         fmt"Mask and input shapes do not match:" &
         fmt"got {mask.value.shape[0..2]} and {input.value.shape[0..2]}"
@@ -142,9 +142,12 @@ proc crf*[TT](
   if doing_training:
     if tags.isNone:
       raise newException(ValueError, "Tags must be non-nil when training")
+    else:
+      let tags_tensor = tags.get()
+      result.crf_forward(input, mask, transitions, tags_tensor, num_tags)
   else:
-    let tags_tensor = tags.get()
-    result.crf_forward(input, mask, transitions, tags_tensor, num_tags)
+    # TODO: Inference time
+    discard
 
 
 when isMainModule:
@@ -152,27 +155,25 @@ when isMainModule:
 
   let ctx = newContext Tensor[float32]
 
-  let (timesteps, batch_size, hidden_dim) = (8, 30, 10)
+  let (timesteps, batch_size, num_tags) = (8, 30, 10)
 
   let
     input = ctx.variable(
-      randomTensor[float32](timesteps, batch_size, hidden_dim, max = 1.1),
+      randomTensor[float32](timesteps, batch_size, num_tags, max = 1.1),
       requires_grad = true
     )
 
     mask = ctx.variable(ones[float32](timesteps, batch_size))
 
-    num_tags: int = 5
-
     transitions = ctx.variable(
       (randomTensor(num_tags + 2, num_tags + 2, max = 2.0'f32) .- 1.0'f32),
       requires_grad = false
     )
-
+    
   suite "Basic CRF tests":
 
     test "When pass in some(Tensor[int]) can call CRF":
-      var tags = option(randomTensor(timesteps, batch_size, max = num_tags))
+      var tags = option(randomTensor(timesteps, batch_size, max = num_tags - 1))
       let output = crf(input, mask, transitions, tags, num_tags)
       assert output.value.shape == [batch_size, ],
         fmt"Got output shape {output.value.shape}"
diff --git a/src/tensor/shapeshifting.nim b/src/tensor/shapeshifting.nim
index 36caa68ba..5b0ebf2dc 100644
--- a/src/tensor/shapeshifting.nim
+++ b/src/tensor/shapeshifting.nim
@@ -329,10 +329,10 @@ proc index_select*[T; Idx: byte or char or SomeNumber](t: Tensor[T], axis: int,
 
   if (select_shape != result.shape):
     ## FIXME: Better way of resizing the result when necessary
-    if (select_shape.size == result.shape):
-      result.reshape(select_shape)
+    if (select_shape.product() == result.size()):
+      result = result.reshape(select_shape)
     else:
-      result = newTensorUninit(select_shape)
+      result = newTensorUninit[T](select_shape)
   
   for i, index in enumerate(indices):
     var r_slice = result.atAxisIndex(axis, i)

From 1db105e2c0124d72c02c2c075c5b9a5a2503ebb8 Mon Sep 17 00:00:00 2001
From: Sloane Simmons <sloanes.k@gmail.com>
Date: Thu, 24 Oct 2019 02:01:44 -0500
Subject: [PATCH 7/8] Start on crf_forward

The forward pass implementation is underway, starting with the scores
(non-normalized log probabilities with emission + transition components).
---
 src/nn_primitives/nnp_crf.nim | 84 +++++++++++++++++++++++++++++++----
 1 file changed, 75 insertions(+), 9 deletions(-)

diff --git a/src/nn_primitives/nnp_crf.nim b/src/nn_primitives/nnp_crf.nim
index 6bbfac47c..58ee34259 100644
--- a/src/nn_primitives/nnp_crf.nim
+++ b/src/nn_primitives/nnp_crf.nim
@@ -12,6 +12,8 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+import strformat
+
 import ../tensor/tensor,
         math
 
@@ -23,18 +25,80 @@ type Idx = SomeInteger
 
 
 proc compute_scores[T](
-  input: Tensor[T],
-  mask: Tensor[T],
-  transitions: Tensor[T],
-  tags: Tensor[Idx]
-): Tensor[T] =
+  result: var Tensor[T],  # (B, ) - not nil
+  input: Tensor[T],       # (T, B, num_tags)
+  mask: Tensor[T],        # (T, B)
+  transitions: Tensor[T], # (num_tags + 2, num_tags + 2)
+  tags: Tensor[Idx],      # (T, B)
+  timesteps, batch_size, hidden_dim: int,
+  bos_tag, eos_tag: Idx
+) =
   ## Computes the un-normalized log probabilities (combination of emissions and
   ## transition scores at each timestep).
   ##
   ## Returns:
   ##  - A Tensor[T] of non-normalized emission scores of shape [batch_size, ]
-  discard
 
+  echo (timesteps, batch_size, hidden_dim)
+  echo input.shape
+
+  # Transitions from bos_tag -> tag at time = 0 for all batches
+  var transition_scores = index_select(transitions[bos_tag, _], axis = 1,
+                                      indices = tags[0, _].squeeze()).squeeze()
+
+  when compileOption("boundChecks"):
+    doAssert result.shape == [batch_size], "Result should be of shape" &
+      fmt" {batch_size} but got {result.shape}"
+    doAssert transition_scores.shape == [batch_size], "Transition scores" &
+      fmt" should be of shape {batch_size} but got {transition_scores.shape}"
+
+  # Emission scores for tag at t = 0 for all in batch
+  # FIXME: This is giving a value of wrong shape below
+  var emission_scores = input[0, _, _]
+                          .squeeze()
+                          .index_select(axis = 1, indices = tags[0, _].squeeze())
+                          .squeeze(axis=1)
+
+  when compileOption("boundChecks"): 
+    doAssert emission_scores.shape == [batch_size], "Emission scores should" &
+      fmt" be of shape {batch_size} but got {emission_scores.shape}"
+
+  emission_scores .*= mask[0, _].squeeze()
+
+  result += transition_scores + emission_scores
+
+  # TODO: Optimize?
+  for i in 1 ..< timesteps:
+    let 
+      old_tags = tags[i - 1, _].squeeze(1)
+      new_tags = tags[i, _].squeeze(1)
+
+      old_mask = mask[i, _].squeeze()
+      new_mask = mask[i + 1, _].squeeze()
+
+    # New emission scores
+    input[i, _, _].squeeze().index_select(axis=1, tags[i, _].squeeze(),
+                                          result = emission_scores)
+
+    # New transition scores
+    # This is applying transtion from old -> new tag across batch
+    # Unoptimized version:
+    # for j in 0 .. batch_size:
+    #   transition_scores[j] = transitions[old_tags[j], new_tags[j]]
+    transition_scores.apply3_inline(old_tags, new_tags):
+      transitions[y, z]
+
+    result += transition_scores * new_mask + emission_scores * old_mask
+  
+  # Assume that masked when == 0
+  let last_time_inds = (mask.sum(axis=0).squeeze() .- 1).astype(int)
+  let last_tags = tags.index_select(axis=0, indices=last_time_inds).squeeze()
+  # let last_transitions = transitions[_, eos_tag].squeeze()
+
+  # Set transition scores to last_real_tag -> EOS_TAG across batch
+  transitions[_, eos_tag].squeeze().index_select(axis=0, indices=last_tags, result=transition_scores)
+  
+  result += transition_scores
 
 proc compute_log_partition_function[T](): Tensor[T] =
   ## Compute the partition function by using the forward algorithm to avoid
@@ -42,15 +106,17 @@ proc compute_log_partition_function[T](): Tensor[T] =
   ## configurations.
   discard
 
-
 proc crf_forward*[T: SomeFloat](
   result: var Tensor[T],
   input: Tensor[T],
   mask: Tensor[T],
   transitions: Tensor[T],
   tags: Tensor[Idx],
-  reduce: bool
+  timesteps, batch_size, hidden_dim: int,
+  bos_tag, eos_tag: Idx
 ) =
   ## Computes the log likelihood of input given transitions (emissions) matrix.
   ## Loss should be *negative* log likelihood.
-  discard
+  result = zeros[T](batch_size)
+  result.compute_scores(input, mask, transitions, tags, timesteps, batch_size,
+                        hidden_dim, bos_tag, eos_tag)

From e6180619cd0e450fcae1fb24fb802524a25adaca Mon Sep 17 00:00:00 2001
From: Sloane Simmons <sloanes.k@gmail.com>
Date: Tue, 29 Oct 2019 01:11:19 -0500
Subject: [PATCH 8/8] Update CRF

Fix some bugs in the CRF non-normalized score calculation (mostly making
sure a matrix is not returned where a vector is expected when using
index_select).

Also fix an out-of-bounds access caused by the loop over time steps.
---
 src/nn_primitives/nnp_crf.nim | 29 +++++++++++++++++------------
 1 file changed, 17 insertions(+), 12 deletions(-)

diff --git a/src/nn_primitives/nnp_crf.nim b/src/nn_primitives/nnp_crf.nim
index 58ee34259..8b4166bff 100644
--- a/src/nn_primitives/nnp_crf.nim
+++ b/src/nn_primitives/nnp_crf.nim
@@ -39,6 +39,7 @@ proc compute_scores[T](
   ## Returns:
   ##  - A Tensor[T] of non-normalized emission scores of shape [batch_size, ]
 
+  # DEBUG
   echo (timesteps, batch_size, hidden_dim)
   echo input.shape
 
@@ -53,12 +54,12 @@ proc compute_scores[T](
       fmt" should be of shape {batch_size} but got {transition_scores.shape}"
 
   # Emission scores for tag at t = 0 for all in batch
-  # FIXME: This is giving a value of wrong shape below
-  var emission_scores = input[0, _, _]
-                          .squeeze()
-                          .index_select(axis = 1, indices = tags[0, _].squeeze())
-                          .squeeze(axis=1)
+  # Unoptimized - simple loop
+  var emission_scores = newTensorUninit[input.T](batch_size)
 
+  for i in 0 ..< batch_size:
+    emission_scores[i] = input[0, i, tags[0, i]]
+  
   when compileOption("boundChecks"): 
     doAssert emission_scores.shape == [batch_size], "Emission scores should" &
       fmt" be of shape {batch_size} but got {emission_scores.shape}"
@@ -68,7 +69,7 @@ proc compute_scores[T](
   result += transition_scores + emission_scores
 
   # TODO: Optimize?
-  for i in 1 ..< timesteps:
+  for i in 1 ..< timesteps - 1:
     let 
       old_tags = tags[i - 1, _].squeeze(1)
       new_tags = tags[i, _].squeeze(1)
@@ -76,9 +77,9 @@ proc compute_scores[T](
       old_mask = mask[i, _].squeeze()
       new_mask = mask[i + 1, _].squeeze()
 
-    # New emission scores
-    input[i, _, _].squeeze().index_select(axis=1, tags[i, _].squeeze(),
-                                          result = emission_scores)
+    # New emission scores: emission at time i for batch element j to tag [i, j]
+    for j in 0 ..< batch_size:
+      emission_scores[j] = input[i, j, tags[i, j]]  # index by batch, not time
 
     # New transition scores
     # This is applying transtion from old -> new tag across batch
@@ -88,12 +89,16 @@ proc compute_scores[T](
     transition_scores.apply3_inline(old_tags, new_tags):
       transitions[y, z]
 
-    result += transition_scores * new_mask + emission_scores * old_mask
+    result += (transition_scores .* new_mask) + (emission_scores .* old_mask)
+  
+  # TODO: Make sure that last transition handled correctly 
   
   # Assume that masked when == 0
   let last_time_inds = (mask.sum(axis=0).squeeze() .- 1).astype(int)
-  let last_tags = tags.index_select(axis=0, indices=last_time_inds).squeeze()
-  # let last_transitions = transitions[_, eos_tag].squeeze()
+  var last_tags = newTensorUninit[tags.T](batch_size)
+
+  for i in 0 ..< batch_size:
+    last_tags[i] = tags[last_time_inds[i], i]
 
   # Set transition scores to last_real_tag -> EOS_TAG across batch
   transitions[_, eos_tag].squeeze().index_select(axis=0, indices=last_tags, result=transition_scores)