From b4f682582c00be65a444b17f4710f3f4cf124e83 Mon Sep 17 00:00:00 2001 From: Sloane Simmons Date: Tue, 22 Oct 2019 01:49:27 -0500 Subject: [PATCH 1/8] Start CRF layer work Want to try and add a new CRF layer for sequence tagging / prediction, and will implement Viterbi decoding and NLL as the loss value. --- src/nn/layers/crf.nim | 141 ++++++++++++++++++++++++++++ src/nn/nn.nim | 4 +- src/nn_primitives/nn_primitives.nim | 6 +- src/nn_primitives/nnp_crf.nim | 28 ++++++ 4 files changed, 175 insertions(+), 4 deletions(-) create mode 100644 src/nn/layers/crf.nim create mode 100644 src/nn_primitives/nnp_crf.nim diff --git a/src/nn/layers/crf.nim b/src/nn/layers/crf.nim new file mode 100644 index 000000000..8b68f794e --- /dev/null +++ b/src/nn/layers/crf.nim @@ -0,0 +1,141 @@ +# Copyright 2017 the Arraymancer contributors +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +import strformat +import options + +import ../../tensor/tensor, + ../../nn_primitives/nn_primitives, + ../../autograd/autograd + + +type Idx* = SomeInteger or SomeOrdinal + +type CRFGate* [TT; Idx] {.final.} = ref object of Gate[TT] + ## CRF (Linear) Gate for sequence prediction. 
+ transitions: Variable[TT] + num_tags: Idx + + # Special values for + bos_tag: Idx + eos_tag: Idx + +proc crf_forward[TT, Idx]( + result: var Variable[TT], + input: Variable[TT], + mask: Variable[TT], + transitions: Variable[TT], + tags: Tensor[Idx], + num_tags: int, + reduce = false +) = + ## Compute the negative log likelihood for each input sequence. + ## If `reduce` is true, return + var gate: CRFGate[TT, Idx] + new gate + + gate.transitions = transitions + gate.num_tags = num_tags + + gate.bos_tag = Idx(num_tags) + gate.eos_tag = Idx(num_tags + 1) + + let + timesteps = input.value.shape[0] + batch_size = input.value.shape[1] + hidden_dim = input.value.shape[2] + + #[crf_forward( + input.value, + + )]# + +proc crf_viterbi*[TT]() = discard + +proc crf*[TT]( + input: Variable[TT], + mask: Variable[TT], + transitions: Variable[TT], + tags: Option[Tensor[Idx]], + num_tags: int, + reduce: bool = false +): Variable[TT] = + ## Input: + ## - An `x` Variable of shape [timesteps, batch_size, hidden_size] + ## - A `mask` Variable of shape [timesteps, batch_size] with is_grad_needed + ## set to 0. + ## - A `transitions` matrix of size (num_tags + 2, num_tags + 2) + ## The extra tags are for BOS / EOS tags. + ## - A `tags` tensor of shape [timesteps, batch_size, num_tags + 2] - only needed if + ## doing training. If not training, then this can be nil. 
+ ## + ## Return: + ## - Negative log likelihood Tensor [batch_size, ] + ## - Logits for tag prediction of shape [batch_size, sequence_length, num_tags] + when compileOption("boundChecks"): + doAssert input.value.shape.len == 3, fmt"Expected input variable of rank 3, got shape of {input.value.shape}" + doAssert mask.value.shape[0..1] == input.value.shape[0..1], fmt"Mask and input shapes do not match:" & + fmt"got {mask.value.shape[0..2]} and {input.value.shape[0..2]}" + doAssert transitions.value.shape == [num_tags + 2, num_tags + 2], "Expected transitions matrix shape to " & + fmt"match ({num_tags+2}, {num_tags+2}), got {transitions.value.shape}" + + assert mask.requires_grad == false, "Mask should not need a gradient" + + new result + result.context = input.context + + let doing_training = input.is_grad_needed() or transitions.is_grad_needed() + + if doing_training: + if tags.isNone: + raise newException(ValueError, "Tags must be non-nil when training") + else: + let tags_tensor = tags.get() + result.crf_forward(input, mask, transitions, tags_tensor, num_tags) + + +when isMainModule: + import unittest + + let ctx = newContext Tensor[float32] + + let (timesteps, batch_size, hidden_dim) = (8, 30, 10) + + let + input = ctx.variable( + randomTensor[float32](timesteps, batch_size, hidden_dim, max=1.1), + requires_grad = true + ) + + mask = ctx.variable(ones[float32](timesteps, batch_size)) + + num_tags: int = 5 + + transitions = ctx.variable( + (randomTensor(num_tags + 2, num_tags + 2, max=2.0'f32) .- 1.0'f32), + requires_grad = false + ) + + suite "Basic CRF tests": + + test "When pass in some(Tensor[int]) can call CRF": + var tags = option(randomTensor(timesteps, batch_size, max=num_tags)) + let output = crf(input, mask, transitions, tags, num_tags) + assert output.value.shape == [batch_size, ], + fmt"Got output shape {output.value.shape}" + + test "When pass in none(Tensor[int]) get ValueError": + expect ValueError: + let output2 = crf(input, mask, transitions, 
none(Tensor[int]), num_tags) diff --git a/src/nn/nn.nim b/src/nn/nn.nim index 60c66949c..2a3ff875c 100644 --- a/src/nn/nn.nim +++ b/src/nn/nn.nim @@ -13,14 +13,14 @@ # limitations under the License. import ./activation/[sigmoid, relu, tanh], - ./layers/[linear, conv2D, maxpool2D, gru, embedding], + ./layers/[linear, conv2D, maxpool2D, gru, embedding, crf], ./loss/cross_entropy_losses, ./loss/mean_square_error_loss, ./optimizers/optimizers, ./init export sigmoid, relu, tanh, - linear, conv2D, maxpool2d, gru, embedding, + linear, conv2D, maxpool2d, gru, embedding, crf, cross_entropy_losses, mean_square_error_loss, optimizers, init diff --git a/src/nn_primitives/nn_primitives.nim b/src/nn_primitives/nn_primitives.nim index 0fc357390..19ea17b6c 100644 --- a/src/nn_primitives/nn_primitives.nim +++ b/src/nn_primitives/nn_primitives.nim @@ -21,7 +21,8 @@ import ./nnp_activation, ./nnp_softmax, ./nnp_numerical_gradient, ./nnp_gru, - ./nnp_embedding.nim + ./nnp_embedding, + ./nnp_crf export nnp_activation, nnp_convolution, @@ -32,7 +33,8 @@ export nnp_activation, nnp_softmax, nnp_numerical_gradient, nnp_gru, - nnp_embedding + nnp_embedding, + nnp_crf import private/p_nnp_types export Size2D diff --git a/src/nn_primitives/nnp_crf.nim b/src/nn_primitives/nnp_crf.nim new file mode 100644 index 000000000..422f39583 --- /dev/null +++ b/src/nn_primitives/nnp_crf.nim @@ -0,0 +1,28 @@ +# Copyright 2017 the Arraymancer contributors +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import ../tensor/tensor, + math + +type Idx = SomeInteger or SomeOrdinal + +proc crf_forward*[T: SomeFloat]( + result: var Tensor[T], + input: Tensor[T], + mask: Tensor[T], + transitions: Tensor[T], + tags: Tensor[Idx] +) = + ## + discard From 177846d35c8a1f3d8ba714b46a26fe1bf6df24c2 Mon Sep 17 00:00:00 2001 From: Sloane Simmons Date: Tue, 22 Oct 2019 03:02:06 -0500 Subject: [PATCH 2/8] Add initializer function for transitions matrix There is some logic needed to properly create the transitions matrix, so add an initializer function using range + xavier uniform, and disallow Any -> BOS or EOS -> Any transitions. --- src/nn/layers/crf.nim | 41 ++++++++++++++++++++++++++++++----- src/nn_primitives/nnp_crf.nim | 2 +- 2 files changed, 36 insertions(+), 7 deletions(-) diff --git a/src/nn/layers/crf.nim b/src/nn/layers/crf.nim index 8b68f794e..ccabb43f5 100644 --- a/src/nn/layers/crf.nim +++ b/src/nn/layers/crf.nim @@ -18,10 +18,11 @@ import options import ../../tensor/tensor, ../../nn_primitives/nn_primitives, + ../../nn/init, ../../autograd/autograd -type Idx* = SomeInteger or SomeOrdinal +type Idx* = SomeInteger type CRFGate* [TT; Idx] {.final.} = ref object of Gate[TT] ## CRF (Linear) Gate for sequence prediction. @@ -32,6 +33,30 @@ type CRFGate* [TT; Idx] {.final.} = ref object of Gate[TT] bos_tag: Idx eos_tag: Idx + +proc init_transitions_matrix*[T: SomeFloat](num_tags: Idx, range_val: T = T(0.1)): Tensor[T] = + ## Create emissions matrix within bounds [-range, range], uniformly + ## distributed. The special transitions from [any, start] and [end, any] are + ## set to be an arbitrarily low value to prevent prohibited transitions. + ## + ## Input: + ## The `num_tags` indicating how many real (non-special) tag values there are. + ## The `range_val` giving the scale to initialize transition values. 
+ ## + ## Returns + ## The initialized transitions matrix of shape [num_tags + 2, num_tags + 2] + + # TODO: In future, allow for rules prohibiting / mandating certain transitions. + let bos_tag, eos_tag = (num_tags, num_tags + 1) + result = xavier_uniform(num_tags + 2, num_tags + 2, T) * range_val + + # Scale for a disallowed transition relative to the range value + const disallowed_transition_scale = 100_000 + + result[_, bos_tag] = disallowed_transition_scale * -1.0 * abs(range_val) + result[eos_tag, _] = disallowed_transition_scale * -1.0 * abs(range_val) + + proc crf_forward[TT, Idx]( result: var Variable[TT], input: Variable[TT], @@ -56,11 +81,15 @@ proc crf_forward[TT, Idx]( timesteps = input.value.shape[0] batch_size = input.value.shape[1] hidden_dim = input.value.shape[2] - - #[crf_forward( + + crf_forward( + result.value, input.value, - - )]# + mask.value, + transitions.value, + tags, + reduce = reduce + ) proc crf_viterbi*[TT]() = discard @@ -81,7 +110,7 @@ proc crf*[TT]( ## - A `tags` tensor of shape [timesteps, batch_size, num_tags + 2] - only needed if ## doing training. If not training, then this can be nil. ## - ## Return: + ## Returns: ## - Negative log likelihood Tensor [batch_size, ] ## - Logits for tag prediction of shape [batch_size, sequence_length, num_tags] when compileOption("boundChecks"): diff --git a/src/nn_primitives/nnp_crf.nim b/src/nn_primitives/nnp_crf.nim index 422f39583..ea1a0dfaf 100644 --- a/src/nn_primitives/nnp_crf.nim +++ b/src/nn_primitives/nnp_crf.nim @@ -15,7 +15,7 @@ import ../tensor/tensor, math -type Idx = SomeInteger or SomeOrdinal +type Idx = SomeInteger proc crf_forward*[T: SomeFloat]( result: var Tensor[T], From c0c8ccd354956633fda7264a2f67421b37679d4a Mon Sep 17 00:00:00 2001 From: Sloane Simmons Date: Wed, 23 Oct 2019 10:09:23 -0500 Subject: [PATCH 3/8] Start CRF internal functions Following other implementations, will do scores + log partition function for the forward pass (getting NLL). 
--- src/nn_primitives/nnp_crf.nim | 32 ++++++++++++++++++++++++++++++-- 1 file changed, 30 insertions(+), 2 deletions(-) diff --git a/src/nn_primitives/nnp_crf.nim b/src/nn_primitives/nnp_crf.nim index ea1a0dfaf..6bbfac47c 100644 --- a/src/nn_primitives/nnp_crf.nim +++ b/src/nn_primitives/nnp_crf.nim @@ -15,14 +15,42 @@ import ../tensor/tensor, math +# Needed for the partition function +from private/p_logsumexp import logsumexp + + type Idx = SomeInteger + +proc compute_scores[T]( + input: Tensor[T], + mask: Tensor[T], + transitions: Tensor[T], + tags: Tensor[Idx] +): Tensor[T] = + ## Computes the un-normalized log probabilities (combination of emissions and + ## transition scores at each timestep). + ## + ## Returns: + ## - A Tensor[T] of non-normalized emission scores of shape [batch_size, ] + discard + + +proc compute_log_partition_function[T](): Tensor[T] = + ## Compute the partition function by using the forward algorithm to avoid + ## explicitly calculating probabilities for all possible sequence + ## configurations. + discard + + proc crf_forward*[T: SomeFloat]( result: var Tensor[T], input: Tensor[T], mask: Tensor[T], transitions: Tensor[T], - tags: Tensor[Idx] + tags: Tensor[Idx], + reduce: bool ) = - ## + ## Computes the log likelihood of input given transitions (emissions) matrix. + ## Loss should be *negative* log likelihood. discard From 7f8a272b08914d94891e6d9a317dcdfbb0920fc6 Mon Sep 17 00:00:00 2001 From: Sloane Simmons Date: Wed, 23 Oct 2019 23:52:18 -0500 Subject: [PATCH 4/8] Add index_select with passed-in storage Writes into the tensor passed in, reshaping or reallocating it only when its shape does not match the selection. This should help when index_select is called repeatedly with selections of the same size. An example is selecting batch_size values for each time step of the CRF emissions.
--- src/tensor/shapeshifting.nim | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) diff --git a/src/tensor/shapeshifting.nim b/src/tensor/shapeshifting.nim index 5b826b25a..36caa68ba 100644 --- a/src/tensor/shapeshifting.nim +++ b/src/tensor/shapeshifting.nim @@ -318,3 +318,23 @@ func index_select*[T; Idx: byte or char or SomeNumber](t: Tensor[T], axis: int, var r_slice = result.atAxisIndex(axis, i) var t_slice = t.atAxisIndex(axis, int(index)) r_slice.copyFrom(t_slice) + +proc index_select*[T; Idx: byte or char or SomeNumber](t: Tensor[T], axis: int, indices: Tensor[Idx], result: var Tensor[T]) = + ## Same as the `index_select` function, but use a preallocated tensor for + ## output. + doAssert indices.shape.len == 1 + + var select_shape = t.shape + select_shape[axis] = indices.shape[0] + + if (select_shape != result.shape): + ## FIXME: Better way of resizing the result when necessary + if (select_shape.size == result.shape): + result.reshape(select_shape) + else: + result = newTensorUninit(select_shape) + + for i, index in enumerate(indices): + var r_slice = result.atAxisIndex(axis, i) + var t_slice = t.atAxisIndex(axis, int(index)) + r_slice.copyFrom(t_slice) From 692c7d35eca63b5ca87d8eb52f8de7a0024779fd Mon Sep 17 00:00:00 2001 From: Sloane Simmons Date: Thu, 24 Oct 2019 00:06:04 -0500 Subject: [PATCH 5/8] Formatting and some API changes Ran 'nimpretty' to clean up formatting / long lines, and passed more information to the nnp_crf functions. 
--- src/nn/layers/crf.nim | 82 +++++++++++++++++++++++++------------------ 1 file changed, 47 insertions(+), 35 deletions(-) diff --git a/src/nn/layers/crf.nim b/src/nn/layers/crf.nim index ccabb43f5..851e5b1e4 100644 --- a/src/nn/layers/crf.nim +++ b/src/nn/layers/crf.nim @@ -16,7 +16,7 @@ import strformat import options -import ../../tensor/tensor, +import ../../tensor/tensor, ../../nn_primitives/nn_primitives, ../../nn/init, ../../autograd/autograd @@ -24,17 +24,20 @@ import ../../tensor/tensor, type Idx* = SomeInteger -type CRFGate* [TT; Idx] {.final.} = ref object of Gate[TT] +type CRFGate*[TT; Idx] {.final.} = ref object of Gate[TT] ## CRF (Linear) Gate for sequence prediction. transitions: Variable[TT] num_tags: Idx - # Special values for + # Special values for state transitions bos_tag: Idx eos_tag: Idx + dims: tuple[timesteps, batch_size, hidden_dim: Idx] -proc init_transitions_matrix*[T: SomeFloat](num_tags: Idx, range_val: T = T(0.1)): Tensor[T] = + +proc init_transitions_matrix*[T: SomeFloat](num_tags: Idx; range_val: T = T( + 0.1)): Tensor[T] = ## Create emissions matrix within bounds [-range, range], uniformly ## distributed. The special transitions from [any, start] and [end, any] are ## set to be an arbitrarily low value to prevent prohibited transitions. @@ -43,7 +46,7 @@ proc init_transitions_matrix*[T: SomeFloat](num_tags: Idx, range_val: T = T(0.1) ## The `num_tags` indicating how many real (non-special) tag values there are. ## The `range_val` giving the scale to initialize transition values. ## - ## Returns + ## Returns ## The initialized transitions matrix of shape [num_tags + 2, num_tags + 2] # TODO: In future, allow for rules prohibiting / mandating certain transitions. 
@@ -58,16 +61,16 @@ proc init_transitions_matrix*[T: SomeFloat](num_tags: Idx, range_val: T = T(0.1) proc crf_forward[TT, Idx]( - result: var Variable[TT], - input: Variable[TT], - mask: Variable[TT], - transitions: Variable[TT], - tags: Tensor[Idx], - num_tags: int, + result: var Variable[TT]; + input: Variable[TT]; + mask: Variable[TT]; + transitions: Variable[TT]; + tags: Tensor[Idx]; + num_tags: int; reduce = false ) = - ## Compute the negative log likelihood for each input sequence. - ## If `reduce` is true, return + ## Compute the negative log likelihood for each input sequence. + ## If `reduce` is true, return var gate: CRFGate[TT, Idx] new gate @@ -77,49 +80,58 @@ proc crf_forward[TT, Idx]( gate.bos_tag = Idx(num_tags) gate.eos_tag = Idx(num_tags + 1) - let + let timesteps = input.value.shape[0] batch_size = input.value.shape[1] hidden_dim = input.value.shape[2] - + + gate.dims = (timesteps: timesteps, batch_size: batch_size, + hidden_dim: hidden_dim) + crf_forward( result.value, input.value, mask.value, transitions.value, tags, - reduce = reduce + gate.dims.timesteps, gate.dims.batch_size, gate.dims.hidden_dim, + gate.bos_tag, gate.eos_tag ) proc crf_viterbi*[TT]() = discard proc crf*[TT]( - input: Variable[TT], - mask: Variable[TT], - transitions: Variable[TT], - tags: Option[Tensor[Idx]], - num_tags: int, + input: Variable[TT]; + mask: Variable[TT]; + transitions: Variable[TT]; + tags: Option[Tensor[Idx]]; + num_tags: int; reduce: bool = false ): Variable[TT] = ## Input: - ## - An `x` Variable of shape [timesteps, batch_size, hidden_size] + ## - An `x` Variable of shape [timesteps, batch_size, num_tags] ## - A `mask` Variable of shape [timesteps, batch_size] with is_grad_needed ## set to 0. ## - A `transitions` matrix of size (num_tags + 2, num_tags + 2) ## The extra tags are for BOS / EOS tags. 
- ## - A `tags` tensor of shape [timesteps, batch_size, num_tags + 2] - only needed if + ## - A `tags` tensor of shape [timesteps, batch_size] - only needed if ## doing training. If not training, then this can be nil. - ## + ## ## Returns: ## - Negative log likelihood Tensor [batch_size, ] ## - Logits for tag prediction of shape [batch_size, sequence_length, num_tags] when compileOption("boundChecks"): - doAssert input.value.shape.len == 3, fmt"Expected input variable of rank 3, got shape of {input.value.shape}" - doAssert mask.value.shape[0..1] == input.value.shape[0..1], fmt"Mask and input shapes do not match:" & - fmt"got {mask.value.shape[0..2]} and {input.value.shape[0..2]}" - doAssert transitions.value.shape == [num_tags + 2, num_tags + 2], "Expected transitions matrix shape to " & - fmt"match ({num_tags+2}, {num_tags+2}), got {transitions.value.shape}" - + doAssert input.value.shape.len == 3, fmt"Expected input variable of rank 3" & + ", got shape of {input.value.shape}" + doAssert input.value.shape[2] == num_tags, fmt"Expected input variable to" & + " emit {num_tags}, emitted {input.value.shape[2]}" + doAssert mask.value.shape[0..1] == input.value.shape[0..1], + fmt"Mask and input shapes do not match:" & + fmt"got {mask.value.shape[0..2]} and {input.value.shape[0..2]}" + doAssert transitions.value.shape == [num_tags + 2, num_tags + 2], + "Expected transitions matrix shape to " & + fmt"match ({num_tags+2}, {num_tags+2}), got {transitions.value.shape}" + assert mask.requires_grad == false, "Mask should not need a gradient" new result @@ -144,7 +156,7 @@ when isMainModule: let input = ctx.variable( - randomTensor[float32](timesteps, batch_size, hidden_dim, max=1.1), + randomTensor[float32](timesteps, batch_size, hidden_dim, max = 1.1), requires_grad = true ) @@ -153,18 +165,18 @@ when isMainModule: num_tags: int = 5 transitions = ctx.variable( - (randomTensor(num_tags + 2, num_tags + 2, max=2.0'f32) .- 1.0'f32), + (randomTensor(num_tags + 2, num_tags + 2, max 
= 2.0'f32) .- 1.0'f32), requires_grad = false ) - + suite "Basic CRF tests": test "When pass in some(Tensor[int]) can call CRF": - var tags = option(randomTensor(timesteps, batch_size, max=num_tags)) + var tags = option(randomTensor(timesteps, batch_size, max = num_tags)) let output = crf(input, mask, transitions, tags, num_tags) assert output.value.shape == [batch_size, ], fmt"Got output shape {output.value.shape}" - + test "When pass in none(Tensor[int]) get ValueError": expect ValueError: let output2 = crf(input, mask, transitions, none(Tensor[int]), num_tags) From 1c597a49f6d45f9d1c9e3489ad12bc32d005e210 Mon Sep 17 00:00:00 2001 From: Sloane Simmons Date: Thu, 24 Oct 2019 01:57:17 -0500 Subject: [PATCH 6/8] Bug fixes --- src/nn/layers/crf.nim | 23 ++++++++++++----------- src/tensor/shapeshifting.nim | 6 +++--- 2 files changed, 15 insertions(+), 14 deletions(-) diff --git a/src/nn/layers/crf.nim b/src/nn/layers/crf.nim index 851e5b1e4..ad9ee8ed1 100644 --- a/src/nn/layers/crf.nim +++ b/src/nn/layers/crf.nim @@ -87,7 +87,7 @@ proc crf_forward[TT, Idx]( gate.dims = (timesteps: timesteps, batch_size: batch_size, hidden_dim: hidden_dim) - + crf_forward( result.value, input.value, @@ -122,9 +122,9 @@ proc crf*[TT]( ## - Logits for tag prediction of shape [batch_size, sequence_length, num_tags] when compileOption("boundChecks"): doAssert input.value.shape.len == 3, fmt"Expected input variable of rank 3" & - ", got shape of {input.value.shape}" + fmt", got shape of {input.value.shape}" doAssert input.value.shape[2] == num_tags, fmt"Expected input variable to" & - " emit {num_tags}, emitted {input.value.shape[2]}" + fmt" emit {num_tags}, emitted {input.value.shape[2]}" doAssert mask.value.shape[0..1] == input.value.shape[0..1], fmt"Mask and input shapes do not match:" & fmt"got {mask.value.shape[0..2]} and {input.value.shape[0..2]}" @@ -142,9 +142,12 @@ proc crf*[TT]( if doing_training: if tags.isNone: raise newException(ValueError, "Tags must be non-nil when training") 
+ else: + let tags_tensor = tags.get() + result.crf_forward(input, mask, transitions, tags_tensor, num_tags) else: - let tags_tensor = tags.get() - result.crf_forward(input, mask, transitions, tags_tensor, num_tags) + # TODO: Inference time + discard when isMainModule: @@ -152,27 +155,25 @@ when isMainModule: let ctx = newContext Tensor[float32] - let (timesteps, batch_size, hidden_dim) = (8, 30, 10) + let (timesteps, batch_size, num_tags) = (8, 30, 10) let input = ctx.variable( - randomTensor[float32](timesteps, batch_size, hidden_dim, max = 1.1), + randomTensor[float32](timesteps, batch_size, num_tags, max = 1.1), requires_grad = true ) mask = ctx.variable(ones[float32](timesteps, batch_size)) - num_tags: int = 5 - transitions = ctx.variable( (randomTensor(num_tags + 2, num_tags + 2, max = 2.0'f32) .- 1.0'f32), requires_grad = false ) - + suite "Basic CRF tests": test "When pass in some(Tensor[int]) can call CRF": - var tags = option(randomTensor(timesteps, batch_size, max = num_tags)) + var tags = option(randomTensor(timesteps, batch_size, max = num_tags - 1)) let output = crf(input, mask, transitions, tags, num_tags) assert output.value.shape == [batch_size, ], fmt"Got output shape {output.value.shape}" diff --git a/src/tensor/shapeshifting.nim b/src/tensor/shapeshifting.nim index 36caa68ba..5b0ebf2dc 100644 --- a/src/tensor/shapeshifting.nim +++ b/src/tensor/shapeshifting.nim @@ -329,10 +329,10 @@ proc index_select*[T; Idx: byte or char or SomeNumber](t: Tensor[T], axis: int, if (select_shape != result.shape): ## FIXME: Better way of resizing the result when necessary - if (select_shape.size == result.shape): - result.reshape(select_shape) + if (select_shape.product() == result.size()): + result = result.reshape(select_shape) else: - result = newTensorUninit(select_shape) + result = newTensorUninit[T](select_shape) for i, index in enumerate(indices): var r_slice = result.atAxisIndex(axis, i) From 1db105e2c0124d72c02c2c075c5b9a5a2503ebb8 Mon Sep 17 00:00:00 
2001 From: Sloane Simmons Date: Thu, 24 Oct 2019 02:01:44 -0500 Subject: [PATCH 7/8] Start on crf_forward Implementation of forward pass underway, starting with scores (non normalized log prob with emission + transition components). --- src/nn_primitives/nnp_crf.nim | 84 +++++++++++++++++++++++++++++++---- 1 file changed, 75 insertions(+), 9 deletions(-) diff --git a/src/nn_primitives/nnp_crf.nim b/src/nn_primitives/nnp_crf.nim index 6bbfac47c..58ee34259 100644 --- a/src/nn_primitives/nnp_crf.nim +++ b/src/nn_primitives/nnp_crf.nim @@ -12,6 +12,8 @@ # See the License for the specific language governing permissions and # limitations under the License. +import strformat + import ../tensor/tensor, math @@ -23,18 +25,80 @@ type Idx = SomeInteger proc compute_scores[T]( - input: Tensor[T], - mask: Tensor[T], - transitions: Tensor[T], - tags: Tensor[Idx] -): Tensor[T] = + result: var Tensor[T], # (B, ) - not nil + input: Tensor[T], # (T, B, num_tags) + mask: Tensor[T], # (T, B) + transitions: Tensor[T], # (num_tags + 2, num_tags + 2) + tags: Tensor[Idx], # (T, B) + timesteps, batch_size, hidden_dim: int, + bos_tag, eos_tag: Idx +) = ## Computes the un-normalized log probabilities (combination of emissions and ## transition scores at each timestep). 
## ## Returns: ## - A Tensor[T] of non-normalized emission scores of shape [batch_size, ] - discard + echo (timesteps, batch_size, hidden_dim) + echo input.shape + + # Transitions from bos_tag -> tag at time = 0 for all batches + var transition_scores = index_select(transitions[bos_tag, _], axis = 1, + indices = tags[0, _].squeeze()).squeeze() + + when compileOption("boundChecks"): + doAssert result.shape == [batch_size], "Result should be of shape" & + fmt" {batch_size} but got {result.shape}" + doAssert transition_scores.shape == [batch_size], "Transition scores" & + fmt" should be of shape {batch_size} but got {transition_scores.shape}" + + # Emission scores for tag at t = 0 for all in batch + # FIXME: This is giving a value of wrong shape below + var emission_scores = input[0, _, _] + .squeeze() + .index_select(axis = 1, indices = tags[0, _].squeeze()) + .squeeze(axis=1) + + when compileOption("boundChecks"): + doAssert emission_scores.shape == [batch_size], "Emission scores should" & + fmt" be of shape {batch_size} but got {emission_scores.shape}" + + emission_scores .*= mask[0, _].squeeze() + + result += transition_scores + emission_scores + + # TODO: Optimize? + for i in 1 ..< timesteps: + let + old_tags = tags[i - 1, _].squeeze(1) + new_tags = tags[i, _].squeeze(1) + + old_mask = mask[i, _].squeeze() + new_mask = mask[i + 1, _].squeeze() + + # New emission scores + input[i, _, _].squeeze().index_select(axis=1, tags[i, _].squeeze(), + result = emission_scores) + + # New transition scores + # This is applying transtion from old -> new tag across batch + # Unoptimized version: + # for j in 0 .. 
batch_size: + # transition_scores[j] = transitions[old_tags[j], new_tags[j]] + transition_scores.apply3_inline(old_tags, new_tags): + transitions[y, z] + + result += transition_scores * new_mask + emission_scores * old_mask + + # Assume that masked when == 0 + let last_time_inds = (mask.sum(axis=0).squeeze() .- 1).astype(int) + let last_tags = tags.index_select(axis=0, indices=last_time_inds).squeeze() + # let last_transitions = transitions[_, eos_tag].squeeze() + + # Set transition scores to last_real_tag -> EOS_TAG across batch + transitions[_, eos_tag].squeeze().index_select(axis=0, indices=last_tags, result=transition_scores) + + result += transition_scores proc compute_log_partition_function[T](): Tensor[T] = ## Compute the partition function by using the forward algorithm to avoid @@ -42,15 +106,17 @@ proc compute_log_partition_function[T](): Tensor[T] = ## configurations. discard - proc crf_forward*[T: SomeFloat]( result: var Tensor[T], input: Tensor[T], mask: Tensor[T], transitions: Tensor[T], tags: Tensor[Idx], - reduce: bool + timesteps, batch_size, hidden_dim: int, + bos_tag, eos_tag: Idx ) = ## Computes the log likelihood of input given transitions (emissions) matrix. ## Loss should be *negative* log likelihood. - discard + result = zeros[T](batch_size) + result.compute_scores(input, mask, transitions, tags, timesteps, batch_size, + hidden_dim, bos_tag, eos_tag) From e6180619cd0e450fcae1fb24fb802524a25adaca Mon Sep 17 00:00:00 2001 From: Sloane Simmons Date: Tue, 29 Oct 2019 01:11:19 -0500 Subject: [PATCH 8/8] Update CRF Fix some bugs with CRF non-normalized score calculation (mostly making sure that not returning matrix when shouldn't when using index_select). Also fix some out-of-bounds bug due to loop over time steps. 
--- src/nn_primitives/nnp_crf.nim | 29 +++++++++++++++++------------ 1 file changed, 17 insertions(+), 12 deletions(-) diff --git a/src/nn_primitives/nnp_crf.nim b/src/nn_primitives/nnp_crf.nim index 58ee34259..8b4166bff 100644 --- a/src/nn_primitives/nnp_crf.nim +++ b/src/nn_primitives/nnp_crf.nim @@ -39,6 +39,7 @@ proc compute_scores[T]( ## Returns: ## - A Tensor[T] of non-normalized emission scores of shape [batch_size, ] + # DEBUG echo (timesteps, batch_size, hidden_dim) echo input.shape @@ -53,12 +54,12 @@ proc compute_scores[T]( fmt" should be of shape {batch_size} but got {transition_scores.shape}" # Emission scores for tag at t = 0 for all in batch - # FIXME: This is giving a value of wrong shape below - var emission_scores = input[0, _, _] - .squeeze() - .index_select(axis = 1, indices = tags[0, _].squeeze()) - .squeeze(axis=1) + # Unoptimized - simple loop + var emission_scores = newTensorUninit[input.T](batch_size) + for i in 0 ..< batch_size: + emission_scores[i] = input[0, i, tags[0, i]] + when compileOption("boundChecks"): doAssert emission_scores.shape == [batch_size], "Emission scores should" & fmt" be of shape {batch_size} but got {emission_scores.shape}" @@ -68,7 +69,7 @@ proc compute_scores[T]( result += transition_scores + emission_scores # TODO: Optimize? 
- for i in 1 ..< timesteps: + for i in 1 ..< timesteps - 1: let old_tags = tags[i - 1, _].squeeze(1) new_tags = tags[i, _].squeeze(1) @@ -76,9 +77,9 @@ proc compute_scores[T]( old_mask = mask[i, _].squeeze() new_mask = mask[i + 1, _].squeeze() - # New emission scores - input[i, _, _].squeeze().index_select(axis=1, tags[i, _].squeeze(), - result = emission_scores) + # New emission scores are the emission at time i for batch j to tag [i, j] + for j in 0 ..< batch_size: + emission_scores[i] = input[i, j, tags[i, j]] # New transition scores # This is applying transtion from old -> new tag across batch @@ -88,12 +89,16 @@ proc compute_scores[T]( transition_scores.apply3_inline(old_tags, new_tags): transitions[y, z] - result += transition_scores * new_mask + emission_scores * old_mask + result += (transition_scores .* new_mask) + (emission_scores .* old_mask) + + # TODO: Make sure that last transition handled correctly # Assume that masked when == 0 let last_time_inds = (mask.sum(axis=0).squeeze() .- 1).astype(int) - let last_tags = tags.index_select(axis=0, indices=last_time_inds).squeeze() - # let last_transitions = transitions[_, eos_tag].squeeze() + var last_tags = newTensorUninit[tags.T](batch_size) + + for i in 0 ..< batch_size: + last_tags[i] = tags[last_time_inds[i], i] # Set transition scores to last_real_tag -> EOS_TAG across batch transitions[_, eos_tag].squeeze().index_select(axis=0, indices=last_tags, result=transition_scores)