From f961e75cd755d3c6536359976386f0a950148654 Mon Sep 17 00:00:00 2001 From: innat Date: Fri, 1 Mar 2024 16:02:39 +0600 Subject: [PATCH 01/94] init video swin --- keras_cv/layers/video_swin_transformer_layers.py | 13 +++++++++++++ 1 file changed, 13 insertions(+) create mode 100644 keras_cv/layers/video_swin_transformer_layers.py diff --git a/keras_cv/layers/video_swin_transformer_layers.py b/keras_cv/layers/video_swin_transformer_layers.py new file mode 100644 index 0000000000..1756010b15 --- /dev/null +++ b/keras_cv/layers/video_swin_transformer_layers.py @@ -0,0 +1,13 @@ +# Copyright 2023 The KerasCV Authors +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. \ No newline at end of file From 578205ad780a89f06890485e10bec4e7706a7f91 Mon Sep 17 00:00:00 2001 From: innat Date: Fri, 1 Mar 2024 16:08:46 +0600 Subject: [PATCH 02/94] add: 3d window size computation --- .../layers/video_swin_transformer_layers.py | 144 +++++++++++++++++- 1 file changed, 143 insertions(+), 1 deletion(-) diff --git a/keras_cv/layers/video_swin_transformer_layers.py b/keras_cv/layers/video_swin_transformer_layers.py index 1756010b15..0b81362451 100644 --- a/keras_cv/layers/video_swin_transformer_layers.py +++ b/keras_cv/layers/video_swin_transformer_layers.py @@ -10,4 +10,146 @@ # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and -# limitations under the License. \ No newline at end of file +# limitations under the License. 
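+
+# The layers in this module implement the building blocks of the Video Swin
+# Transformer: a video feature map is split into non-overlapping 3D
+# (depth, height, width) windows, self-attention is computed within each
+# window, and the windows are then merged back into the original layout.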
+ +from functools import partial + +import numpy as np +from keras import layers + +from keras_cv.api_export import keras_cv_export +from keras_cv.backend import keras +from keras_cv.backend import ops +from keras_cv.layers import DropPath + + +def window_partition(x, window_size): + """ + Args: + x: (batch_size, depth, height, width, channel) + window_size (tuple[int]): window size + + Returns: + windows: (batch_size*num_windows, window_size*window_size, channel) + """ # noqa: E501 + + input_shape = ops.shape(x) + batch_size, depth, height, width, channel = ( + input_shape[0], + input_shape[1], + input_shape[2], + input_shape[3], + input_shape[4], + ) + + x = ops.reshape( + x, + [ + batch_size, + depth // window_size[0], + window_size[0], + height // window_size[1], + window_size[1], + width // window_size[2], + window_size[2], + channel, + ], + ) + + x = ops.transpose(x, [0, 1, 3, 5, 2, 4, 6, 7]) + windows = ops.reshape( + x, [-1, window_size[0] * window_size[1] * window_size[2], channel] + ) + + return windows + + +def window_reverse(windows, window_size, batch_size, depth, height, width): + """ + Args: + windows: (batch_size*num_windows, window_size, window_size, channel) + window_size (tuple[int]): Window size + height (int): Height of image + width (int): Width of image + + Returns: + x: (batch_size, depth, height, width, channel) + """ # noqa: E501 + x = ops.reshape( + windows, + [ + batch_size, + depth // window_size[0], + height // window_size[1], + width // window_size[2], + window_size[0], + window_size[1], + window_size[2], + -1, + ], + ) + x = ops.transpose(x, [0, 1, 4, 2, 5, 3, 6, 7]) + x = ops.reshape(x, [batch_size, depth, height, width, -1]) + return x + + +def get_window_size(x_size, window_size, shift_size=None): + """Computing window size based on: "Liu et al., + Swin Transformer: Hierarchical Vision Transformer using Shifted Windows + " + https://github.com/microsoft/Swin-Transformer + + Args: + x_size: input size. + window_size: local window size. + shift_size: window shifting size. 
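+            If an input dimension is smaller than or equal to its window
+            dimension, the window is clamped to the input size and the
+            corresponding shift size is set to 0.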
+ + Returns: + x: window_size, shift_size + """ # noqa: E501 + + use_window_size = list(window_size) + + if shift_size is not None: + use_shift_size = list(shift_size) + + for i in range(len(x_size)): + if x_size[i] <= window_size[i]: + use_window_size[i] = x_size[i] + if shift_size is not None: + use_shift_size[i] = 0 + + if shift_size is None: + return tuple(use_window_size) + else: + return tuple(use_window_size), tuple(use_shift_size) + + +def compute_mask(depth, height, width, window_size, shift_size): + img_mask = np.zeros((1, depth, height, width, 1)) + cnt = 0 + for d in ( + slice(-window_size[0]), + slice(-window_size[0], -shift_size[0]), + slice(-shift_size[0], None), + ): + for h in ( + slice(-window_size[1]), + slice(-window_size[1], -shift_size[1]), + slice(-shift_size[1], None), + ): + for w in ( + slice(-window_size[2]), + slice(-window_size[2], -shift_size[2]), + slice(-shift_size[2], None), + ): + img_mask[:, d, h, w, :] = cnt + cnt = cnt + 1 + mask_windows = window_partition(img_mask, window_size) + mask_windows = ops.squeeze(mask_windows, axis=-1) + attn_mask = ops.expand_dims(mask_windows, axis=1) - ops.expand_dims( + mask_windows, axis=2 + ) + attn_mask = ops.where(attn_mask != 0, -100.0, attn_mask) + attn_mask = ops.where(attn_mask == 0, 0.0, attn_mask) + return attn_mask \ No newline at end of file From 9817025257dc2b30ea1e7b90c3ed2d791bc6bd8f Mon Sep 17 00:00:00 2001 From: innat Date: Fri, 1 Mar 2024 16:10:26 +0600 Subject: [PATCH 03/94] add: mlp layer --- .../layers/video_swin_transformer_layers.py | 62 ++++++++++++++++++- 1 file changed, 61 insertions(+), 1 deletion(-) diff --git a/keras_cv/layers/video_swin_transformer_layers.py b/keras_cv/layers/video_swin_transformer_layers.py index 0b81362451..dfaac439ce 100644 --- a/keras_cv/layers/video_swin_transformer_layers.py +++ b/keras_cv/layers/video_swin_transformer_layers.py @@ -152,4 +152,64 @@ def compute_mask(depth, height, width, window_size, shift_size): ) attn_mask = ops.where(attn_mask != 0, -100.0, attn_mask) attn_mask = ops.where(attn_mask == 0, 0.0, attn_mask) - return attn_mask \ No newline at end of file + return attn_mask + + +class MLP(layers.Layer): + """A Multilayer perceptron(MLP) layer. + + Args: + hidden_dim (int): The number of units in the hidden layer. + output_dim (int): The number of units in the output layer. + drop_rate (float): Float between 0 and 1. Fraction of the + input units to drop. + activation (str): Activation to use in the hidden layers. + Default is `"gelu"`. 
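+
+    Example (a usage sketch; shapes are illustrative):
+
+    ```python
+    mlp = MLP(hidden_dim=384, output_dim=96, drop_rate=0.1)
+    y = mlp(np.ones((2, 8, 96)))  # (batch, tokens, channels) -> (2, 8, 96)
+    ```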
+ + References: + - [Video Swin Transformer](https://arxiv.org/abs/2106.13230) + - [Video Swin Transformer GitHub](https://github.com/SwinTransformer/Video-Swin-Transformer) + """ # noqa: E501 + + def __init__( + self, + hidden_dim, + output_dim, + drop_rate=0.0, + activation='gelu', + **kwargs + ): + super().__init__(**kwargs) + self.output_dim = output_dim + self.hidden_dim = hidden_dim + self._activation_identifier = activation + self.drop_rate = drop_rate + self.activation = layers.Activation(self._activation_identifier) + self.fc1 = layers.Dense(self.hidden_dim) + self.fc2 = layers.Dense(self.output_dim) + self.dropout = layers.Dropout(self.drop_rate) + + def build(self, input_shape): + self.fc1.build(input_shape) + self.fc2.build((*input_shape[1:-1], self.hidden_dim)) + self.built = True + + def call(self, x, training=None): + x = self.fc1(x) + x = self.activation(x) + x = self.dropout(x, training=training) + x = self.fc2(x) + x = self.dropout(x, training=training) + return x + + def get_config(self): + config = super().get_config() + config.update( + { + "output_dim": self.output_dim, + "hidden_dim": self.hidden_dim, + "drop_rate": self.drop_rate, + 'activation': self._activation_identifier + } + ) + return config \ No newline at end of file From 3343db1a8f17a9206d715c685c615a0b788dbbed Mon Sep 17 00:00:00 2001 From: innat Date: Fri, 1 Mar 2024 16:18:17 +0600 Subject: [PATCH 04/94] add: patch embedding layer --- .../layers/video_swin_transformer_layers.py | 88 ++++++++++++++++++- 1 file changed, 87 insertions(+), 1 deletion(-) diff --git a/keras_cv/layers/video_swin_transformer_layers.py b/keras_cv/layers/video_swin_transformer_layers.py index dfaac439ce..08a3bef735 100644 --- a/keras_cv/layers/video_swin_transformer_layers.py +++ b/keras_cv/layers/video_swin_transformer_layers.py @@ -170,7 +170,7 @@ class MLP(layers.Layer): - [Video Swin Transformer](https://arxiv.org/abs/2106.13230) - [Video Swin Transformer GitHub](https://github.com/SwinTransformer/Video-Swin-Transformer) """ # noqa: E501 - + def __init__( self, hidden_dim, @@ -212,4 +212,90 @@ def get_config(self): 'activation': self._activation_identifier } ) + return config + + +class PatchEmbed3D(keras.Model): + """Video to Patch Embedding layer. + + Args: + patch_size (int): Patch token size. Default: (2,4,4). + embed_dim (int): Number of linear projection output channels. Default: 96. + norm_layer (keras.layers, optional): Normalization layer. 
Default: None + + References: + - [Video Swin Transformer](https://arxiv.org/abs/2106.13230) + - [Video Swin Transformer GitHub](https://github.com/SwinTransformer/Video-Swin-Transformer) + """ # noqa: E501 + + def __init__( + self, + patch_size=(2, 4, 4), + embed_dim=96, + norm_layer=None, + **kwargs + ): + super().__init__(**kwargs) + self.patch_size = patch_size + self.embed_dim = embed_dim + self.norm_layer = norm_layer + + def _compute_padding(self, dim, patch_size): + pad_amount = patch_size - (dim % patch_size) + return [ + 0, pad_amount if pad_amount != patch_size else 0 + ] + + def build(self, input_shape): + self.pads = [ + [0, 0], + self._compute_padding(input_shape[1], self.patch_size[0]), + self._compute_padding(input_shape[2], self.patch_size[1]), + self._compute_padding(input_shape[3], self.patch_size[2]), + [0, 0] + ] + + self.proj = layers.Conv3D( + self.embed_dim, + kernel_size=self.patch_size, + strides=self.patch_size, + name='embed_proj' + ) + self.proj.build((None, None, None, None, input_shape[-1])) + + self.norm = None + if self.norm_layer is not None: + self.norm = self.norm_layer( + axis=-1, epsilon=1e-5, name='embed_norm' + ) + self.norm.build( + (None, None, None, None, self.embed_dim) + ) + self.built = True + + def call(self, x): + x = ops.pad(x, self.pads) + x = self.proj(x) + + if self.norm is not None: + x = self.norm(x) + + return x + + def compute_output_shape(self, input_shape): + spatial_dims = [ + (dim - self.patch_size[i]) // self.patch_size[i] + 1 + for i, dim in enumerate(input_shape[1:-1]) + ] + output_shape = (input_shape[0],) + tuple(spatial_dims) + (self.embed_dim,) + return output_shape + + def get_config(self): + config = super().get_config() + config.update( + { + "patch_size": self.patch_size, + "embed_dim": self.embed_dim, + } + ) return config \ No newline at end of file From 7ab5cab4a78c3e3046246bff2db09b77c963bf75 Mon Sep 17 00:00:00 2001 From: innat Date: Fri, 1 Mar 2024 16:22:42 +0600 Subject: [PATCH 05/94] add: patch merging layer --- .../layers/video_swin_transformer_layers.py | 72 +++++++++++++++++++ 1 file changed, 72 insertions(+) diff --git a/keras_cv/layers/video_swin_transformer_layers.py b/keras_cv/layers/video_swin_transformer_layers.py index 08a3bef735..66656a7f06 100644 --- a/keras_cv/layers/video_swin_transformer_layers.py +++ b/keras_cv/layers/video_swin_transformer_layers.py @@ -298,4 +298,76 @@ def get_config(self): "embed_dim": self.embed_dim, } ) + return config + + +class PatchMerging(layers.Layer): + """Patch Merging Layer. + + Args: + input_dim (int): Number of input channels. + norm_layer (keras.layers, optional): Normalization layer. + Default: LayerNormalization + + References: + - [Video Swin Transformer](https://arxiv.org/abs/2106.13230) + - [Video Swin Transformer GitHub](https://github.com/SwinTransformer/Video-Swin-Transformer) + """ # noqa: E501 + + def __init__( + self, + input_dim, + norm_layer=layers.LayerNormalization, + **kwargs + ): + super().__init__(**kwargs) + self.input_dim = input_dim + self.norm_layer = norm_layer + + def build(self, input_shape): + batch_size, depth, height, width, channel = input_shape + self.reduction = layers.Dense(2 * self.input_dim, use_bias=False) + self.reduction.build((batch_size, depth, height // 2, width // 2, 4 * channel)) + self.norm = self.norm_layer(axis=-1, epsilon=1e-5) + self.norm.build((batch_size, depth, height // 2, width // 2, 4 * channel)) + self.built=True + + def call(self, x): + """ The call function. 
+
+        Args:
+            x: Input feature, shape: (batch_size, depth, height, width, channel).
+        """
+        input_shape = ops.shape(x)
+        height, width = (
+            input_shape[2],
+            input_shape[3],
+        )
+
+        # pad height and width to even numbers if needed
+        paddings = [
+            [0, 0],
+            [0, 0],
+            [0, ops.mod(height, 2)],
+            [0, ops.mod(width, 2)],
+            [0, 0]
+        ]
+        x = ops.pad(x, paddings)
+
+        x0 = x[:, :, 0::2, 0::2, :]  # B D H/2 W/2 C
+        x1 = x[:, :, 1::2, 0::2, :]  # B D H/2 W/2 C
+        x2 = x[:, :, 0::2, 1::2, :]  # B D H/2 W/2 C
+        x3 = x[:, :, 1::2, 1::2, :]  # B D H/2 W/2 C
+        x = ops.concatenate([x0, x1, x2, x3], axis=-1)  # B D H/2 W/2 4*C
+        x = self.norm(x)
+        x = self.reduction(x)
+        return x
+
+    def get_config(self):
+        config = super().get_config()
+        config.update(
+            {
+                "input_dim": self.input_dim,
+            }
+        )
+        return config
\ No newline at end of file

From f70a61bb3546b77d0056273faf076c2133b3ca69 Mon Sep 17 00:00:00 2001
From: innat
Date: Fri, 1 Mar 2024 16:25:34 +0600
Subject: [PATCH 06/94] add: window attention layer

---
 .../layers/video_swin_transformer_layers.py  | 138 ++++++++++++++++++
 1 file changed, 138 insertions(+)

diff --git a/keras_cv/layers/video_swin_transformer_layers.py b/keras_cv/layers/video_swin_transformer_layers.py
index 66656a7f06..9ea37900a6 100644
--- a/keras_cv/layers/video_swin_transformer_layers.py
+++ b/keras_cv/layers/video_swin_transformer_layers.py
@@ -370,4 +370,142 @@ def get_config(self):
                 "input_dim": self.input_dim,
             }
         )
+        return config
+
+
+class WindowAttention3D(keras.Model):
+    """Window based multi-head self attention (W-MSA) module with relative position bias.
+    It supports both shifted and non-shifted windows.
+
+    Args:
+        input_dim (int): Number of input feature channels.
+        window_size (tuple[int]): The temporal length, height and width of the window.
+        num_heads (int): Number of attention heads.
+        qkv_bias (bool, optional): If True, add a learnable bias to query, key, value. Default: True
+        qk_scale (float | None, optional): Override default qk scale of head_dim ** -0.5 if set.
+        attn_drop_rate (float, optional): Dropout ratio of attention weight. Default: 0.0
+        proj_drop_rate (float, optional): Dropout ratio of output.
Default: 0.0 + + References: + - [Video Swin Transformer](https://arxiv.org/abs/2106.13230) + - [Video Swin Transformer GitHub](https://github.com/SwinTransformer/Video-Swin-Transformer) + """ # noqa: E501 + + def __init__( + self, + input_dim, + window_size, + num_heads, + qkv_bias=True, + qk_scale=None, + attn_drop_rate=0., + proj_drop_rate=0., + **kwargs + ): + super().__init__(**kwargs) + # variables + self.input_dim = input_dim + self.window_size = window_size + self.num_heads = num_heads + head_dim = input_dim // num_heads + self.qk_scale = qk_scale + self.scale = qk_scale or head_dim ** -0.5 + self.qkv_bias = qkv_bias + self.attn_drop_rate = attn_drop_rate + self.proj_drop_rate = proj_drop_rate + + def get_relative_position_index(self, window_depth, window_height, window_width): + y_y, z_z, x_x = ops.meshgrid( + ops.arange(window_width), ops.arange(window_depth), ops.arange(window_height) + ) + coords = ops.stack([z_z, y_y, x_x], axis=0) + coords_flatten = ops.reshape(coords, [3, -1]) + relative_coords = coords_flatten[:, :, None] - coords_flatten[:, None, :] + relative_coords = ops.transpose(relative_coords, axes=[1, 2, 0]) + z_z = (relative_coords[:, :, 0] + window_depth - 1) * (2 * window_height - 1) * (2 * window_width - 1) + x_x = (relative_coords[:, :, 1] + window_height - 1) * (2 * window_width - 1) + y_y = (relative_coords[:, :, 2] + window_width - 1) + relative_coords = ops.stack([z_z, x_x, y_y], axis=-1) + return ops.sum(relative_coords, axis=-1) + + def build(self, input_shape): + self.relative_position_bias_table = self.add_weight( + shape=( + (2 * self.window_size[0] - 1) * + (2 * self.window_size[1] - 1) * + (2 * self.window_size[2] - 1), + self.num_heads, + ), + initializer="zeros", + trainable=True, + name="relative_position_bias_table", + ) + self.relative_position_index = self.get_relative_position_index( + self.window_size[0], self.window_size[1], self.window_size[2] + ) + + # layers + self.qkv = layers.Dense(self.input_dim * 3, use_bias=self.qkv_bias) + self.attn_drop = layers.Dropout(self.attn_drop_rate) + self.proj = layers.Dense(self.input_dim) + self.proj_drop = layers.Dropout(self.proj_drop_rate) + self.qkv.build(input_shape) + self.proj.build(input_shape) + self.built = True + + def call(self, x, mask=None, training=None): + input_shape = ops.shape(x) + batch_size, depth, channel = ( + input_shape[0], + input_shape[1], + input_shape[2], + ) + + qkv = self.qkv(x) + qkv = ops.reshape(qkv, [batch_size, depth, 3, self.num_heads, channel // self.num_heads]) + qkv = ops.transpose(qkv, [2, 0, 3, 1, 4]) + q, k, v = ops.split(qkv, 3, axis=0) + + q = ops.squeeze(q, axis=0) * self.scale + k = ops.squeeze(k, axis=0) + v = ops.squeeze(v, axis=0) + attn = ops.matmul(q, ops.transpose(k, [0, 1, 3, 2])) + + rel_pos_bias = ops.take( + self.relative_position_bias_table, self.relative_position_index[:depth, :depth] + ) + rel_pos_bias = ops.reshape(rel_pos_bias, [depth, depth, -1]) + rel_pos_bias = ops.transpose(rel_pos_bias, [2, 0, 1]) + attn = attn + rel_pos_bias[None, ...] 
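+        # relative_position_bias_table holds one learned scalar per head for
+        # each possible (depth, height, width) offset between two tokens in a
+        # window; relative_position_index picks the right entry for every
+        # token pair, so the bias is shared across windows and batch elements.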
+ + if mask is not None: + mask_size = ops.shape(mask)[0] + mask = ops.cast(mask, dtype=attn.dtype) + attn = ops.reshape( + attn, + [batch_size // mask_size, mask_size, self.num_heads, depth, depth] + ) + mask[:, None, :, :] + attn = ops.reshape(attn, [-1, self.num_heads, depth, depth]) + + attn = keras.activations.softmax(attn, axis=-1) + attn = self.attn_drop(attn, training=training) + x = ops.matmul(attn, v) + x = ops.transpose(x, [0, 2, 1, 3]) + x = ops.reshape(x, [batch_size, depth, channel]) + x = self.proj(x) + x = self.proj_drop(x, training=training) + return x + + def get_config(self): + config = super().get_config() + config.update( + { + "input_dim": self.input_dim, + "window_size": self.window_size, + "num_heads": self.num_heads, + "qk_scale": self.qk_scale, + "qkv_bias": self.qkv_bias, + "attn_drop_rate": self.attn_drop_rate, + "proj_drop_rate": self.proj_drop_rate, + } + ) return config \ No newline at end of file From 5472fc655c18a681573aaeb23a65e357a97b561f Mon Sep 17 00:00:00 2001 From: innat Date: Fri, 1 Mar 2024 16:28:56 +0600 Subject: [PATCH 07/94] add: basic layer for video swin --- .../layers/video_swin_transformer_layers.py | 163 ++++++++++++++++++ 1 file changed, 163 insertions(+) diff --git a/keras_cv/layers/video_swin_transformer_layers.py b/keras_cv/layers/video_swin_transformer_layers.py index 9ea37900a6..f5c93f18a6 100644 --- a/keras_cv/layers/video_swin_transformer_layers.py +++ b/keras_cv/layers/video_swin_transformer_layers.py @@ -508,4 +508,167 @@ def get_config(self): "proj_drop_rate": self.proj_drop_rate, } ) + return config + + +class BasicLayer(keras.Model): + """A basic Swin Transformer layer for one stage. + + Args: + input_dim (int): Number of feature channels + depth (int): Depths of this stage. + num_heads (int): Number of attention head. + window_size (tuple[int]): Local window size. Default: (1,7,7). + mlp_ratio (float): Ratio of mlp hidden dim to embedding dim. Default: 4. + qkv_bias (bool, optional): If True, add a learnable bias to query, key, value. Default: True + qk_scale (float | None, optional): Override default qk scale of head_dim ** -0.5 if set. + drop (float, optional): Dropout rate. Default: 0.0 + attn_drop (float, optional): Attention dropout rate. Default: 0.0 + drop_path (float | tuple[float], optional): Stochastic depth rate. Default: 0.0 + norm_layer (keras.layers, optional): Normalization layer. Default: LayerNormalization + downsample (keras.layers | None, optional): Downsample layer at the end of the layer. 
Default: None + + References: + - [Video Swin Transformer](https://arxiv.org/abs/2106.13230) + - [Video Swin Transformer GitHub](https://github.com/SwinTransformer/Video-Swin-Transformer) + """ # noqa: E501 + + def __init__( + self, + input_dim, + depth, + num_heads, + window_size=(1,7,7), + mlp_ratio=4., + qkv_bias=False, + qk_scale=None, + drop_rate=0., + attn_drop_rate=0., + drop_path_rate=0., + norm_layer=partial(layers.LayerNormalization, epsilon=1e-05), + downsample=None, + **kwargs + ): + super().__init__(**kwargs) + self.input_dim = input_dim + self.num_heads = num_heads + self.window_size = window_size + self.mlp_ratio = mlp_ratio + self.shift_size = tuple([i // 2 for i in window_size]) + self.depth = depth + self.qkv_bias = qkv_bias + self.qk_scale = qk_scale + self.drop_rate = drop_rate + self.attn_drop_rate = attn_drop_rate + self.drop_path_rate = drop_path_rate + self.norm_layer = norm_layer + self.downsample = downsample + + def _compute_dim_padded(self, input_dim, window_dim_size): + input_dim = ops.cast(input_dim, dtype="float32") + window_dim_size = ops.cast(window_dim_size, dtype="float32") + return ops.cast( + ops.ceil(input_dim / window_dim_size) * window_dim_size, + "int32" + ) + + def build(self, input_shape): + window_size, shift_size = get_window_size( + input_shape[1:-1], self.window_size, self.shift_size + ) + Dp = self._compute_dim_padded(input_shape[1], window_size[0]) + Hp = self._compute_dim_padded(input_shape[2], window_size[1]) + Wp = self._compute_dim_padded(input_shape[3], window_size[2]) + self.attn_mask = compute_mask( + Dp, Hp, Wp, window_size, shift_size + ) + + # build blocks + self.blocks = [ + SwinTransformerBlock3D( + self.input_dim, + num_heads=self.num_heads, + window_size=self.window_size, + shift_size=(0,0,0) if (i % 2 == 0) else self.shift_size, + mlp_ratio=self.mlp_ratio, + qkv_bias=self.qkv_bias, + qk_scale=self.qk_scale, + drop_rate=self.drop_rate, + attn_drop_rate=self.attn_drop_rate, + drop_path_rate=self.drop_path_rate[i] if isinstance(self.drop_path_rate, list) else self.drop_path_rate, + norm_layer=self.norm_layer, + ) + for i in range(self.depth) + ] + + if self.downsample is not None: + self.downsample = self.downsample(input_dim=self.input_dim, norm_layer=self.norm_layer) + self.downsample.build(input_shape) + + for i in range(self.depth): + self.blocks[i].build(input_shape) + + self.built = True + + + def compute_output_shape(self, input_shape): + window_size, _ = get_window_size( + input_shape[1:-1], self.window_size, self.shift_size + ) + depth_p = self.compute_dim_padded(input_shape[1], window_size[0]) + height_p = self.compute_dim_padded(input_shape[2], window_size[1]) + width_p = self.compute_dim_padded(input_shape[3], window_size[2]) + + if self.downsample is not None: + output_shape = ( + input_shape[0], depth_p, height_p // 2, width_p // 2, 2*self.input_dim + ) + return output_shape + + return input_shape + + def call(self, x, training=None): + input_shape = ops.shape(x) + B,D,H,W,C = ( + input_shape[0], + input_shape[1], + input_shape[2], + input_shape[3], + input_shape[4], + ) + + for blk in self.blocks: + x = blk( + x, + self.attn_mask, + training=training + ) + + x = ops.reshape( + x, [B, D, H, W, -1] + ) + + if self.downsample is not None: + x = self.downsample(x) + + return x + + + def get_config(self): + config = super().get_config() + config.update( + { + "input_dim": self.input_dim, + "window_size": self.window_size, + "num_heads": self.num_heads, + "mlp_ratio": self.mlp_ratio, + "shift_size": self.shift_size, + 
"depth": self.depth, + "qkv_bias": self.qkv_bias, + "qk_scale": self.qk_scale, + "drop": self.drop, + "attn_drop": self.attn_drop, + "drop_path": self.drop_path + } + ) return config \ No newline at end of file From 76d444bbbd8bca3c52967b8096b557ee1c2bcd26 Mon Sep 17 00:00:00 2001 From: innat Date: Fri, 1 Mar 2024 16:32:12 +0600 Subject: [PATCH 08/94] update: basic layer for video swin --- .../layers/video_swin_transformer_layers.py | 20 ++++++++++--------- 1 file changed, 11 insertions(+), 9 deletions(-) diff --git a/keras_cv/layers/video_swin_transformer_layers.py b/keras_cv/layers/video_swin_transformer_layers.py index f5c93f18a6..34315f2fd4 100644 --- a/keras_cv/layers/video_swin_transformer_layers.py +++ b/keras_cv/layers/video_swin_transformer_layers.py @@ -576,11 +576,11 @@ def build(self, input_shape): window_size, shift_size = get_window_size( input_shape[1:-1], self.window_size, self.shift_size ) - Dp = self._compute_dim_padded(input_shape[1], window_size[0]) - Hp = self._compute_dim_padded(input_shape[2], window_size[1]) - Wp = self._compute_dim_padded(input_shape[3], window_size[2]) + depth_pad = self._compute_dim_padded(input_shape[1], window_size[0]) + height_pad = self._compute_dim_padded(input_shape[2], window_size[1]) + width_pad = self._compute_dim_padded(input_shape[3], window_size[2]) self.attn_mask = compute_mask( - Dp, Hp, Wp, window_size, shift_size + depth_pad, height_pad, width_pad, window_size, shift_size ) # build blocks @@ -602,7 +602,9 @@ def build(self, input_shape): ] if self.downsample is not None: - self.downsample = self.downsample(input_dim=self.input_dim, norm_layer=self.norm_layer) + self.downsample = self.downsample( + input_dim=self.input_dim, norm_layer=self.norm_layer + ) self.downsample.build(input_shape) for i in range(self.depth): @@ -629,7 +631,7 @@ def compute_output_shape(self, input_shape): def call(self, x, training=None): input_shape = ops.shape(x) - B,D,H,W,C = ( + batch_size, depth, height, width, channel = ( input_shape[0], input_shape[1], input_shape[2], @@ -637,15 +639,15 @@ def call(self, x, training=None): input_shape[4], ) - for blk in self.blocks: - x = blk( + for block in self.blocks: + x = block( x, self.attn_mask, training=training ) x = ops.reshape( - x, [B, D, H, W, -1] + x, [batch_size, depth, height, width, -1] ) if self.downsample is not None: From 715b8a39fdc0cef8f0beb3faf9d0ea1101601ff9 Mon Sep 17 00:00:00 2001 From: innat Date: Fri, 1 Mar 2024 16:37:10 +0600 Subject: [PATCH 09/94] add: swin blocks for video swin --- .../layers/video_swin_transformer_layers.py | 210 ++++++++++++++++++ 1 file changed, 210 insertions(+) diff --git a/keras_cv/layers/video_swin_transformer_layers.py b/keras_cv/layers/video_swin_transformer_layers.py index 34315f2fd4..9c78397736 100644 --- a/keras_cv/layers/video_swin_transformer_layers.py +++ b/keras_cv/layers/video_swin_transformer_layers.py @@ -673,4 +673,214 @@ def get_config(self): "drop_path": self.drop_path } ) + return config + + +class SwinTransformerBlock3D(keras.Model): + """Swin Transformer Block. + + Args: + input_dim (int): Number of feature channels. + num_heads (int): Number of attention heads. + window_size (tuple[int]): Window size. + shift_size (tuple[int]): Shift size for SW-MSA. + mlp_ratio (float): Ratio of mlp hidden dim to embedding dim. + qkv_bias (bool, optional): If True, add a learnable bias to query, key, value. Default: True + qk_scale (float | None, optional): Override default qk scale of head_dim ** -0.5 if set. + drop (float, optional): Dropout rate. 
Default: 0.0
+        attn_drop (float, optional): Attention dropout rate. Default: 0.0
+        drop_path (float, optional): Stochastic depth rate. Default: 0.0
+        activation (str, optional): Activation to use in the MLP. Default: "gelu"
+        norm_layer (keras.layers, optional): Normalization layer. Default: LayerNormalization
+
+    References:
+        - [Video Swin Transformer](https://arxiv.org/abs/2106.13230)
+        - [Video Swin Transformer GitHub](https://github.com/SwinTransformer/Video-Swin-Transformer)
+    """  # noqa: E501
+
+    def __init__(
+        self,
+        input_dim,
+        num_heads,
+        window_size=(2, 7, 7),
+        shift_size=(0, 0, 0),
+        mlp_ratio=4.,
+        qkv_bias=True,
+        qk_scale=None,
+        drop_rate=0.,
+        attn_drop_rate=0.,
+        drop_path_rate=0.,
+        activation='gelu',
+        norm_layer=layers.LayerNormalization,
+        **kwargs
+    ):
+        super().__init__(**kwargs)
+        # variables
+        self.input_dim = input_dim
+        self.num_heads = num_heads
+        self.window_size = window_size
+        self.shift_size = shift_size
+        self.mlp_ratio = mlp_ratio
+        self.qkv_bias = qkv_bias
+        self.qk_scale = qk_scale
+        self.drop_rate = drop_rate
+        self.attn_drop_rate = attn_drop_rate
+        self.drop_path_rate = drop_path_rate
+        self.mlp_hidden_dim = int(input_dim * mlp_ratio)
+        self.norm_layer = norm_layer
+        self._activation_identifier = activation
+
+        for i, (shift, window) in enumerate(zip(self.shift_size, self.window_size)):
+            if not (0 <= shift < window):
+                raise ValueError(
+                    f"shift_size[{i}] must be in the range 0 to window_size[{i}]"
+                )
+
+    def build(self, input_shape):
+        self.window_size, self.shift_size = get_window_size(
+            input_shape[1:-1], self.window_size, self.shift_size
+        )
+
+        self.apply_cyclic_shift = False
+        if any(i > 0 for i in self.shift_size):
+            self.apply_cyclic_shift = True
+
+        # layers
+        self.drop_path = (
+            DropPath(self.drop_path_rate)
+            if self.drop_path_rate > 0.
+            else layers.Identity()
+        )
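+        # Stochastic depth: DropPath randomly skips this block's residual
+        # branch for some samples during training; at rate 0.0 it reduces to
+        # an identity op.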
+
+        self.norm1 = self.norm_layer(axis=-1, epsilon=1e-05)
+        self.norm1.build(input_shape)
+
+        self.attn = WindowAttention3D(
+            self.input_dim,
+            window_size=self.window_size,
+            num_heads=self.num_heads,
+            qkv_bias=self.qkv_bias,
+            qk_scale=self.qk_scale,
+            attn_drop_rate=self.attn_drop_rate,
+            proj_drop_rate=self.drop_rate
+        )
+        self.attn.build((None, None, self.input_dim))
+
+        self.norm2 = self.norm_layer(axis=-1, epsilon=1e-05)
+        self.norm2.build((*input_shape[1:-1], self.input_dim))
+
+        self.mlp = MLP(
+            output_dim=self.input_dim,
+            hidden_dim=self.mlp_hidden_dim,
+            activation=self._activation_identifier,
+            drop_rate=self.drop_rate
+        )
+        self.mlp.build((*input_shape[1:-1], self.input_dim))
+        self.built = True
+
+    def first_forward(self, x, mask_matrix, training):
+        input_shape = ops.shape(x)
+        batch_size, depth, height, width, channel = (
+            input_shape[0],
+            input_shape[1],
+            input_shape[2],
+            input_shape[3],
+            input_shape[4],
+        )
+        window_size, shift_size = self.window_size, self.shift_size
+        x = self.norm1(x)
+
+        # pad feature maps to multiples of window size
+        pad_l = pad_t = pad_d0 = 0
+        pad_d1 = ops.mod(-depth + window_size[0], window_size[0])
+        pad_b = ops.mod(-height + window_size[1], window_size[1])
+        pad_r = ops.mod(-width + window_size[2], window_size[2])
+        paddings = [[0, 0], [pad_d0, pad_d1], [pad_t, pad_b], [pad_l, pad_r], [0, 0]]
+        x = ops.pad(x, paddings)
+
+        input_shape = ops.shape(x)
+        depth_p, height_p, width_p = (
+            input_shape[1],
+            input_shape[2],
+            input_shape[3],
+        )
+
+        # cyclic shift
+        if self.apply_cyclic_shift:
+            shifted_x = ops.roll(
+                x,
+                shift=(-shift_size[0], -shift_size[1], -shift_size[2]),
+                axis=(1, 2, 3)
+            )
+            attn_mask = mask_matrix
+        else:
+            shifted_x = x
+            attn_mask = None
+
+        # partition windows
+        x_windows = window_partition(shifted_x, window_size)
+
+        # run attention within each window
+        attn_windows = self.attn(
+            x_windows, mask=attn_mask, training=training
+        )
+
+        # reverse the swin windows
+        shifted_x = window_reverse(
+            attn_windows, window_size, batch_size, depth_p, height_p, width_p
+        )
+
+        # reverse cyclic shift
+        if self.apply_cyclic_shift:
+            x = ops.roll(
+                shifted_x,
+                shift=(shift_size[0], shift_size[1], shift_size[2]),
+                axis=(1, 2, 3)
+            )
+        else:
+            x = shifted_x
+
+        # crop back to the original size if padding was applied
+        do_pad = ops.logical_or(
+            ops.greater(pad_d1, 0),
+            ops.logical_or(ops.greater(pad_r, 0), ops.greater(pad_b, 0))
+        )
+        x = ops.cond(
+            do_pad,
+            lambda: x[:, :depth, :height, :width, :],
+            lambda: x
+        )
+
+        return x
+
+    def second_forward(self, x, training):
+        x = self.norm2(x)
+        x = self.mlp(x)
+        x = self.drop_path(x, training=training)
+        return x
+
+    def call(self, x, mask_matrix=None, training=None):
+        shortcut = x
+        x = self.first_forward(
+            x, mask_matrix, training
+        )
+        x = shortcut + self.drop_path(x)
+        x = x + self.second_forward(x, training)
+        return x
+
+    def get_config(self):
+        config = super().get_config()
+        config.update(
+            {
+                "input_dim": self.input_dim,
+                "window_size": self.window_size,
+                "num_heads": self.num_heads,
+                "shift_size": self.shift_size,
+                "mlp_ratio": self.mlp_ratio,
+                "qkv_bias": self.qkv_bias,
+                "qk_scale": self.qk_scale,
+                "drop_rate": self.drop_rate,
+                "attn_drop_rate": self.attn_drop_rate,
+                "drop_path_rate": self.drop_path_rate,
+                "mlp_hidden_dim": self.mlp_hidden_dim,
+                "activation": self._activation_identifier
+            }
+        )
+        return config
\ No newline at end of file

From 3ca00424aefc9a6dbf8a6b6dff76db84348666ee Mon Sep 17 00:00:00 2001
From: innat
Date: Fri, 1 Mar 2024 17:58:26 +0600
Subject: [PATCH 10/94]
create and add: video swin backbone --- .../video_swin/video_swin_backbone.py | 164 ++++++++++++++++++ 1 file changed, 164 insertions(+) create mode 100644 keras_cv/models/backbones/video_swin/video_swin_backbone.py diff --git a/keras_cv/models/backbones/video_swin/video_swin_backbone.py b/keras_cv/models/backbones/video_swin/video_swin_backbone.py new file mode 100644 index 0000000000..8d40787afc --- /dev/null +++ b/keras_cv/models/backbones/video_swin/video_swin_backbone.py @@ -0,0 +1,164 @@ +# Copyright 2023 The KerasCV Authors +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import copy +import numpy as np +from functools import partial + +from keras import layers +from keras_cv.api_export import keras_cv_export +from keras_cv.backend import keras +from keras_cv.backend import ops +from keras_cv.models import utils +from keras_cv.models.backbones.backbone import Backbone +from keras_cv.models.backbones.vit_det.vit_det_backbone_presets import ( + backbone_presets, +) +from keras_cv.models.backbones.vit_det.vit_det_backbone_presets import ( + backbone_presets_with_weights, +) +from keras_cv.utils.python_utils import classproperty + + +@keras_cv_export("keras_cv.models.VideoSwinBackbone", package="keras_cv.models") +class VideoSwinBackbone(Backbone): + def __init__( + self, + *, + include_rescaling, + input_shape, + input_tensor, + embed_dim, + patch_size, + window_size, + mlp_ratio, + patch_norm, + drop_rate, + attn_drop_rate, + drop_path_rate, + depths, + num_heads, + qkv_bias, + qk_scale, + num_classes, + **kwargs + ): + + input_spec = utils.parse_model_inputs( + input_shape, input_tensor, name="videos" + ) + + # Check that the input video is well specified. + if input_spec.shape[-3] is None or input_spec.shape[-2] is None: + raise ValueError( + "Height and width of the video must be specified" + " in `input_shape`." + ) + if input_spec.shape[-3] != input_spec.shape[-2]: + raise ValueError( + "Input video must be square i.e. the height must" + " be equal to the width in the `input_shape`" + " tuple/tensor." 
+ ) + + x = input_spec + + if include_rescaling: + # Use common rescaling strategy across keras_cv + x = keras.layers.Rescaling(1.0 / 255.0)(x) + + norm_layer = partial(layers.LayerNormalization, epsilon=1e-05) + + x = PatchEmbed3D( + patch_size=patch_size, + embed_dim=embed_dim, + norm_layer=norm_layer if patch_norm else None, + name='PatchEmbed3D' + )(x) + + x = layers.Dropout(drop_rate, name='pos_drop')(x) + dpr = np.linspace(0., drop_path_rate, sum(depths)).tolist() + + num_layers = len(depths) + + for i in range(num_layers): + layer = BasicLayer( + input_dim=int(embed_dim * 2 ** i), + depth=depths[i], + num_heads=num_heads[i], + window_size=window_size, + mlp_ratio=mlp_ratio, + qkv_bias=qkv_bias, + qk_scale=qk_scale, + drop_rate=drop_rate, + attn_drop_rate=attn_drop_rate, + drop_path_rate=dpr[sum(depths[:i]):sum(depths[:i + 1])], + norm_layer=norm_layer, + downsample=PatchMerging if (i < num_layers - 1) else None, + name=f'BasicLayer{i + 1}' + ) + x = layer(x) + + x = norm_layer(axis=-1, epsilon=1e-05, name='norm')(x) + x = layers.GlobalAveragePooling3D(name='gap3d')(x) + output = layers.Dense( + num_classes, use_bias=True, name='head', dtype='float32' + )(x) + super().__init__(inputs=input_spec, outputs=output, **kwargs) + + self.embed_dim = embed_dim + self.patch_size = patch_size + self.window_size = window_size + self.mlp_ratio = mlp_ratio + self.norm_layer = norm_layer + self.patch_norm = patch_norm + self.drop_rate = drop_rate + self.attn_drop_rate = attn_drop_rate + self.drop_path_rate = drop_path_rate + self.num_layers = len(depths) + self.num_heads = num_heads + self.qkv_bias = qkv_bias + self.qk_scale = qk_scale + self.num_classes = num_classes + self.depths = depths + + def get_config(self): + config = super().get_config() + config.update({ + "embed_dim": self.embed_dim, + "patch_norm": self.patch_norm, + "window_size": self.window_size, + "patch_size": self.patch_size, + "mlp_ratio": self.mlp_ratio, + "drop_rate": self.drop_rate, + "drop_path_rate": self.drop_path_rate, + "attn_drop_rate": self.attn_drop_rate, + "depths": self.depths, + "num_heads": self.num_heads, + "qkv_bias": self.qkv_bias, + "qk_scale": self.qk_scale, + "num_classes": self.num_classes, + }) + return config + + @classproperty + def presets(cls): + """Dictionary of preset names and configurations.""" + return copy.deepcopy(backbone_presets) + + @classproperty + def presets_with_weights(cls): + """Dictionary of preset names and configurations that include + weights.""" + return copy.deepcopy(backbone_presets_with_weights) \ No newline at end of file From 3d845c5f8560d3dddc2887439ec5c1cdcd92379c Mon Sep 17 00:00:00 2001 From: innat Date: Fri, 1 Mar 2024 18:02:50 +0600 Subject: [PATCH 11/94] rename: video swin layers to model specific --- keras_cv/layers/video_swin_transformer_layers.py | 15 +++++++++------ 1 file changed, 9 insertions(+), 6 deletions(-) diff --git a/keras_cv/layers/video_swin_transformer_layers.py b/keras_cv/layers/video_swin_transformer_layers.py index 9c78397736..7e87d27f1f 100644 --- a/keras_cv/layers/video_swin_transformer_layers.py +++ b/keras_cv/layers/video_swin_transformer_layers.py @@ -215,8 +215,11 @@ def get_config(self): return config -class PatchEmbed3D(keras.Model): - """Video to Patch Embedding layer. +@keras_cv_export( + "keras_cv.layers.VideoSwinPatchingAndEmbedding", package="keras_cv.layers" +) +class VideoSwinPatchingAndEmbedding(keras.Model): + """Video to Patch Embedding layer for Video Swin Model. Args: patch_size (int): Patch token size. Default: (2,4,4). 
@@ -301,7 +304,7 @@ def get_config(self): return config -class PatchMerging(layers.Layer): +class VideoSwinPatchMerging(layers.Layer): """Patch Merging Layer. Args: @@ -373,7 +376,7 @@ def get_config(self): return config -class WindowAttention3D(keras.Model): +class VideoSwinWindowAttention(keras.Model): """Window based multi-head self attention (W-MSA) module with relative position bias. It supports both of shifted and non-shifted window. @@ -511,7 +514,7 @@ def get_config(self): return config -class BasicLayer(keras.Model): +class VideoSwinBasicLayer(keras.Model): """A basic Swin Transformer layer for one stage. Args: @@ -676,7 +679,7 @@ def get_config(self): return config -class SwinTransformerBlock3D(keras.Model): +class VideoSwinTransformerBlock(keras.Model): """Swin Transformer Block. Args: From 1af8bd4fdccf5065dde1921f428c641ad3dd8a31 Mon Sep 17 00:00:00 2001 From: innat Date: Fri, 1 Mar 2024 18:08:44 +0600 Subject: [PATCH 12/94] update module import --- keras_cv/layers/__init__.py | 3 +++ ...swin_transformer_layers.py => video_swin_layers.py} | 4 ++-- .../models/backbones/video_swin/video_swin_backbone.py | 10 +++++++--- 3 files changed, 12 insertions(+), 5 deletions(-) rename keras_cv/layers/{video_swin_transformer_layers.py => video_swin_layers.py} (99%) diff --git a/keras_cv/layers/__init__.py b/keras_cv/layers/__init__.py index 0bfa2aa8ec..957f5eda3c 100644 --- a/keras_cv/layers/__init__.py +++ b/keras_cv/layers/__init__.py @@ -141,3 +141,6 @@ from keras_cv.layers.vit_det_layers import WindowedTransformerEncoder from keras_cv.layers.vit_det_layers import WindowPartitioning from keras_cv.layers.vit_layers import PatchingAndEmbedding +from keras_cv.layers.video_swin_layers import VideoSwinPatchingAndEmbedding +from keras_cv.layers.video_swin_layers import VideoSwinBasicLayer +from keras_cv.layers.video_swin_layers import VideoSwinPatchMerging \ No newline at end of file diff --git a/keras_cv/layers/video_swin_transformer_layers.py b/keras_cv/layers/video_swin_layers.py similarity index 99% rename from keras_cv/layers/video_swin_transformer_layers.py rename to keras_cv/layers/video_swin_layers.py index 7e87d27f1f..4f30cb5c32 100644 --- a/keras_cv/layers/video_swin_transformer_layers.py +++ b/keras_cv/layers/video_swin_layers.py @@ -320,7 +320,7 @@ class VideoSwinPatchMerging(layers.Layer): def __init__( self, input_dim, - norm_layer=layers.LayerNormalization, + norm_layer=None, **kwargs ): super().__init__(**kwargs) @@ -548,7 +548,7 @@ def __init__( drop_rate=0., attn_drop_rate=0., drop_path_rate=0., - norm_layer=partial(layers.LayerNormalization, epsilon=1e-05), + norm_layer=None, downsample=None, **kwargs ): diff --git a/keras_cv/models/backbones/video_swin/video_swin_backbone.py b/keras_cv/models/backbones/video_swin/video_swin_backbone.py index 8d40787afc..26cfd10bba 100644 --- a/keras_cv/models/backbones/video_swin/video_swin_backbone.py +++ b/keras_cv/models/backbones/video_swin/video_swin_backbone.py @@ -30,6 +30,10 @@ ) from keras_cv.utils.python_utils import classproperty +from keras_cv.layers.video_swin_layers import VideoSwinBasicLayer +from keras_cv.layers.video_swin_layers import VideoSwinPatchingAndEmbedding +from keras_cv.layers.video_swin_layers import VideoSwinPatchMerging + @keras_cv_export("keras_cv.models.VideoSwinBackbone", package="keras_cv.models") class VideoSwinBackbone(Backbone): @@ -80,7 +84,7 @@ def __init__( norm_layer = partial(layers.LayerNormalization, epsilon=1e-05) - x = PatchEmbed3D( + x = VideoSwinPatchingAndEmbedding( patch_size=patch_size, 
embed_dim=embed_dim, norm_layer=norm_layer if patch_norm else None, @@ -93,7 +97,7 @@ def __init__( num_layers = len(depths) for i in range(num_layers): - layer = BasicLayer( + layer = VideoSwinBasicLayer( input_dim=int(embed_dim * 2 ** i), depth=depths[i], num_heads=num_heads[i], @@ -105,7 +109,7 @@ def __init__( attn_drop_rate=attn_drop_rate, drop_path_rate=dpr[sum(depths[:i]):sum(depths[:i + 1])], norm_layer=norm_layer, - downsample=PatchMerging if (i < num_layers - 1) else None, + downsample=VideoSwinPatchMerging if (i < num_layers - 1) else None, name=f'BasicLayer{i + 1}' ) x = layer(x) From ed2864d5f93898d6e29d150cbda06c74cf1ac19f Mon Sep 17 00:00:00 2001 From: innat Date: Fri, 1 Mar 2024 18:10:02 +0600 Subject: [PATCH 13/94] update module import --- keras_cv/layers/video_swin_layers.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/keras_cv/layers/video_swin_layers.py b/keras_cv/layers/video_swin_layers.py index 4f30cb5c32..3373f1bfcd 100644 --- a/keras_cv/layers/video_swin_layers.py +++ b/keras_cv/layers/video_swin_layers.py @@ -588,7 +588,7 @@ def build(self, input_shape): # build blocks self.blocks = [ - SwinTransformerBlock3D( + VideoSwinTransformerBlock( self.input_dim, num_heads=self.num_heads, window_size=self.window_size, From bf70fa92a091f6a6ecb74abde698bc82cd76c876 Mon Sep 17 00:00:00 2001 From: innat Date: Fri, 1 Mar 2024 18:20:19 +0600 Subject: [PATCH 14/94] set class method to private usage --- keras_cv/layers/video_swin_layers.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/keras_cv/layers/video_swin_layers.py b/keras_cv/layers/video_swin_layers.py index 3373f1bfcd..739fb2cf7d 100644 --- a/keras_cv/layers/video_swin_layers.py +++ b/keras_cv/layers/video_swin_layers.py @@ -620,9 +620,9 @@ def compute_output_shape(self, input_shape): window_size, _ = get_window_size( input_shape[1:-1], self.window_size, self.shift_size ) - depth_p = self.compute_dim_padded(input_shape[1], window_size[0]) - height_p = self.compute_dim_padded(input_shape[2], window_size[1]) - width_p = self.compute_dim_padded(input_shape[3], window_size[2]) + depth_p = self._compute_dim_padded(input_shape[1], window_size[0]) + height_p = self._compute_dim_padded(input_shape[2], window_size[1]) + width_p = self._compute_dim_padded(input_shape[3], window_size[2]) if self.downsample is not None: output_shape = ( @@ -754,7 +754,7 @@ def build(self, input_shape): self.norm1 = self.norm_layer(axis=-1, epsilon=1e-05) self.norm1.build(input_shape) - self.attn = WindowAttention3D( + self.attn = VideoSwinWindowAttention( self.input_dim, window_size=self.window_size, num_heads=self.num_heads, From eca5023cff8c834f51e440b0fe05464edee8d4d2 Mon Sep 17 00:00:00 2001 From: innat Date: Fri, 1 Mar 2024 18:26:15 +0600 Subject: [PATCH 15/94] set init params for backbone --- .../video_swin/video_swin_backbone.py | 38 ++++++++----------- 1 file changed, 15 insertions(+), 23 deletions(-) diff --git a/keras_cv/models/backbones/video_swin/video_swin_backbone.py b/keras_cv/models/backbones/video_swin/video_swin_backbone.py index 26cfd10bba..0b10f5d9db 100644 --- a/keras_cv/models/backbones/video_swin/video_swin_backbone.py +++ b/keras_cv/models/backbones/video_swin/video_swin_backbone.py @@ -19,7 +19,6 @@ from keras import layers from keras_cv.api_export import keras_cv_export from keras_cv.backend import keras -from keras_cv.backend import ops from keras_cv.models import utils from keras_cv.models.backbones.backbone import Backbone from 
keras_cv.models.backbones.vit_det.vit_det_backbone_presets import ( @@ -41,21 +40,20 @@ def __init__( self, *, include_rescaling, - input_shape, - input_tensor, - embed_dim, - patch_size, - window_size, - mlp_ratio, - patch_norm, - drop_rate, - attn_drop_rate, - drop_path_rate, - depths, - num_heads, - qkv_bias, - qk_scale, - num_classes, + input_shape=(32, 224, 224, 3), + input_tensor=None, + embed_dim=96, + patch_size=[2, 4, 4], + window_size=[8, 7, 7], + mlp_ratio=4.0, + patch_norm=True, + drop_rate=0.0, + attn_drop_rate=0.0, + drop_path_rate=0.2, + depths=[2, 2, 6, 2], + num_heads=[3, 6, 12, 24], + qkv_bias=True, + qk_scale=None, **kwargs ): @@ -115,11 +113,7 @@ def __init__( x = layer(x) x = norm_layer(axis=-1, epsilon=1e-05, name='norm')(x) - x = layers.GlobalAveragePooling3D(name='gap3d')(x) - output = layers.Dense( - num_classes, use_bias=True, name='head', dtype='float32' - )(x) - super().__init__(inputs=input_spec, outputs=output, **kwargs) + super().__init__(inputs=input_spec, outputs=x, **kwargs) self.embed_dim = embed_dim self.patch_size = patch_size @@ -134,7 +128,6 @@ def __init__( self.num_heads = num_heads self.qkv_bias = qkv_bias self.qk_scale = qk_scale - self.num_classes = num_classes self.depths = depths def get_config(self): @@ -152,7 +145,6 @@ def get_config(self): "num_heads": self.num_heads, "qkv_bias": self.qkv_bias, "qk_scale": self.qk_scale, - "num_classes": self.num_classes, }) return config From 420e2291e8afb0985aa951fd7578b49af9407368 Mon Sep 17 00:00:00 2001 From: innat Date: Fri, 1 Mar 2024 18:31:31 +0600 Subject: [PATCH 16/94] rm redundant imports --- .../video_swin/video_swin_backbone.py | 19 +------------------ 1 file changed, 1 insertion(+), 18 deletions(-) diff --git a/keras_cv/models/backbones/video_swin/video_swin_backbone.py b/keras_cv/models/backbones/video_swin/video_swin_backbone.py index 0b10f5d9db..4a069b78b7 100644 --- a/keras_cv/models/backbones/video_swin/video_swin_backbone.py +++ b/keras_cv/models/backbones/video_swin/video_swin_backbone.py @@ -21,12 +21,6 @@ from keras_cv.backend import keras from keras_cv.models import utils from keras_cv.models.backbones.backbone import Backbone -from keras_cv.models.backbones.vit_det.vit_det_backbone_presets import ( - backbone_presets, -) -from keras_cv.models.backbones.vit_det.vit_det_backbone_presets import ( - backbone_presets_with_weights, -) from keras_cv.utils.python_utils import classproperty from keras_cv.layers.video_swin_layers import VideoSwinBasicLayer @@ -146,15 +140,4 @@ def get_config(self): "qkv_bias": self.qkv_bias, "qk_scale": self.qk_scale, }) - return config - - @classproperty - def presets(cls): - """Dictionary of preset names and configurations.""" - return copy.deepcopy(backbone_presets) - - @classproperty - def presets_with_weights(cls): - """Dictionary of preset names and configurations that include - weights.""" - return copy.deepcopy(backbone_presets_with_weights) \ No newline at end of file + return config \ No newline at end of file From f73e25b35614a83112b51118419f622ad206b15d Mon Sep 17 00:00:00 2001 From: innat Date: Fri, 1 Mar 2024 18:36:42 +0600 Subject: [PATCH 17/94] add video swin layer test cases --- keras_cv/layers/video_swin_layers_test.py | 71 +++++++++++++++++++++++ 1 file changed, 71 insertions(+) create mode 100644 keras_cv/layers/video_swin_layers_test.py diff --git a/keras_cv/layers/video_swin_layers_test.py b/keras_cv/layers/video_swin_layers_test.py new file mode 100644 index 0000000000..81bedcaa1b --- /dev/null +++ 
b/keras_cv/layers/video_swin_layers_test.py @@ -0,0 +1,71 @@ +# Copyright 2023 The KerasCV Authors +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import pytest + +from keras_cv.backend import ops +from keras_cv.layers.video_swin_layers import VideoSwinPatchingAndEmbedding +from keras_cv.layers.video_swin_layers import VideoSwinWindowAttention +from keras_cv.tests.test_case import TestCase + + +class TestVideoSwinPatchingAndEmbedding(TestCase): + def test_patch_embedding_compute_output_shape(self): + patch_embedding_model = VideoSwinPatchingAndEmbedding( + patch_size=(2, 4, 4), embed_dim=96, norm_layer=None + ) + input_shape = (None, 16, 32, 32, 3) + output_shape = patch_embedding_model.compute_output_shape(input_shape) + expected_output_shape = (None, 8, 8, 8, 96) + self.assertEqual(output_shape, expected_output_shape) + + def test_patch_embedding_get_config(self): + patch_embedding_model = VideoSwinPatchingAndEmbedding( + patch_size=(4, 4, 4), embed_dim=96 + ) + config = patch_embedding_model.get_config() + assert isinstance(config, dict) + assert config["patch_size"] == (4, 4, 4) + assert config["embed_dim"] == 96 + + +class TestVideoSwinWindowAttention(TestCase): + @pytest.fixture + def window_attention_model(self): + return VideoSwinWindowAttention( + window_size=(2, 4, 4), + num_heads=8, + qkv_bias=True, + qk_scale=None, + attn_drop_rate=0.1, + proj_drop_rate=0.1, + ) + + def test_window_attention_output_shape(self, window_attention_model): + input_shape = (4, 10, 256) + input_array = ops.ones(input_shape) + output_shape = window_attention_model(input_array).shape + expected_output_shape = input_shape + self.assertEqual(output_shape, expected_output_shape) + + def test_window_attention_get_config(self, window_attention_model): + config = window_attention_model.get_config() + # Add assertions based on your specific requirements + assert isinstance(config, dict) + assert config["window_size"] == (2, 4, 4) + assert config["num_heads"] == 8 + assert config["qkv_bias"] is True + assert config["qk_scale"] is None + assert config["attn_drop_rate"] == 0.1 + assert config["proj_drop_rate"] == 0.1 \ No newline at end of file From 1ccf7ee04486c33747417363119ca37b833449c9 Mon Sep 17 00:00:00 2001 From: innat Date: Fri, 1 Mar 2024 19:13:06 +0600 Subject: [PATCH 18/94] add: videoswin backbone aliases --- .../models/backbones/video_swin/__init__.py | 13 +++ .../video_swin/video_swin_aliases.py | 82 +++++++++++++++++++ 2 files changed, 95 insertions(+) create mode 100644 keras_cv/models/backbones/video_swin/__init__.py create mode 100644 keras_cv/models/backbones/video_swin/video_swin_aliases.py diff --git a/keras_cv/models/backbones/video_swin/__init__.py b/keras_cv/models/backbones/video_swin/__init__.py new file mode 100644 index 0000000000..1756010b15 --- /dev/null +++ b/keras_cv/models/backbones/video_swin/__init__.py @@ -0,0 +1,13 @@ +# Copyright 2023 The KerasCV Authors +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with 
the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. \ No newline at end of file diff --git a/keras_cv/models/backbones/video_swin/video_swin_aliases.py b/keras_cv/models/backbones/video_swin/video_swin_aliases.py new file mode 100644 index 0000000000..2595011007 --- /dev/null +++ b/keras_cv/models/backbones/video_swin/video_swin_aliases.py @@ -0,0 +1,82 @@ +# Copyright 2023 The KerasCV Authors +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import copy + +from keras_cv.models.backbones.video_swin.video_swin_backbone import VideoSwinBackbone +from keras_cv.utils.python_utils import classproperty + + +ALIAS_DOCSTRING = """VideoSwin{size}Backbone model. + + Reference: + - [Video Swin Transformer](https://arxiv.org/abs/2106.13230) + - [Video Swin Transformer GitHub](https://github.com/SwinTransformer/Video-Swin-Transformer) + + For transfer learning use cases, make sure to read the + [guide to transfer learning & fine-tuning](https://keras.io/guides/transfer_learning/). 
+ + Examples: + ```python + input_data = np.ones(shape=(1, 32, 224, 224, 3)) + + # Randomly initialized backbone + model = VideoSwin{size}Backbone() + output = model(input_data) + ``` +""" # noqa: E501 + +class VideoSwinTBackbone(VideoSwinBackbone): + def __new__( + cls, + **kwargs, + ): + return VideoSwinBackbone.from_preset("videoswin_tiny", **kwargs) + + @classproperty + def presets_with_weights(cls): + """Dictionary of preset names and configurations that include + weights.""" + return cls.presets + +class VideoSwinSBackbone(VideoSwinBackbone): + def __new__( + cls, + **kwargs, + ): + return VideoSwinBackbone.from_preset("videoswin_small", **kwargs) + + @classproperty + def presets_with_weights(cls): + """Dictionary of preset names and configurations that include + weights.""" + return cls.presets + +class VideoSwinBBackbone(VideoSwinBackbone): + def __new__( + cls, + **kwargs, + ): + return VideoSwinBackbone.from_preset("videoswin_base", **kwargs) + + @classproperty + def presets_with_weights(cls): + """Dictionary of preset names and configurations that include + weights.""" + return cls.presets + + +setattr(VideoSwinTBackbone, "__doc__", ALIAS_DOCSTRING.format(size="T")) +setattr(VideoSwinSBackbone, "__doc__", ALIAS_DOCSTRING.format(size="S")) +setattr(VideoSwinBBackbone, "__doc__", ALIAS_DOCSTRING.format(size="B")) \ No newline at end of file From c5d5fa2590e97cf20546de8dfd53051103b0b985 Mon Sep 17 00:00:00 2001 From: innat Date: Fri, 1 Mar 2024 19:32:49 +0600 Subject: [PATCH 19/94] add: video swin backbone presets --- .../video_swin/video_swin_backbone_presets.py | 49 +++++++++++++++++++ 1 file changed, 49 insertions(+) create mode 100644 keras_cv/models/backbones/video_swin/video_swin_backbone_presets.py diff --git a/keras_cv/models/backbones/video_swin/video_swin_backbone_presets.py b/keras_cv/models/backbones/video_swin/video_swin_backbone_presets.py new file mode 100644 index 0000000000..7d7ebec73f --- /dev/null +++ b/keras_cv/models/backbones/video_swin/video_swin_backbone_presets.py @@ -0,0 +1,49 @@ +# Copyright 2023 The KerasCV Authors +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+"""Video Swin model preset configurations.""" + +backbone_presets_no_weights = { + "videoswin_tiny": { + "metadata": { + "description": ( + "Video Swin backbone " # TODO: update + ), + "params": 27_850_470, + "official_name": "VideoSwinT", + "path": "video_swin", + }, + }, + + "videoswin_small": { + "metadata": { + "description": ( + "Video Swin backbone " # TODO: update + ), + "params": 49_509_078, + "official_name": "VideoSwinS", + "path": "video_swin", + }, + }, + + "videoswin_base": { + "metadata": { + "description": ( + "Video Swin backbone " # TODO: update + ), + "params": 87_638_984, + "official_name": "VideoSwinB", + "path": "video_swin", + }, + }, +} \ No newline at end of file From 27b65967cd6d46940602f2f91a5b0f5e18430b9d Mon Sep 17 00:00:00 2001 From: innat Date: Fri, 1 Mar 2024 19:40:51 +0600 Subject: [PATCH 20/94] add: video swin backbone presets test --- .../video_swin_backbone_presets_test.py | 54 +++++++++++++++++++ 1 file changed, 54 insertions(+) create mode 100644 keras_cv/models/backbones/video_swin/video_swin_backbone_presets_test.py diff --git a/keras_cv/models/backbones/video_swin/video_swin_backbone_presets_test.py b/keras_cv/models/backbones/video_swin/video_swin_backbone_presets_test.py new file mode 100644 index 0000000000..9d48b475ca --- /dev/null +++ b/keras_cv/models/backbones/video_swin/video_swin_backbone_presets_test.py @@ -0,0 +1,54 @@ +# Copyright 2023 The KerasCV Authors +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Tests for loading pretrained model presets.""" + +import numpy as np +import pytest + +from keras_cv.models.backbones.video_swin.video_swin_backbone import VideoSwinBackbone +from keras_cv.models.backbones.video_swin.video_swin_aliases import VideoSwinTBackbone +from keras_cv.tests.test_case import TestCase + +@pytest.mark.large +class VideoSwinPresetSmokeTest(TestCase): + """ + A smoke test for VideoSwin presets we run continuously. + This only tests the smallest weights we have available. 
Run with: + `pytest keras_cv/models/backbones/video_swin/video_swin_backbone_presets_test.py --run_large` # noqa: E501 + """ + + def setUp(self): + self.input_batch = np.ones(shape=(1, 32, 224, 224, 3)) + + def test_applications_model_output(self): + model = VideoSwinBackbone() + model(self.input_batch) + + def test_applications_model_output_with_preset(self): + model = VideoSwinBackbone.from_preset("videoswin_tiny") + model(self.input_batch) + + def test_applications_model_predict(self): + model = VideoSwinTBackbone() + model.predict(self.input_batch) + + def test_preset_docstring(self): + """Check we did our docstring formatting correctly.""" + for name in VideoSwinBackbone.presets: + self.assertRegex(VideoSwinBackbone.from_preset.__doc__, name) + + def test_unknown_preset_error(self): + # Not a preset name + with self.assertRaises(ValueError): + VideoSwinBackbone.from_preset("videoswin_nonexistant") \ No newline at end of file From 814db52f5aeaa6982aa5c86ea6c987957970284c Mon Sep 17 00:00:00 2001 From: innat Date: Fri, 1 Mar 2024 19:42:18 +0600 Subject: [PATCH 21/94] update: video swin backbone presets test --- .../video_swin_backbone_presets_test.py | 20 ++++++++++++++++--- 1 file changed, 17 insertions(+), 3 deletions(-) diff --git a/keras_cv/models/backbones/video_swin/video_swin_backbone_presets_test.py b/keras_cv/models/backbones/video_swin/video_swin_backbone_presets_test.py index 9d48b475ca..77d80bb4d3 100644 --- a/keras_cv/models/backbones/video_swin/video_swin_backbone_presets_test.py +++ b/keras_cv/models/backbones/video_swin/video_swin_backbone_presets_test.py @@ -22,8 +22,7 @@ @pytest.mark.large class VideoSwinPresetSmokeTest(TestCase): - """ - A smoke test for VideoSwin presets we run continuously. + """A smoke test for VideoSwin presets we run continuously. This only tests the smallest weights we have available. Run with: `pytest keras_cv/models/backbones/video_swin/video_swin_backbone_presets_test.py --run_large` # noqa: E501 """ @@ -51,4 +50,19 @@ def test_preset_docstring(self): def test_unknown_preset_error(self): # Not a preset name with self.assertRaises(ValueError): - VideoSwinBackbone.from_preset("videoswin_nonexistant") \ No newline at end of file + VideoSwinBackbone.from_preset("videoswin_nonexistant") + + +@pytest.mark.extra_large +class VideoSwinPresetFullTest(TestCase): + """Test the full enumeration of our preset. + This tests every preset for VideoSwin and is only run manually. 
+ Run with: + `pytest keras_cv/models/backbones/video_swin/video_swin_backbone_presets_test.py --run_extra_large` # noqa: E501 + """ + + def test_load_ViTDet(self): + input_data = np.ones(shape=(1, 1024, 1024, 3)) + for preset in VideoSwinBackbone.presets: + model = VideoSwinBackbone.from_preset(preset) + model(input_data) \ No newline at end of file From cc6ac2126d06b1b77ab0c8879346243b877cbe29 Mon Sep 17 00:00:00 2001 From: innat Date: Fri, 1 Mar 2024 19:51:03 +0600 Subject: [PATCH 22/94] add: video classifier task --- .../models/classification/video_classifier.py | 146 ++++++++++++++++++ .../video_classifier_presets.py | 14 ++ 2 files changed, 160 insertions(+) create mode 100644 keras_cv/models/classification/video_classifier.py create mode 100644 keras_cv/models/classification/video_classifier_presets.py diff --git a/keras_cv/models/classification/video_classifier.py b/keras_cv/models/classification/video_classifier.py new file mode 100644 index 0000000000..2d5b7f61ea --- /dev/null +++ b/keras_cv/models/classification/video_classifier.py @@ -0,0 +1,146 @@ +# Copyright 2023 The KerasCV Authors +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Video classifier model using pooling and dense layers.""" + +import copy + +from keras_cv.api_export import keras_cv_export +from keras_cv.backend import keras +from keras_cv.models.task import Task +from keras_cv.utils.python_utils import classproperty +from keras_cv.models.backbones.backbone_presets import backbone_presets +from keras_cv.models.backbones.backbone_presets import ( + backbone_presets_with_weights, +) +from keras_cv.models.classification.video_classifier_presets import ( + classifier_presets, +) + +@keras_cv_export( + [ + "keras_cv.models.VideoClassifier", + "keras_cv.models.classification.VideoClassifier", + ] +) +class VideoClassifier(Task): + """Video classifier with pooling and dense layer prediction head. + + Args: + backbone: `keras.Model` instance, the backbone architecture of the + classifier called on the inputs. Pooling will be called on the last + dimension of the backbone output. + num_classes: int, number of classes to predict. + pooling: str, type of pooling layer. Must be one of "avg", "max". + activation: Optional `str` or callable, defaults to "softmax". The + activation function to use on the Dense layer. Set `activation=None` + to return the output logits. 
+ + Example: + ```python + input_data = keras.ops.ones(shape=(1, 32, 224, 224, 3)) + + # Pretrained classifier (e.g., for imagenet categories) + model = keras_cv.models.VideoClassifier.from_preset( + "videoswin_tiny_imagenet_classifier", + ) + output = model(input_data) + + # Pretrained backbone + backbone = keras_cv.models.VideoSwinBackbone.from_preset( + "videoswin_tiny_imagenet", + ) + model = keras_cv.models.VideoClassifier( + backbone=backbone, + num_classes=400, + ) + output = model(input_data) + + # Randomly initialized backbone with a custom config + model = keras_cv.models.VideoClassifier( + backbone=keras_cv.models.VideoSwinBackbone(), + num_classes=400, + ) + output = model(input_data) + ``` + """ + + def __init__( + self, + backbone, + num_classes, + pooling="avg", + activation="softmax", + **kwargs, + ): + if pooling == "avg": + pooling_layer = keras.layers.GlobalAveragePooling3D(name="avg_pool") + elif pooling == "max": + pooling_layer = keras.layers.GlobalMaxPooling3D(name="max_pool") + else: + raise ValueError( + f'`pooling` must be one of "avg", "max". Received: {pooling}.' + ) + inputs = backbone.input + x = backbone(inputs) + x = pooling_layer(x) + outputs = keras.layers.Dense( + num_classes, + activation=activation, + name="predictions", + dtype='float32' + )(x) + + # Instantiate using Functional API Model constructor + super().__init__( + inputs=inputs, + outputs=outputs, + **kwargs, + ) + # All references to `self` below this line + self.backbone = backbone + self.num_classes = num_classes + self.pooling = pooling + self.activation = activation + + def get_config(self): + # Backbone serialized in `super` + config = super().get_config() + config.update( + { + "backbone": keras.layers.serialize(self.backbone), + "num_classes": self.num_classes, + "pooling": self.pooling, + "activation": self.activation, + } + ) + return config + + @classproperty + def presets(cls): + """Dictionary of preset names and configurations.""" + return copy.deepcopy({**backbone_presets, **classifier_presets}) + + @classproperty + def presets_with_weights(cls): + """Dictionary of preset names and configurations that include + weights.""" + return copy.deepcopy( + {**backbone_presets_with_weights, **classifier_presets} + ) + + @classproperty + def backbone_presets(cls): + """Dictionary of preset names and configurations of compatible + backbones.""" + return copy.deepcopy(backbone_presets) \ No newline at end of file diff --git a/keras_cv/models/classification/video_classifier_presets.py b/keras_cv/models/classification/video_classifier_presets.py new file mode 100644 index 0000000000..8c3d9a2d71 --- /dev/null +++ b/keras_cv/models/classification/video_classifier_presets.py @@ -0,0 +1,14 @@ +# Copyright 2023 The KerasCV Authors +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
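Because `VideoClassifier` is assembled with the functional API on top of a backbone, it compiles and trains like any other Keras model. A hedged end-to-end sketch (the optimizer, loss, and `train_ds`/`val_ds` datasets below are illustrative placeholders, not part of this PR):

```python
import keras_cv

# Build the task from a randomly initialized backbone; 400 classes
# matches the Kinetics-400 label space used by the presets.
model = keras_cv.models.VideoClassifier(
    backbone=keras_cv.models.VideoSwinBackbone(include_rescaling=True),
    num_classes=400,
    pooling="avg",
)
model.compile(
    optimizer="adam",
    loss="sparse_categorical_crossentropy",
    metrics=["accuracy"],
)
# model.fit(train_ds, validation_data=val_ds, epochs=3)
```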
+"""VideoClassifier Task presets.""" \ No newline at end of file From d2d883d741063dc9fb12b36b981a635d5859e227 Mon Sep 17 00:00:00 2001 From: innat Date: Fri, 1 Mar 2024 19:57:07 +0600 Subject: [PATCH 23/94] add: video swin classifier presets --- .../video_classifier_presets.py | 45 ++++++++++++++++++- 1 file changed, 44 insertions(+), 1 deletion(-) diff --git a/keras_cv/models/classification/video_classifier_presets.py b/keras_cv/models/classification/video_classifier_presets.py index 8c3d9a2d71..2a8447bd20 100644 --- a/keras_cv/models/classification/video_classifier_presets.py +++ b/keras_cv/models/classification/video_classifier_presets.py @@ -11,4 +11,47 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -"""VideoClassifier Task presets.""" \ No newline at end of file +"""VideoClassifier Task presets.""" + +classifier_presets = { + "videoswin_tiny_kinetics_classifier": { + "metadata": { + "description": ( + "videoswin_tiny_kinetics " # TODO: update + ), + "params": 25_613_800, + "official_name": "VideoClassifier", + "path": "video_classifier", + }, + }, + "videoswin_small_kinetics_classifier": { + "metadata": { + "description": ( + "videoswin_small_kinetics " # TODO: update + ), + "params": 25_613_800, # TODO: update + "official_name": "VideoClassifier", + "path": "video_classifier", + }, + }, + "videoswin_base_kinetics_classifier": { + "metadata": { + "description": ( + "videoswin_base_kinetics " # TODO: update + ), + "params": 25_613_800, # TODO: update + "official_name": "VideoClassifier", + "path": "video_classifier", + }, + }, + "videoswin_base_something_something_v2_classifier": { + "metadata": { + "description": ( + "videoswin_base_something_something_v2 " # TODO: update + ), + "params": 25_613_800, # TODO: update + "official_name": "VideoClassifier", + "path": "video_classifier", + }, + }, +} \ No newline at end of file From 125b2dc7fee2d7daeafb74a7a56670fcc06e48a8 Mon Sep 17 00:00:00 2001 From: innat Date: Fri, 1 Mar 2024 20:00:38 +0600 Subject: [PATCH 24/94] run formatters --- .../models/backbones/video_swin/__init__.py | 2 +- .../video_swin/video_swin_aliases.py | 17 ++-- .../video_swin/video_swin_backbone.py | 77 ++++++++++--------- .../video_swin/video_swin_backbone_presets.py | 16 +--- .../video_swin_backbone_presets_test.py | 11 ++- 5 files changed, 62 insertions(+), 61 deletions(-) diff --git a/keras_cv/models/backbones/video_swin/__init__.py b/keras_cv/models/backbones/video_swin/__init__.py index 1756010b15..3992ffb59a 100644 --- a/keras_cv/models/backbones/video_swin/__init__.py +++ b/keras_cv/models/backbones/video_swin/__init__.py @@ -10,4 +10,4 @@ # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and -# limitations under the License. \ No newline at end of file +# limitations under the License. diff --git a/keras_cv/models/backbones/video_swin/video_swin_aliases.py b/keras_cv/models/backbones/video_swin/video_swin_aliases.py index 2595011007..e18ca41f57 100644 --- a/keras_cv/models/backbones/video_swin/video_swin_aliases.py +++ b/keras_cv/models/backbones/video_swin/video_swin_aliases.py @@ -12,12 +12,12 @@ # See the License for the specific language governing permissions and # limitations under the License. 
-import copy -from keras_cv.models.backbones.video_swin.video_swin_backbone import VideoSwinBackbone +from keras_cv.models.backbones.video_swin.video_swin_backbone import ( + VideoSwinBackbone, +) from keras_cv.utils.python_utils import classproperty - ALIAS_DOCSTRING = """VideoSwin{size}Backbone model. Reference: @@ -37,6 +37,7 @@ ``` """ # noqa: E501 + class VideoSwinTBackbone(VideoSwinBackbone): def __new__( cls, @@ -49,7 +50,8 @@ def presets_with_weights(cls): """Dictionary of preset names and configurations that include weights.""" return cls.presets - + + class VideoSwinSBackbone(VideoSwinBackbone): def __new__( cls, @@ -62,7 +64,8 @@ def presets_with_weights(cls): """Dictionary of preset names and configurations that include weights.""" return cls.presets - + + class VideoSwinBBackbone(VideoSwinBackbone): def __new__( cls, @@ -75,8 +78,8 @@ def presets_with_weights(cls): """Dictionary of preset names and configurations that include weights.""" return cls.presets - + setattr(VideoSwinTBackbone, "__doc__", ALIAS_DOCSTRING.format(size="T")) setattr(VideoSwinSBackbone, "__doc__", ALIAS_DOCSTRING.format(size="S")) -setattr(VideoSwinBBackbone, "__doc__", ALIAS_DOCSTRING.format(size="B")) \ No newline at end of file +setattr(VideoSwinBBackbone, "__doc__", ALIAS_DOCSTRING.format(size="B")) diff --git a/keras_cv/models/backbones/video_swin/video_swin_backbone.py b/keras_cv/models/backbones/video_swin/video_swin_backbone.py index 4a069b78b7..014b39fc30 100644 --- a/keras_cv/models/backbones/video_swin/video_swin_backbone.py +++ b/keras_cv/models/backbones/video_swin/video_swin_backbone.py @@ -12,20 +12,18 @@ # See the License for the specific language governing permissions and # limitations under the License. -import copy -import numpy as np from functools import partial +import numpy as np from keras import layers + from keras_cv.api_export import keras_cv_export from keras_cv.backend import keras -from keras_cv.models import utils -from keras_cv.models.backbones.backbone import Backbone -from keras_cv.utils.python_utils import classproperty - from keras_cv.layers.video_swin_layers import VideoSwinBasicLayer from keras_cv.layers.video_swin_layers import VideoSwinPatchingAndEmbedding from keras_cv.layers.video_swin_layers import VideoSwinPatchMerging +from keras_cv.models import utils +from keras_cv.models.backbones.backbone import Backbone @keras_cv_export("keras_cv.models.VideoSwinBackbone", package="keras_cv.models") @@ -35,12 +33,12 @@ def __init__( *, include_rescaling, input_shape=(32, 224, 224, 3), - input_tensor=None, - embed_dim=96, - patch_size=[2, 4, 4], + input_tensor=None, + embed_dim=96, + patch_size=[2, 4, 4], window_size=[8, 7, 7], mlp_ratio=4.0, - patch_norm=True, + patch_norm=True, drop_rate=0.0, attn_drop_rate=0.0, drop_path_rate=0.2, @@ -48,9 +46,8 @@ def __init__( num_heads=[3, 6, 12, 24], qkv_bias=True, qk_scale=None, - **kwargs + **kwargs, ): - input_spec = utils.parse_model_inputs( input_shape, input_tensor, name="videos" ) @@ -67,7 +64,7 @@ def __init__( " be equal to the width in the `input_shape`" " tuple/tensor." 
) - + x = input_spec if include_rescaling: @@ -80,17 +77,17 @@ def __init__( patch_size=patch_size, embed_dim=embed_dim, norm_layer=norm_layer if patch_norm else None, - name='PatchEmbed3D' + name="PatchEmbed3D", )(x) - x = layers.Dropout(drop_rate, name='pos_drop')(x) - dpr = np.linspace(0., drop_path_rate, sum(depths)).tolist() + x = layers.Dropout(drop_rate, name="pos_drop")(x) + dpr = np.linspace(0.0, drop_path_rate, sum(depths)).tolist() num_layers = len(depths) - + for i in range(num_layers): layer = VideoSwinBasicLayer( - input_dim=int(embed_dim * 2 ** i), + input_dim=int(embed_dim * 2**i), depth=depths[i], num_heads=num_heads[i], window_size=window_size, @@ -99,16 +96,18 @@ def __init__( qk_scale=qk_scale, drop_rate=drop_rate, attn_drop_rate=attn_drop_rate, - drop_path_rate=dpr[sum(depths[:i]):sum(depths[:i + 1])], + drop_path_rate=dpr[sum(depths[:i]) : sum(depths[: i + 1])], norm_layer=norm_layer, - downsample=VideoSwinPatchMerging if (i < num_layers - 1) else None, - name=f'BasicLayer{i + 1}' + downsample=VideoSwinPatchMerging + if (i < num_layers - 1) + else None, + name=f"BasicLayer{i + 1}", ) x = layer(x) - x = norm_layer(axis=-1, epsilon=1e-05, name='norm')(x) + x = norm_layer(axis=-1, epsilon=1e-05, name="norm")(x) super().__init__(inputs=input_spec, outputs=x, **kwargs) - + self.embed_dim = embed_dim self.patch_size = patch_size self.window_size = window_size @@ -126,18 +125,20 @@ def __init__( def get_config(self): config = super().get_config() - config.update({ - "embed_dim": self.embed_dim, - "patch_norm": self.patch_norm, - "window_size": self.window_size, - "patch_size": self.patch_size, - "mlp_ratio": self.mlp_ratio, - "drop_rate": self.drop_rate, - "drop_path_rate": self.drop_path_rate, - "attn_drop_rate": self.attn_drop_rate, - "depths": self.depths, - "num_heads": self.num_heads, - "qkv_bias": self.qkv_bias, - "qk_scale": self.qk_scale, - }) - return config \ No newline at end of file + config.update( + { + "embed_dim": self.embed_dim, + "patch_norm": self.patch_norm, + "window_size": self.window_size, + "patch_size": self.patch_size, + "mlp_ratio": self.mlp_ratio, + "drop_rate": self.drop_rate, + "drop_path_rate": self.drop_path_rate, + "attn_drop_rate": self.attn_drop_rate, + "depths": self.depths, + "num_heads": self.num_heads, + "qkv_bias": self.qkv_bias, + "qk_scale": self.qk_scale, + } + ) + return config diff --git a/keras_cv/models/backbones/video_swin/video_swin_backbone_presets.py b/keras_cv/models/backbones/video_swin/video_swin_backbone_presets.py index 7d7ebec73f..ff7827f3b8 100644 --- a/keras_cv/models/backbones/video_swin/video_swin_backbone_presets.py +++ b/keras_cv/models/backbones/video_swin/video_swin_backbone_presets.py @@ -16,34 +16,26 @@ backbone_presets_no_weights = { "videoswin_tiny": { "metadata": { - "description": ( - "Video Swin backbone " # TODO: update - ), + "description": ("Video Swin backbone "), # TODO: update "params": 27_850_470, "official_name": "VideoSwinT", "path": "video_swin", }, }, - "videoswin_small": { "metadata": { - "description": ( - "Video Swin backbone " # TODO: update - ), + "description": ("Video Swin backbone "), # TODO: update "params": 49_509_078, "official_name": "VideoSwinS", "path": "video_swin", }, }, - "videoswin_base": { "metadata": { - "description": ( - "Video Swin backbone " # TODO: update - ), + "description": ("Video Swin backbone "), # TODO: update "params": 87_638_984, "official_name": "VideoSwinB", "path": "video_swin", }, }, -} \ No newline at end of file +} diff --git 
a/keras_cv/models/backbones/video_swin/video_swin_backbone_presets_test.py b/keras_cv/models/backbones/video_swin/video_swin_backbone_presets_test.py index 77d80bb4d3..88b4763204 100644 --- a/keras_cv/models/backbones/video_swin/video_swin_backbone_presets_test.py +++ b/keras_cv/models/backbones/video_swin/video_swin_backbone_presets_test.py @@ -16,10 +16,15 @@ import numpy as np import pytest -from keras_cv.models.backbones.video_swin.video_swin_backbone import VideoSwinBackbone -from keras_cv.models.backbones.video_swin.video_swin_aliases import VideoSwinTBackbone +from keras_cv.models.backbones.video_swin.video_swin_aliases import ( + VideoSwinTBackbone, +) +from keras_cv.models.backbones.video_swin.video_swin_backbone import ( + VideoSwinBackbone, +) from keras_cv.tests.test_case import TestCase + @pytest.mark.large class VideoSwinPresetSmokeTest(TestCase): """A smoke test for VideoSwin presets we run continuously. @@ -65,4 +70,4 @@ def test_load_ViTDet(self): input_data = np.ones(shape=(1, 1024, 1024, 3)) for preset in VideoSwinBackbone.presets: model = VideoSwinBackbone.from_preset(preset) - model(input_data) \ No newline at end of file + model(input_data) From 98273022524781ef33f08614ff597e82d64674db Mon Sep 17 00:00:00 2001 From: innat Date: Sat, 2 Mar 2024 19:12:25 +0600 Subject: [PATCH 25/94] rename module name/id" --- keras_cv/layers/video_swin_layers.py | 5 +++-- .../models/backbones/video_swin/video_swin_backbone.py | 8 +++----- 2 files changed, 6 insertions(+), 7 deletions(-) diff --git a/keras_cv/layers/video_swin_layers.py b/keras_cv/layers/video_swin_layers.py index 739fb2cf7d..af682ea27c 100644 --- a/keras_cv/layers/video_swin_layers.py +++ b/keras_cv/layers/video_swin_layers.py @@ -12,8 +12,6 @@ # See the License for the specific language governing permissions and # limitations under the License. -from functools import partial - import numpy as np from keras import layers @@ -679,6 +677,9 @@ def get_config(self): return config +@keras_cv_export( + "keras_cv.layers.VideoSwinTransformerBlock", package="keras_cv.layers" +) class VideoSwinTransformerBlock(keras.Model): """Swin Transformer Block. 
diff --git a/keras_cv/models/backbones/video_swin/video_swin_backbone.py b/keras_cv/models/backbones/video_swin/video_swin_backbone.py
index 014b39fc30..2d06a48a43 100644
--- a/keras_cv/models/backbones/video_swin/video_swin_backbone.py
+++ b/keras_cv/models/backbones/video_swin/video_swin_backbone.py
@@ -77,14 +77,12 @@ def __init__(
             patch_size=patch_size,
             embed_dim=embed_dim,
             norm_layer=norm_layer if patch_norm else None,
-            name="PatchEmbed3D",
+            name="videoswin_patching_and_embedding",
         )(x)
-        x = layers.Dropout(drop_rate, name="pos_drop")(x)
 
-        dpr = np.linspace(0.0, drop_path_rate, sum(depths)).tolist()
+        x = layers.Dropout(drop_rate, name="pos_drop")(x)
 
+        dpr = np.linspace(0.0, drop_path_rate, sum(depths)).tolist()
         num_layers = len(depths)
-
         for i in range(num_layers):
             layer = VideoSwinBasicLayer(
                 input_dim=int(embed_dim * 2**i),
@@ -101,7 +99,7 @@ def __init__(
                 downsample=VideoSwinPatchMerging
                 if (i < num_layers - 1)
                 else None,
-                name=f"BasicLayer{i + 1}",
+                name=f"videoswin_basic_layer_{i + 1}",
             )
             x = layer(x)

From 89a715aaaf44e5be4ad8a3a03d4123c46df6efc0 Mon Sep 17 00:00:00 2001
From: innat
Date: Sat, 2 Mar 2024 19:22:18 +0600
Subject: [PATCH 26/94] add hard-coded normalization for include rescaling=true

---
 .../backbones/video_swin/video_swin_backbone.py | 11 +++++++++++
 1 file changed, 11 insertions(+)

diff --git a/keras_cv/models/backbones/video_swin/video_swin_backbone.py b/keras_cv/models/backbones/video_swin/video_swin_backbone.py
index 2d06a48a43..48d66cead0 100644
--- a/keras_cv/models/backbones/video_swin/video_swin_backbone.py
+++ b/keras_cv/models/backbones/video_swin/video_swin_backbone.py
@@ -16,6 +16,7 @@
 import numpy as np
 from keras import layers
+from keras_cv.backend import ops
 
 from keras_cv.api_export import keras_cv_export
 from keras_cv.backend import keras
@@ -71,6 +72,16 @@ def __init__(
             # Use common rescaling strategy across keras_cv
             x = keras.layers.Rescaling(1.0 / 255.0)(x)
 
+            # Video Swin scales inputs based on the standard ImageNet mean/stddev.
+            # Officially, Video Swin takes tensors in the [0-255] range.
+            # It uses mean=[123.675, 116.28, 103.53] and
+            # std=[58.395, 57.12, 57.375] for normalization.
+            # So, if include_rescaling is set to True, then, to match with the
+            # official scores, the following normalization should be added.
+            x = (x - ops.array([0.485, 0.456, 0.406], dtype=x.dtype)) / (
+                ops.array([0.229, 0.224, 0.225], dtype=x.dtype)
+            )
+
         norm_layer = partial(layers.LayerNormalization, epsilon=1e-05)
 
         x = VideoSwinPatchingAndEmbedding(

From 36db030b4398f4051553f6b2f084069e1b1e730c Mon Sep 17 00:00:00 2001
From: innat
Date: Sat, 2 Mar 2024 19:37:17 +0600
Subject: [PATCH 27/94] add docstring for videoswin backbone

---
 .../video_swin/video_swin_backbone.py | 43 ++++++++++++++++++-
 1 file changed, 41 insertions(+), 2 deletions(-)

diff --git a/keras_cv/models/backbones/video_swin/video_swin_backbone.py b/keras_cv/models/backbones/video_swin/video_swin_backbone.py
index 48d66cead0..6efee5d5d6 100644
--- a/keras_cv/models/backbones/video_swin/video_swin_backbone.py
+++ b/keras_cv/models/backbones/video_swin/video_swin_backbone.py
@@ -29,6 +29,45 @@
 @keras_cv_export("keras_cv.models.VideoSwinBackbone", package="keras_cv.models")
 class VideoSwinBackbone(Backbone):
+    """A Video Swin Transformer backbone model.
+
+    Args:
+        input_shape (tuple[int], optional): The size of the input video in
+            `(depth, height, width, channel)` format.
+            Defaults to `(32, 224, 224, 3)`.
+        input_tensor (KerasTensor, optional): Output of
+            `keras.layers.Input()` to use as video input for the model.
+            Defaults to `None`.
+ include_rescaling (bool, optional): Whether to rescale the inputs. If + set to `True`, inputs will be passed through a + `Rescaling(1/255.0)` layer and normalize with + mean=[0.485, 0.456, 0.406] and std=[0.229, 0.224, 0.225], + Defaults to `False`. + patch_size (int | tuple(int)): Patch size. Default: (2,4,4). + embed_dim (int): Number of linear projection output channels. + Default to 96. + depths (tuple[int]): Depths of each Swin Transformer stage. + Default to [2, 2, 6, 2] + num_heads (tuple[int]): Number of attention head of each stage. + Default to [3, 6, 12, 24] + window_size (int): Window size. Default to [8, 7, 7]. + mlp_ratio (float): Ratio of mlp hidden dim to embedding dim. + Default to 4. + qkv_bias (bool): If True, add a learnable bias to query, key, value. + Default to True. + qk_scale (float): Override default qk scale of head_dim ** -0.5 if set. + Default to None. + drop_rate (float): Dropout rate. + attn_drop_rate (float): Attention dropout rate. Default: 0. + drop_path_rate (float): Stochastic depth rate. Default: 0.2. + patch_norm (bool): If True, add normalization after patch embedding. + Default to False. + + References: + - [Video Swin Transformer](https://arxiv.org/abs/2106.13230) + - [Official Code](https://github.com/SwinTransformer/Video-Swin-Transformer) + """ # noqa: E501 + def __init__( self, *, @@ -54,9 +93,9 @@ def __init__( ) # Check that the input video is well specified. - if input_spec.shape[-3] is None or input_spec.shape[-2] is None: + if input_spec.shape[-4] is None or input_spec.shape[-3] is None or input_spec.shape[-2] is None: raise ValueError( - "Height and width of the video must be specified" + "Depth, Height and width of the video must be specified" " in `input_shape`." ) if input_spec.shape[-3] != input_spec.shape[-2]: From 7aa27a4aa881f906c8c00c89a59d86053d43ef7d Mon Sep 17 00:00:00 2001 From: innat Date: Sat, 2 Mar 2024 21:51:22 +0600 Subject: [PATCH 28/94] update metadata: backbone presets no weights --- .../backbones/video_swin/video_swin_backbone_presets.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/keras_cv/models/backbones/video_swin/video_swin_backbone_presets.py b/keras_cv/models/backbones/video_swin/video_swin_backbone_presets.py index ff7827f3b8..f1e330ea33 100644 --- a/keras_cv/models/backbones/video_swin/video_swin_backbone_presets.py +++ b/keras_cv/models/backbones/video_swin/video_swin_backbone_presets.py @@ -16,7 +16,7 @@ backbone_presets_no_weights = { "videoswin_tiny": { "metadata": { - "description": ("Video Swin backbone "), # TODO: update + "description": ("A tiny Video Swin backbone architecture."), "params": 27_850_470, "official_name": "VideoSwinT", "path": "video_swin", @@ -24,7 +24,7 @@ }, "videoswin_small": { "metadata": { - "description": ("Video Swin backbone "), # TODO: update + "description": ("A small Video Swin backbone architecture."), "params": 49_509_078, "official_name": "VideoSwinS", "path": "video_swin", @@ -32,7 +32,7 @@ }, "videoswin_base": { "metadata": { - "description": ("Video Swin backbone "), # TODO: update + "description": ("A base Video Swin backbone architecture."), "params": 87_638_984, "official_name": "VideoSwinB", "path": "video_swin", From 62a87032d4272580b1539478d96e3a4db0ea80b4 Mon Sep 17 00:00:00 2001 From: innat Date: Sat, 2 Mar 2024 22:10:56 +0600 Subject: [PATCH 29/94] update: backbone presets no weights test --- .../backbones/video_swin/video_swin_backbone_presets_test.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git 
a/keras_cv/models/backbones/video_swin/video_swin_backbone_presets_test.py b/keras_cv/models/backbones/video_swin/video_swin_backbone_presets_test.py index 88b4763204..80996fcbfa 100644 --- a/keras_cv/models/backbones/video_swin/video_swin_backbone_presets_test.py +++ b/keras_cv/models/backbones/video_swin/video_swin_backbone_presets_test.py @@ -28,7 +28,7 @@ @pytest.mark.large class VideoSwinPresetSmokeTest(TestCase): """A smoke test for VideoSwin presets we run continuously. - This only tests the smallest weights we have available. Run with: + Run with: `pytest keras_cv/models/backbones/video_swin/video_swin_backbone_presets_test.py --run_large` # noqa: E501 """ @@ -67,7 +67,7 @@ class VideoSwinPresetFullTest(TestCase): """ def test_load_ViTDet(self): - input_data = np.ones(shape=(1, 1024, 1024, 3)) + input_data = np.ones(shape=(1, 32, 224, 224, 3)) for preset in VideoSwinBackbone.presets: model = VideoSwinBackbone.from_preset(preset) model(input_data) From aad56618b101b91dada422627618508f6a521292 Mon Sep 17 00:00:00 2001 From: innat Date: Sat, 2 Mar 2024 22:38:18 +0600 Subject: [PATCH 30/94] update video swin aliases for no weights --- .../video_swin/video_swin_aliases.py | 39 +++++++++++++------ 1 file changed, 27 insertions(+), 12 deletions(-) diff --git a/keras_cv/models/backbones/video_swin/video_swin_aliases.py b/keras_cv/models/backbones/video_swin/video_swin_aliases.py index e18ca41f57..0ffeaa7ab1 100644 --- a/keras_cv/models/backbones/video_swin/video_swin_aliases.py +++ b/keras_cv/models/backbones/video_swin/video_swin_aliases.py @@ -41,43 +41,58 @@ class VideoSwinTBackbone(VideoSwinBackbone): def __new__( cls, + include_rescaling=False, **kwargs, ): + kwargs.update( + { + "include_rescaling": include_rescaling, + } + ) return VideoSwinBackbone.from_preset("videoswin_tiny", **kwargs) @classproperty - def presets_with_weights(cls): - """Dictionary of preset names and configurations that include - weights.""" - return cls.presets + def presets(cls): + """Dictionary of preset names and configurations.""" + return {} class VideoSwinSBackbone(VideoSwinBackbone): def __new__( cls, + include_rescaling=False, **kwargs, ): + kwargs.update( + { + "include_rescaling": include_rescaling, + } + ) return VideoSwinBackbone.from_preset("videoswin_small", **kwargs) @classproperty - def presets_with_weights(cls): - """Dictionary of preset names and configurations that include - weights.""" - return cls.presets + def presets(cls): + """Dictionary of preset names and configurations.""" + return {} class VideoSwinBBackbone(VideoSwinBackbone): def __new__( cls, + include_rescaling=False, **kwargs, ): + kwargs.update( + { + "include_rescaling": include_rescaling, + } + ) return VideoSwinBackbone.from_preset("videoswin_base", **kwargs) @classproperty - def presets_with_weights(cls): - """Dictionary of preset names and configurations that include - weights.""" - return cls.presets + def presets(cls): + """Dictionary of preset names and configurations.""" + return {} setattr(VideoSwinTBackbone, "__doc__", ALIAS_DOCSTRING.format(size="T")) From 048d85ad3b4c527ab93abb9bbfca60f67ba36b09 Mon Sep 17 00:00:00 2001 From: innat Date: Sat, 2 Mar 2024 22:53:46 +0600 Subject: [PATCH 31/94] add: video swin backbone presets with weights --- .../video_swin/video_swin_backbone_presets.py | 63 +++++++++++++++++++ 1 file changed, 63 insertions(+) diff --git a/keras_cv/models/backbones/video_swin/video_swin_backbone_presets.py b/keras_cv/models/backbones/video_swin/video_swin_backbone_presets.py index 
f1e330ea33..801c5de906 100644 --- a/keras_cv/models/backbones/video_swin/video_swin_backbone_presets.py +++ b/keras_cv/models/backbones/video_swin/video_swin_backbone_presets.py @@ -39,3 +39,66 @@ }, }, } + +backbone_presets_with_weights = { + "videoswin_tiny_kinetics400": { + "metadata": { + "description": ( + "A tiny Video Swin backbone architecture. " + "It is pretrained on ImageNet 1K dataset, and " + "trained on Kinetics 400 dataset." + ), + "params": 27_850_470, + "official_name": "VideoSwinT", + "path": "video_swin", + }, + }, + "videoswin_small_kinetics400": { + "metadata": { + "description": ( + "A small Video Swin backbone architecture. " + "It is pretrained on ImageNet 1K dataset, and " + "trained on Kinetics 400 dataset." + ), + "params": 49_509_078, + "official_name": "VideoSwinS", + "path": "video_swin", + }, + }, + "videoswin_base_kinetics400": { + "metadata": { + "description": ( + "A base Video Swin backbone architecture. " + "It is pretrained on ImageNet 1K dataset, and " + "trained on Kinetics 400 dataset." + ), + "params": 87_638_984, + "official_name": "VideoSwinB", + "path": "video_swin", + }, + }, + "videoswin_base_kinetics600": { + "metadata": { + "description": ( + "A base Video Swin backbone architecture. " + "It is pretrained on ImageNet 22K dataset, and " + "trained on Kinetics 600 dataset." + ), + "params": 87_638_984, + "official_name": "VideoSwinB", + "path": "video_swin", + }, + }, + "videoswin_base_something_something_v2": { + "metadata": { + "description": ( + "A base Video Swin backbone architecture. " + "It is pretrained on Kinetics 400 dataset, and " + "trained on Something Something V2 dataset." + ), + "params": 87_638_984, + "official_name": "VideoSwinB", + "path": "video_swin", + }, + }, +} From 1423e838e71486ba73d5a155c013009aa7ff9b9a Mon Sep 17 00:00:00 2001 From: innat Date: Sat, 2 Mar 2024 22:59:10 +0600 Subject: [PATCH 32/94] update: video swin aliases with weights presets --- .../video_swin/video_swin_aliases.py | 39 +++++++++++++++++-- .../video_swin/video_swin_backbone_presets.py | 5 +++ 2 files changed, 40 insertions(+), 4 deletions(-) diff --git a/keras_cv/models/backbones/video_swin/video_swin_aliases.py b/keras_cv/models/backbones/video_swin/video_swin_aliases.py index 0ffeaa7ab1..ca31cac84e 100644 --- a/keras_cv/models/backbones/video_swin/video_swin_aliases.py +++ b/keras_cv/models/backbones/video_swin/video_swin_aliases.py @@ -12,11 +12,12 @@ # See the License for the specific language governing permissions and # limitations under the License. - +import copy from keras_cv.models.backbones.video_swin.video_swin_backbone import ( VideoSwinBackbone, ) from keras_cv.utils.python_utils import classproperty +from keras_cv.models.backbones.video_swin.video_swin_backbone_presets import backbone_presets ALIAS_DOCSTRING = """VideoSwin{size}Backbone model. 
@@ -54,7 +55,17 @@ def __new__( @classproperty def presets(cls): """Dictionary of preset names and configurations.""" - return {} + return { + "videoswin_tiny_kinetics400": copy.deepcopy( + backbone_presets["videoswin_tiny_kinetics400"] + ), + } + + @classproperty + def presets_with_weights(cls): + """Dictionary of preset names and configurations that include + weights.""" + return cls.presets class VideoSwinSBackbone(VideoSwinBackbone): @@ -73,7 +84,17 @@ def __new__( @classproperty def presets(cls): """Dictionary of preset names and configurations.""" - return {} + return { + "videoswin_small_kinetics400": copy.deepcopy( + backbone_presets["videoswin_small_kinetics400"] + ), + } + + @classproperty + def presets_with_weights(cls): + """Dictionary of preset names and configurations that include + weights.""" + return cls.presets class VideoSwinBBackbone(VideoSwinBackbone): @@ -92,7 +113,17 @@ def __new__( @classproperty def presets(cls): """Dictionary of preset names and configurations.""" - return {} + return { + "videoswin_base_kinetics400": copy.deepcopy( + backbone_presets["videoswin_base_kinetics400"] + ), + } + + @classproperty + def presets_with_weights(cls): + """Dictionary of preset names and configurations that include + weights.""" + return cls.presets setattr(VideoSwinTBackbone, "__doc__", ALIAS_DOCSTRING.format(size="T")) diff --git a/keras_cv/models/backbones/video_swin/video_swin_backbone_presets.py b/keras_cv/models/backbones/video_swin/video_swin_backbone_presets.py index 801c5de906..d76054f1fb 100644 --- a/keras_cv/models/backbones/video_swin/video_swin_backbone_presets.py +++ b/keras_cv/models/backbones/video_swin/video_swin_backbone_presets.py @@ -102,3 +102,8 @@ }, }, } + +backbone_presets = { + **backbone_presets_no_weights, + **backbone_presets_with_weights, +} \ No newline at end of file From 2eaf8b08d5ca60719450e93b722ccec950f027e0 Mon Sep 17 00:00:00 2001 From: innat Date: Sat, 2 Mar 2024 23:10:51 +0600 Subject: [PATCH 33/94] update video swin layer test cases --- keras_cv/layers/video_swin_layers_test.py | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/keras_cv/layers/video_swin_layers_test.py b/keras_cv/layers/video_swin_layers_test.py index 81bedcaa1b..eaae3c1fa2 100644 --- a/keras_cv/layers/video_swin_layers_test.py +++ b/keras_cv/layers/video_swin_layers_test.py @@ -41,9 +41,9 @@ def test_patch_embedding_get_config(self): class TestVideoSwinWindowAttention(TestCase): - @pytest.fixture - def window_attention_model(self): - return VideoSwinWindowAttention( + + def setUp(self): + self.window_attention_model = VideoSwinWindowAttention( window_size=(2, 4, 4), num_heads=8, qkv_bias=True, @@ -52,16 +52,16 @@ def window_attention_model(self): proj_drop_rate=0.1, ) - def test_window_attention_output_shape(self, window_attention_model): + def test_window_attention_output_shape(self): input_shape = (4, 10, 256) input_array = ops.ones(input_shape) - output_shape = window_attention_model(input_array).shape + output_shape = self.window_attention_model(input_array).shape expected_output_shape = input_shape self.assertEqual(output_shape, expected_output_shape) - def test_window_attention_get_config(self, window_attention_model): - config = window_attention_model.get_config() - # Add assertions based on your specific requirements + def test_window_attention_get_config(self): + config = self.window_attention_model.get_config() + # Add assertions based on the specific requirements assert isinstance(config, dict) assert config["window_size"] == 
(2, 4, 4)
+        assert config["num_heads"] == 8

From f713304465316d4219630bea75c84a4f12beb012 Mon Sep 17 00:00:00 2001
From: innat
Date: Sat, 2 Mar 2024 23:32:11 +0600
Subject: [PATCH 34/94] added patch merging test

---
 keras_cv/layers/video_swin_layers_test.py | 22 ++++++++++++++++++--
 1 file changed, 20 insertions(+), 2 deletions(-)

diff --git a/keras_cv/layers/video_swin_layers_test.py b/keras_cv/layers/video_swin_layers_test.py
index eaae3c1fa2..951d46d159 100644
--- a/keras_cv/layers/video_swin_layers_test.py
+++ b/keras_cv/layers/video_swin_layers_test.py
@@ -12,11 +12,11 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-import pytest
 
 from keras_cv.backend import ops
 from keras_cv.layers.video_swin_layers import VideoSwinPatchingAndEmbedding
 from keras_cv.layers.video_swin_layers import VideoSwinWindowAttention
+from keras_cv.layers.video_swin_layers import VideoSwinPatchMerging
 from keras_cv.tests.test_case import TestCase
 
@@ -68,4 +68,22 @@ def test_window_attention_get_config(self):
         assert config["qkv_bias"] is True
         assert config["qk_scale"] is None
         assert config["attn_drop_rate"] == 0.1
-        assert config["proj_drop_rate"] == 0.1
\ No newline at end of file
+        assert config["proj_drop_rate"] == 0.1
+
+
+class TestVideoSwinPatchMerging(TestCase):
+    def setUp(self):
+        self.patch_merging = VideoSwinPatchMerging(input_dim=32)
+
+    def test_output_shape(self):
+        input_shape = (2, 4, 32, 32, 3)
+        input_tensor = ops.ones(input_shape)
+        output_shape = self.patch_merging(input_tensor).shape
+        expected_shape = (
+            input_shape[0],
+            input_shape[1],
+            input_shape[2] // 2,
+            input_shape[3] // 2,
+            input_shape[4] * 4
+        )
+        self.assertEqual(output_shape, expected_shape)

From 44dae81a76469b5842d7d5ee5790db314b3951dd Mon Sep 17 00:00:00 2001
From: innat
Date: Sat, 2 Mar 2024 23:35:49 +0600
Subject: [PATCH 35/94] imported video swin presets to backbone presets list

---
 keras_cv/layers/video_swin_layers.py | 3 ++-
 keras_cv/models/backbones/backbone_presets.py | 3 +++
 2 files changed, 5 insertions(+), 1 deletion(-)

diff --git a/keras_cv/layers/video_swin_layers.py b/keras_cv/layers/video_swin_layers.py
index af682ea27c..ff4dd212dc 100644
--- a/keras_cv/layers/video_swin_layers.py
+++ b/keras_cv/layers/video_swin_layers.py
@@ -887,4 +887,5 @@ def get_config(self):
                 "activation": self._activation_identifier
             }
         )
-        return config
\ No newline at end of file
+        return config
+
\ No newline at end of file
diff --git a/keras_cv/models/backbones/backbone_presets.py b/keras_cv/models/backbones/backbone_presets.py
index 95d3ccd522..93d9595c6f 100644
--- a/keras_cv/models/backbones/backbone_presets.py
+++ b/keras_cv/models/backbones/backbone_presets.py
@@ -30,6 +30,7 @@
 from keras_cv.models.backbones.resnet_v2 import resnet_v2_backbone_presets
 from keras_cv.models.backbones.vit_det import vit_det_backbone_presets
 from keras_cv.models.object_detection.yolo_v8 import yolo_v8_backbone_presets
+from keras_cv.models.backbones.video_swin import video_swin_backbone_presets
 
 backbone_presets_no_weights = {
     **resnet_v1_backbone_presets.backbone_presets_no_weights,
@@ -42,6 +43,7 @@
     **efficientnet_lite_backbone_presets.backbone_presets_no_weights,
     **yolo_v8_backbone_presets.backbone_presets_no_weights,
     **vit_det_backbone_presets.backbone_presets_no_weights,
+    **video_swin_backbone_presets.backbone_presets_no_weights,
 }
 
 backbone_presets_with_weights = {
@@ -55,6 +57,7 @@
     **efficientnet_lite_backbone_presets.backbone_presets_with_weights,
**yolo_v8_backbone_presets.backbone_presets_with_weights, **vit_det_backbone_presets.backbone_presets_with_weights, + **video_swin_backbone_presets.backbone_presets_with_weights, } backbone_presets = { From daca84f121a516f80f36412e2a08c38f6a89e1af Mon Sep 17 00:00:00 2001 From: innat Date: Sat, 2 Mar 2024 23:45:17 +0600 Subject: [PATCH 36/94] fix: typos" --- keras_cv/layers/video_swin_layers.py | 15 +++++++-------- .../backbones/video_swin/video_swin_backbone.py | 3 ++- .../video_swin/video_swin_backbone_presets.py | 2 +- 3 files changed, 10 insertions(+), 10 deletions(-) diff --git a/keras_cv/layers/video_swin_layers.py b/keras_cv/layers/video_swin_layers.py index ff4dd212dc..b3eb10d5ed 100644 --- a/keras_cv/layers/video_swin_layers.py +++ b/keras_cv/layers/video_swin_layers.py @@ -354,7 +354,6 @@ def call(self, x): [0, 0] ] x = ops.pad(x, paddings) - x0 = x[:, :, 0::2, 0::2, :] # B D H/2 W/2 C x1 = x[:, :, 1::2, 0::2, :] # B D H/2 W/2 C x2 = x[:, :, 0::2, 1::2, :] # B D H/2 W/2 C @@ -465,7 +464,6 @@ def call(self, x, mask=None, training=None): qkv = ops.reshape(qkv, [batch_size, depth, 3, self.num_heads, channel // self.num_heads]) qkv = ops.transpose(qkv, [2, 0, 3, 1, 4]) q, k, v = ops.split(qkv, 3, axis=0) - q = ops.squeeze(q, axis=0) * self.scale k = ops.squeeze(k, axis=0) v = ops.squeeze(v, axis=0) @@ -618,13 +616,13 @@ def compute_output_shape(self, input_shape): window_size, _ = get_window_size( input_shape[1:-1], self.window_size, self.shift_size ) - depth_p = self._compute_dim_padded(input_shape[1], window_size[0]) - height_p = self._compute_dim_padded(input_shape[2], window_size[1]) - width_p = self._compute_dim_padded(input_shape[3], window_size[2]) + depth_pad = self._compute_dim_padded(input_shape[1], window_size[0]) + height_pad = self._compute_dim_padded(input_shape[2], window_size[1]) + width_pad = self._compute_dim_padded(input_shape[3], window_size[2]) if self.downsample is not None: output_shape = ( - input_shape[0], depth_p, height_p // 2, width_p // 2, 2*self.input_dim + input_shape[0], depth_pad, height_pad // 2, width_pad // 2, 2*self.input_dim ) return output_shape @@ -736,6 +734,7 @@ def __init__( for i, (shift, window) in enumerate(zip(self.shift_size, self.window_size)): if not (0 <= shift < window): + # TODO: Add more description. raise ValueError( f"shift_size[{i}] must be in the range 0 to window_size[{i}]" ) @@ -799,7 +798,7 @@ def first_forward(self, x, mask_matrix, training): x = ops.pad(x, paddings) input_shape = ops.shape(x) - depth_p, height_p, width_p = ( + depth_pad, height_pad, width_pad = ( input_shape[1], input_shape[2], input_shape[3], @@ -827,7 +826,7 @@ def first_forward(self, x, mask_matrix, training): # reverse the swin windows shifted_x = window_reverse( - attn_windows, window_size, batch_size, depth_p, height_p, width_p + attn_windows, window_size, batch_size, depth_pad, height_pad, width_pad ) # reverse cyclic shift diff --git a/keras_cv/models/backbones/video_swin/video_swin_backbone.py b/keras_cv/models/backbones/video_swin/video_swin_backbone.py index 6efee5d5d6..baf44161cc 100644 --- a/keras_cv/models/backbones/video_swin/video_swin_backbone.py +++ b/keras_cv/models/backbones/video_swin/video_swin_backbone.py @@ -88,6 +88,7 @@ def __init__( qk_scale=None, **kwargs, ): + # Parse input specification. input_spec = utils.parse_model_inputs( input_shape, input_tensor, name="videos" ) @@ -95,7 +96,7 @@ def __init__( # Check that the input video is well specified. 
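The `compute_output_shape` hunk above first pads each spatio-temporal dimension up to a multiple of the window size (the `depth_pad`/`height_pad`/`width_pad` variables renamed in this commit) before halving height and width in the downsample branch. A standalone sketch of that arithmetic; the body of `_compute_dim_padded` is not shown in this diff, so the ceil-to-multiple behavior is an assumption:

```python
import math

def compute_dim_padded(input_dim, window_dim_size):
    # Assumed behavior: round the dimension up to the nearest multiple
    # of the window size so windows tile the padded volume exactly.
    return int(math.ceil(input_dim / window_dim_size) * window_dim_size)

print(compute_dim_padded(30, 8))  # 32: depth padded to a window multiple
print(compute_dim_padded(56, 7))  # 56: already a multiple, unchanged
```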
if input_spec.shape[-4] is None or input_spec.shape[-3] is None or input_spec.shape[-2] is None: raise ValueError( - "Depth, Height and width of the video must be specified" + "Depth, height and width of the video must be specified" " in `input_shape`." ) if input_spec.shape[-3] != input_spec.shape[-2]: diff --git a/keras_cv/models/backbones/video_swin/video_swin_backbone_presets.py b/keras_cv/models/backbones/video_swin/video_swin_backbone_presets.py index d76054f1fb..bd06d137c3 100644 --- a/keras_cv/models/backbones/video_swin/video_swin_backbone_presets.py +++ b/keras_cv/models/backbones/video_swin/video_swin_backbone_presets.py @@ -106,4 +106,4 @@ backbone_presets = { **backbone_presets_no_weights, **backbone_presets_with_weights, -} \ No newline at end of file +} From b1a5427771dca159ac4a4e8fa054b076bb1d90d8 Mon Sep 17 00:00:00 2001 From: innat Date: Sat, 2 Mar 2024 23:51:46 +0600 Subject: [PATCH 37/94] run formatters" --- keras_cv/models/backbones/backbone_presets.py | 2 +- .../video_swin/video_swin_aliases.py | 5 ++- .../video_swin/video_swin_backbone.py | 34 +++++++++++-------- .../video_classifier_presets.py | 22 +++++------- 4 files changed, 32 insertions(+), 31 deletions(-) diff --git a/keras_cv/models/backbones/backbone_presets.py b/keras_cv/models/backbones/backbone_presets.py index 93d9595c6f..b77163aa8f 100644 --- a/keras_cv/models/backbones/backbone_presets.py +++ b/keras_cv/models/backbones/backbone_presets.py @@ -28,9 +28,9 @@ from keras_cv.models.backbones.mobilenet_v3 import mobilenet_v3_backbone_presets from keras_cv.models.backbones.resnet_v1 import resnet_v1_backbone_presets from keras_cv.models.backbones.resnet_v2 import resnet_v2_backbone_presets +from keras_cv.models.backbones.video_swin import video_swin_backbone_presets from keras_cv.models.backbones.vit_det import vit_det_backbone_presets from keras_cv.models.object_detection.yolo_v8 import yolo_v8_backbone_presets -from keras_cv.models.backbones.video_swin import video_swin_backbone_presets backbone_presets_no_weights = { **resnet_v1_backbone_presets.backbone_presets_no_weights, diff --git a/keras_cv/models/backbones/video_swin/video_swin_aliases.py b/keras_cv/models/backbones/video_swin/video_swin_aliases.py index ca31cac84e..56db9ca743 100644 --- a/keras_cv/models/backbones/video_swin/video_swin_aliases.py +++ b/keras_cv/models/backbones/video_swin/video_swin_aliases.py @@ -13,11 +13,14 @@ # limitations under the License. import copy + from keras_cv.models.backbones.video_swin.video_swin_backbone import ( VideoSwinBackbone, ) +from keras_cv.models.backbones.video_swin.video_swin_backbone_presets import ( + backbone_presets, +) from keras_cv.utils.python_utils import classproperty -from keras_cv.models.backbones.video_swin.video_swin_backbone_presets import backbone_presets ALIAS_DOCSTRING = """VideoSwin{size}Backbone model. 
diff --git a/keras_cv/models/backbones/video_swin/video_swin_backbone.py b/keras_cv/models/backbones/video_swin/video_swin_backbone.py
index baf44161cc..a6c0868699 100644
--- a/keras_cv/models/backbones/video_swin/video_swin_backbone.py
+++ b/keras_cv/models/backbones/video_swin/video_swin_backbone.py
@@ -16,10 +16,10 @@
 import numpy as np
 from keras import layers
 
-from keras_cv.backend import ops
 
 from keras_cv.api_export import keras_cv_export
 from keras_cv.backend import keras
+from keras_cv.backend import ops
 from keras_cv.layers.video_swin_layers import VideoSwinBasicLayer
 from keras_cv.layers.video_swin_layers import VideoSwinPatchingAndEmbedding
 from keras_cv.layers.video_swin_layers import VideoSwinPatchMerging
@@ -33,7 +33,7 @@ class VideoSwinBackbone(Backbone):
 
     Args:
         input_shape (tuple[int], optional): The size of the input video in
-            `(depth, height, width, channel)` format. 
+            `(depth, height, width, channel)` format.
             Defaults to `(32, 224, 224, 3)`.
         input_tensor (KerasTensor, optional): Output of
@@ -51,18 +51,18 @@ class VideoSwinBackbone(Backbone):
         num_heads (tuple[int]): Number of attention head of each stage.
             Default to [3, 6, 12, 24]
         window_size (int): Window size. Default to [8, 7, 7].
-        mlp_ratio (float): Ratio of mlp hidden dim to embedding dim. 
+        mlp_ratio (float): Ratio of mlp hidden dim to embedding dim.
             Default to 4.
-        qkv_bias (bool): If True, add a learnable bias to query, key, value. 
+        qkv_bias (bool): If True, add a learnable bias to query, key, value.
             Default to True.
         qk_scale (float): Override default qk scale of head_dim ** -0.5 if set.
             Default to None.
         drop_rate (float): Dropout rate.
         attn_drop_rate (float): Attention dropout rate. Default: 0.
         drop_path_rate (float): Stochastic depth rate. Default: 0.2.
-        patch_norm (bool): If True, add normalization after patch embedding. 
+        patch_norm (bool): If True, add normalization after patch embedding.
             Default to False.
-    
+
     References:
         - [Video Swin Transformer](https://arxiv.org/abs/2106.13230)
         - [Official Code](https://github.com/SwinTransformer/Video-Swin-Transformer)
@@ -94,7 +94,11 @@ def __init__(
         )
 
         # Check that the input video is well specified.
-        if input_spec.shape[-4] is None or input_spec.shape[-3] is None or input_spec.shape[-2] is None:
+        if (
+            input_spec.shape[-4] is None
+            or input_spec.shape[-3] is None
+            or input_spec.shape[-2] is None
+        ):
             raise ValueError(
                 "Depth, height and width of the video must be specified"
                 " in `input_shape`."
@@ -112,11 +116,11 @@ def __init__(
             # Use common rescaling strategy across keras_cv
             x = keras.layers.Rescaling(1.0 / 255.0)(x)
 
-            # Video Swin scales inputs based on the standard ImageNet mean/stddev.
-            # Officially, Video Swin takes tensors in the [0-255] range.
-            # It uses mean=[123.675, 116.28, 103.53] and
-            # std=[58.395, 57.12, 57.375] for normalization.
-            # So, if include_rescaling is set to True, then, to match with the
+            # VideoSwin scales inputs based on the ImageNet mean/stddev.
+            # Officially, Video Swin takes tensors in the [0-255] range.
+            # It uses mean=[123.675, 116.28, 103.53] and
+            # std=[58.395, 57.12, 57.375] for normalization.
+            # So, if include_rescaling is set to True, then, to match with the
             # official scores, the following normalization should be added.
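The constants quoted in the comment block above are mutually consistent: dividing the official [0-255] mean/std by 255 yields exactly the [0-1] constants hard-coded after the `Rescaling` layer. A quick numpy check:

```python
import numpy as np

# Official Video Swin normalization constants in the [0-255] domain.
mean_255 = np.array([123.675, 116.28, 103.53])
std_255 = np.array([58.395, 57.12, 57.375])

# Dividing by 255 recovers the constants used after Rescaling(1/255).
print(mean_255 / 255.0)  # [0.485 0.456 0.406]
print(std_255 / 255.0)   # [0.229 0.224 0.225]
```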
x = (x - ops.array([0.485, 0.456, 0.406], dtype=x.dtype)) / ( ops.array([0.229, 0.224, 0.225], dtype=x.dtype) @@ -147,9 +151,9 @@ def __init__( attn_drop_rate=attn_drop_rate, drop_path_rate=dpr[sum(depths[:i]) : sum(depths[: i + 1])], norm_layer=norm_layer, - downsample=VideoSwinPatchMerging - if (i < num_layers - 1) - else None, + downsample=( + VideoSwinPatchMerging if (i < num_layers - 1) else None + ), name=f"videoswin_basic_layer_{i + 1}", ) x = layer(x) diff --git a/keras_cv/models/classification/video_classifier_presets.py b/keras_cv/models/classification/video_classifier_presets.py index 2a8447bd20..384373c1f9 100644 --- a/keras_cv/models/classification/video_classifier_presets.py +++ b/keras_cv/models/classification/video_classifier_presets.py @@ -16,9 +16,7 @@ classifier_presets = { "videoswin_tiny_kinetics_classifier": { "metadata": { - "description": ( - "videoswin_tiny_kinetics " # TODO: update - ), + "description": ("videoswin_tiny_kinetics "), # TODO: update "params": 25_613_800, "official_name": "VideoClassifier", "path": "video_classifier", @@ -26,20 +24,16 @@ }, "videoswin_small_kinetics_classifier": { "metadata": { - "description": ( - "videoswin_small_kinetics " # TODO: update - ), - "params": 25_613_800, # TODO: update + "description": ("videoswin_small_kinetics "), # TODO: update + "params": 25_613_800, # TODO: update "official_name": "VideoClassifier", "path": "video_classifier", }, }, "videoswin_base_kinetics_classifier": { "metadata": { - "description": ( - "videoswin_base_kinetics " # TODO: update - ), - "params": 25_613_800, # TODO: update + "description": ("videoswin_base_kinetics "), # TODO: update + "params": 25_613_800, # TODO: update "official_name": "VideoClassifier", "path": "video_classifier", }, @@ -47,11 +41,11 @@ "videoswin_base_something_something_v2_classifier": { "metadata": { "description": ( - "videoswin_base_something_something_v2 " # TODO: update + "videoswin_base_something_something_v2 " # TODO: update ), - "params": 25_613_800, # TODO: update + "params": 25_613_800, # TODO: update "official_name": "VideoClassifier", "path": "video_classifier", }, }, -} \ No newline at end of file +} From c66673c8e3f7e8537ab72271885fb85de183b9a5 Mon Sep 17 00:00:00 2001 From: innat Date: Sun, 3 Mar 2024 00:04:55 +0600 Subject: [PATCH 38/94] fix: linting issue --- keras_cv/layers/__init__.py | 6 +++--- keras_cv/layers/video_swin_layers_test.py | 12 ++++++------ keras_cv/models/classification/video_classifier.py | 9 +++++---- 3 files changed, 14 insertions(+), 13 deletions(-) diff --git a/keras_cv/layers/__init__.py b/keras_cv/layers/__init__.py index 957f5eda3c..ae4f6007f5 100644 --- a/keras_cv/layers/__init__.py +++ b/keras_cv/layers/__init__.py @@ -135,12 +135,12 @@ ) from keras_cv.layers.spatial_pyramid import SpatialPyramidPooling from keras_cv.layers.transformer_encoder import TransformerEncoder +from keras_cv.layers.video_swin_layers import VideoSwinBasicLayer +from keras_cv.layers.video_swin_layers import VideoSwinPatchingAndEmbedding +from keras_cv.layers.video_swin_layers import VideoSwinPatchMerging from keras_cv.layers.vit_det_layers import AddRelativePositionalEmbedding from keras_cv.layers.vit_det_layers import MultiHeadAttentionWithRelativePE from keras_cv.layers.vit_det_layers import ViTDetPatchingAndEmbedding from keras_cv.layers.vit_det_layers import WindowedTransformerEncoder from keras_cv.layers.vit_det_layers import WindowPartitioning from keras_cv.layers.vit_layers import PatchingAndEmbedding -from keras_cv.layers.video_swin_layers import 
VideoSwinPatchingAndEmbedding -from keras_cv.layers.video_swin_layers import VideoSwinBasicLayer -from keras_cv.layers.video_swin_layers import VideoSwinPatchMerging \ No newline at end of file diff --git a/keras_cv/layers/video_swin_layers_test.py b/keras_cv/layers/video_swin_layers_test.py index 951d46d159..cfebaabb3b 100644 --- a/keras_cv/layers/video_swin_layers_test.py +++ b/keras_cv/layers/video_swin_layers_test.py @@ -15,8 +15,8 @@ from keras_cv.backend import ops from keras_cv.layers.video_swin_layers import VideoSwinPatchingAndEmbedding -from keras_cv.layers.video_swin_layers import VideoSwinWindowAttention from keras_cv.layers.video_swin_layers import VideoSwinPatchMerging +from keras_cv.layers.video_swin_layers import VideoSwinWindowAttention from keras_cv.tests.test_case import TestCase @@ -80,10 +80,10 @@ def test_output_shape(self): input_tensor = ops.ones(*input_shape) output_shape = self.patch_merging(input_tensor).shape expected_shape = ( - input_shape[0], - input_shape[1], - input_shape[2] // 2, - input_shape[3] // 2, - input_shape[4] * 4 + input_shape[0], + input_shape[1], + input_shape[2] // 2, + input_shape[3] // 2, + input_shape[4] * 4, ) self.assertEqual(output_shape, expected_shape) diff --git a/keras_cv/models/classification/video_classifier.py b/keras_cv/models/classification/video_classifier.py index 2d5b7f61ea..6313c76977 100644 --- a/keras_cv/models/classification/video_classifier.py +++ b/keras_cv/models/classification/video_classifier.py @@ -17,8 +17,6 @@ from keras_cv.api_export import keras_cv_export from keras_cv.backend import keras -from keras_cv.models.task import Task -from keras_cv.utils.python_utils import classproperty from keras_cv.models.backbones.backbone_presets import backbone_presets from keras_cv.models.backbones.backbone_presets import ( backbone_presets_with_weights, @@ -26,6 +24,9 @@ from keras_cv.models.classification.video_classifier_presets import ( classifier_presets, ) +from keras_cv.models.task import Task +from keras_cv.utils.python_utils import classproperty + @keras_cv_export( [ @@ -98,7 +99,7 @@ def __init__( num_classes, activation=activation, name="predictions", - dtype='float32' + dtype="float32", )(x) # Instantiate using Functional API Model constructor @@ -143,4 +144,4 @@ def presets_with_weights(cls): def backbone_presets(cls): """Dictionary of preset names and configurations of compatible backbones.""" - return copy.deepcopy(backbone_presets) \ No newline at end of file + return copy.deepcopy(backbone_presets) From 84d4e03880622220c7862d1eb0b621e9858a75f8 Mon Sep 17 00:00:00 2001 From: innat Date: Sun, 3 Mar 2024 00:12:45 +0600 Subject: [PATCH 39/94] fix: linting issue --- keras_cv/layers/video_swin_layers.py | 419 ++++++++++++++------------- 1 file changed, 221 insertions(+), 198 deletions(-) diff --git a/keras_cv/layers/video_swin_layers.py b/keras_cv/layers/video_swin_layers.py index b3eb10d5ed..bf6bcaf960 100644 --- a/keras_cv/layers/video_swin_layers.py +++ b/keras_cv/layers/video_swin_layers.py @@ -170,12 +170,7 @@ class MLP(layers.Layer): """ # noqa: E501 def __init__( - self, - hidden_dim, - output_dim, - drop_rate=0.0, - activation='gelu', - **kwargs + self, hidden_dim, output_dim, drop_rate=0.0, activation="gelu", **kwargs ): super().__init__(**kwargs) self.output_dim = output_dim @@ -186,7 +181,7 @@ def __init__( self.fc1 = layers.Dense(self.hidden_dim) self.fc2 = layers.Dense(self.output_dim) self.dropout = layers.Dropout(self.drop_rate) - + def build(self, input_shape): self.fc1.build(input_shape) 
self.fc2.build((*input_shape[1:-1], self.hidden_dim)) @@ -199,19 +194,19 @@ def call(self, x, training=None): x = self.fc2(x) x = self.dropout(x, training=training) return x - + def get_config(self): config = super().get_config() config.update( { - "output_dim": self.output_dim, + "output_dim": self.output_dim, "hidden_dim": self.hidden_dim, "drop_rate": self.drop_rate, - 'activation': self._activation_identifier + "activation": self._activation_identifier, } ) return config - + @keras_cv_export( "keras_cv.layers.VideoSwinPatchingAndEmbedding", package="keras_cv.layers" @@ -228,13 +223,9 @@ class VideoSwinPatchingAndEmbedding(keras.Model): - [Video Swin Transformer](https://arxiv.org/abs/2106.13230) - [Video Swin Transformer GitHub](https://github.com/SwinTransformer/Video-Swin-Transformer) """ # noqa: E501 - + def __init__( - self, - patch_size=(2, 4, 4), - embed_dim=96, - norm_layer=None, - **kwargs + self, patch_size=(2, 4, 4), embed_dim=96, norm_layer=None, **kwargs ): super().__init__(**kwargs) self.patch_size = patch_size @@ -243,52 +234,50 @@ def __init__( def _compute_padding(self, dim, patch_size): pad_amount = patch_size - (dim % patch_size) - return [ - 0, pad_amount if pad_amount != patch_size else 0 - ] + return [0, pad_amount if pad_amount != patch_size else 0] def build(self, input_shape): self.pads = [ - [0, 0], + [0, 0], self._compute_padding(input_shape[1], self.patch_size[0]), self._compute_padding(input_shape[2], self.patch_size[1]), self._compute_padding(input_shape[3], self.patch_size[2]), - [0, 0] + [0, 0], ] - + self.proj = layers.Conv3D( - self.embed_dim, + self.embed_dim, kernel_size=self.patch_size, - strides=self.patch_size, - name='embed_proj' + strides=self.patch_size, + name="embed_proj", ) self.proj.build((None, None, None, None, input_shape[-1])) - + self.norm = None if self.norm_layer is not None: self.norm = self.norm_layer( - axis=-1, epsilon=1e-5, name='embed_norm' - ) - self.norm.build( - (None, None, None, None, self.embed_dim) + axis=-1, epsilon=1e-5, name="embed_norm" ) + self.norm.build((None, None, None, None, self.embed_dim)) self.built = True def call(self, x): x = ops.pad(x, self.pads) x = self.proj(x) - + if self.norm is not None: - x = self.norm(x) - + x = self.norm(x) + return x - + def compute_output_shape(self, input_shape): spatial_dims = [ (dim - self.patch_size[i]) // self.patch_size[i] + 1 for i, dim in enumerate(input_shape[1:-1]) ] - output_shape = (input_shape[0],) + tuple(spatial_dims) + (self.embed_dim,) + output_shape = ( + (input_shape[0],) + tuple(spatial_dims) + (self.embed_dim,) + ) return output_shape def get_config(self): @@ -300,27 +289,22 @@ def get_config(self): } ) return config - + class VideoSwinPatchMerging(layers.Layer): """Patch Merging Layer. Args: input_dim (int): Number of input channels. - norm_layer (keras.layers, optional): Normalization layer. + norm_layer (keras.layers, optional): Normalization layer. 
Default: LayerNormalization References: - [Video Swin Transformer](https://arxiv.org/abs/2106.13230) - [Video Swin Transformer GitHub](https://github.com/SwinTransformer/Video-Swin-Transformer) """ # noqa: E501 - - def __init__( - self, - input_dim, - norm_layer=None, - **kwargs - ): + + def __init__(self, input_dim, norm_layer=None, **kwargs): super().__init__(**kwargs) self.input_dim = input_dim self.norm_layer = norm_layer @@ -328,16 +312,21 @@ def __init__( def build(self, input_shape): batch_size, depth, height, width, channel = input_shape self.reduction = layers.Dense(2 * self.input_dim, use_bias=False) - self.reduction.build((batch_size, depth, height // 2, width // 2, 4 * channel)) + self.reduction.build( + (batch_size, depth, height // 2, width // 2, 4 * channel) + ) self.norm = self.norm_layer(axis=-1, epsilon=1e-5) - self.norm.build((batch_size, depth, height // 2, width // 2, 4 * channel)) - self.built=True - + self.norm.build( + (batch_size, depth, height // 2, width // 2, 4 * channel) + ) + self.built = True + def call(self, x): - """ The call function. + """The call function. Args: - x: Input feature, shape: (batch_size, depth, height, width, channel). + x: Input feature, + shape: (batch_size, depth, height, width, channel). """ input_shape = ops.shape(x) height, width = ( @@ -347,11 +336,11 @@ def call(self, x): # padding if needed paddings = [ - [0, 0], - [0, 0], - [0, ops.mod(height, 2)], - [0, ops.mod(width, 2)], - [0, 0] + [0, 0], + [0, 0], + [0, ops.mod(height, 2)], + [0, ops.mod(width, 2)], + [0, 0], ] x = ops.pad(x, paddings) x0 = x[:, :, 0::2, 0::2, :] # B D H/2 W/2 C @@ -362,7 +351,7 @@ def call(self, x): x = self.norm(x) x = self.reduction(x) return x - + def get_config(self): config = super().get_config() config.update( @@ -371,7 +360,7 @@ def get_config(self): } ) return config - + class VideoSwinWindowAttention(keras.Model): """Window based multi-head self attention (W-MSA) module with relative position bias. 
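
The relative position bias used by VideoSwinWindowAttention learns one bias slot per distinct 3D offset between two tokens of a window: a table of (2*D-1)*(2*H-1)*(2*W-1) entries per head, addressed through a precomputed index. The following is a minimal standalone NumPy sketch of that indexing trick; the window shape and the `indexing="ij"` formulation are illustrative choices, not taken from the patch.

import numpy as np

# Example window; the tests in this series also use (2, 4, 4).
d, h, w = 2, 4, 4

# Coordinates of every token in one window, shape (3, d*h*w).
coords = np.stack(
    np.meshgrid(np.arange(d), np.arange(h), np.arange(w), indexing="ij")
).reshape(3, -1)

# Pairwise offsets between tokens, shifted so each axis starts at 0.
rel = (coords[:, :, None] - coords[:, None, :]).transpose(1, 2, 0)
rel[..., 0] += d - 1
rel[..., 1] += h - 1
rel[..., 2] += w - 1

# Flatten each (dz, dy, dx) offset into a single bias-table index.
index = (
    rel[..., 0] * (2 * h - 1) * (2 * w - 1)
    + rel[..., 1] * (2 * w - 1)
    + rel[..., 2]
)

n_tokens = d * h * w
table_size = (2 * d - 1) * (2 * h - 1) * (2 * w - 1)
assert index.shape == (n_tokens, n_tokens)
assert index.min() == 0 and index.max() == table_size - 1

Token pairs with the same spatial offset share the same bias entry, which is why one small learnable table can serve every window in the clip.
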
@@ -389,17 +378,17 @@ class VideoSwinWindowAttention(keras.Model): - [Video Swin Transformer](https://arxiv.org/abs/2106.13230) - [Video Swin Transformer GitHub](https://github.com/SwinTransformer/Video-Swin-Transformer) """ # noqa: E501 - + def __init__( - self, - input_dim, - window_size, - num_heads, - qkv_bias=True, - qk_scale=None, - attn_drop_rate=0., - proj_drop_rate=0., - **kwargs + self, + input_dim, + window_size, + num_heads, + qkv_bias=True, + qk_scale=None, + attn_drop_rate=0.0, + proj_drop_rate=0.0, + **kwargs, ): super().__init__(**kwargs) # variables @@ -408,31 +397,43 @@ def __init__( self.num_heads = num_heads head_dim = input_dim // num_heads self.qk_scale = qk_scale - self.scale = qk_scale or head_dim ** -0.5 + self.scale = qk_scale or head_dim**-0.5 self.qkv_bias = qkv_bias self.attn_drop_rate = attn_drop_rate self.proj_drop_rate = proj_drop_rate - def get_relative_position_index(self, window_depth, window_height, window_width): + def get_relative_position_index( + self, window_depth, window_height, window_width + ): y_y, z_z, x_x = ops.meshgrid( - ops.arange(window_width), ops.arange(window_depth), ops.arange(window_height) + ops.arange(window_width), + ops.arange(window_depth), + ops.arange(window_height), ) coords = ops.stack([z_z, y_y, x_x], axis=0) coords_flatten = ops.reshape(coords, [3, -1]) - relative_coords = coords_flatten[:, :, None] - coords_flatten[:, None, :] + relative_coords = ( + coords_flatten[:, :, None] - coords_flatten[:, None, :] + ) relative_coords = ops.transpose(relative_coords, axes=[1, 2, 0]) - z_z = (relative_coords[:, :, 0] + window_depth - 1) * (2 * window_height - 1) * (2 * window_width - 1) - x_x = (relative_coords[:, :, 1] + window_height - 1) * (2 * window_width - 1) - y_y = (relative_coords[:, :, 2] + window_width - 1) + z_z = ( + (relative_coords[:, :, 0] + window_depth - 1) + * (2 * window_height - 1) + * (2 * window_width - 1) + ) + x_x = (relative_coords[:, :, 1] + window_height - 1) * ( + 2 * window_width - 1 + ) + y_y = relative_coords[:, :, 2] + window_width - 1 relative_coords = ops.stack([z_z, x_x, y_y], axis=-1) return ops.sum(relative_coords, axis=-1) def build(self, input_shape): self.relative_position_bias_table = self.add_weight( shape=( - (2 * self.window_size[0] - 1) * - (2 * self.window_size[1] - 1) * - (2 * self.window_size[2] - 1), + (2 * self.window_size[0] - 1) + * (2 * self.window_size[1] - 1) + * (2 * self.window_size[2] - 1), self.num_heads, ), initializer="zeros", @@ -442,14 +443,14 @@ def build(self, input_shape): self.relative_position_index = self.get_relative_position_index( self.window_size[0], self.window_size[1], self.window_size[2] ) - + # layers self.qkv = layers.Dense(self.input_dim * 3, use_bias=self.qkv_bias) self.attn_drop = layers.Dropout(self.attn_drop_rate) self.proj = layers.Dense(self.input_dim) self.proj_drop = layers.Dropout(self.proj_drop_rate) self.qkv.build(input_shape) - self.proj.build(input_shape) + self.proj.build(input_shape) self.built = True def call(self, x, mask=None, training=None): @@ -459,30 +460,43 @@ def call(self, x, mask=None, training=None): input_shape[1], input_shape[2], ) - + qkv = self.qkv(x) - qkv = ops.reshape(qkv, [batch_size, depth, 3, self.num_heads, channel // self.num_heads]) + qkv = ops.reshape( + qkv, + [batch_size, depth, 3, self.num_heads, channel // self.num_heads], + ) qkv = ops.transpose(qkv, [2, 0, 3, 1, 4]) q, k, v = ops.split(qkv, 3, axis=0) q = ops.squeeze(q, axis=0) * self.scale k = ops.squeeze(k, axis=0) v = ops.squeeze(v, axis=0) attn = 
ops.matmul(q, ops.transpose(k, [0, 1, 3, 2])) - + rel_pos_bias = ops.take( - self.relative_position_bias_table, self.relative_position_index[:depth, :depth] + self.relative_position_bias_table, + self.relative_position_index[:depth, :depth], ) rel_pos_bias = ops.reshape(rel_pos_bias, [depth, depth, -1]) rel_pos_bias = ops.transpose(rel_pos_bias, [2, 0, 1]) attn = attn + rel_pos_bias[None, ...] - + if mask is not None: mask_size = ops.shape(mask)[0] mask = ops.cast(mask, dtype=attn.dtype) - attn = ops.reshape( - attn, - [batch_size // mask_size, mask_size, self.num_heads, depth, depth] - ) + mask[:, None, :, :] + attn = ( + ops.reshape( + attn, + [ + batch_size // mask_size, + mask_size, + self.num_heads, + depth, + depth, + ], + ) + + mask[:, None, :, :] + ) attn = ops.reshape(attn, [-1, self.num_heads, depth, depth]) attn = keras.activations.softmax(attn, axis=-1) @@ -493,7 +507,7 @@ def call(self, x, mask=None, training=None): x = self.proj(x) x = self.proj_drop(x, training=training) return x - + def get_config(self): config = super().get_config() config.update( @@ -508,7 +522,7 @@ def get_config(self): } ) return config - + class VideoSwinBasicLayer(keras.Model): """A basic Swin Transformer layer for one stage. @@ -531,22 +545,22 @@ class VideoSwinBasicLayer(keras.Model): - [Video Swin Transformer](https://arxiv.org/abs/2106.13230) - [Video Swin Transformer GitHub](https://github.com/SwinTransformer/Video-Swin-Transformer) """ # noqa: E501 - + def __init__( self, input_dim, depth, num_heads, - window_size=(1,7,7), - mlp_ratio=4., + window_size=(1, 7, 7), + mlp_ratio=4.0, qkv_bias=False, qk_scale=None, - drop_rate=0., - attn_drop_rate=0., - drop_path_rate=0., + drop_rate=0.0, + attn_drop_rate=0.0, + drop_path_rate=0.0, norm_layer=None, downsample=None, - **kwargs + **kwargs, ): super().__init__(**kwargs) self.input_dim = input_dim @@ -562,15 +576,14 @@ def __init__( self.drop_path_rate = drop_path_rate self.norm_layer = norm_layer self.downsample = downsample - + def _compute_dim_padded(self, input_dim, window_dim_size): input_dim = ops.cast(input_dim, dtype="float32") window_dim_size = ops.cast(window_dim_size, dtype="float32") return ops.cast( - ops.ceil(input_dim / window_dim_size) * window_dim_size, - "int32" + ops.ceil(input_dim / window_dim_size) * window_dim_size, "int32" ) - + def build(self, input_shape): window_size, shift_size = get_window_size( input_shape[1:-1], self.window_size, self.shift_size @@ -581,20 +594,24 @@ def build(self, input_shape): self.attn_mask = compute_mask( depth_pad, height_pad, width_pad, window_size, shift_size ) - + # build blocks self.blocks = [ VideoSwinTransformerBlock( self.input_dim, num_heads=self.num_heads, window_size=self.window_size, - shift_size=(0,0,0) if (i % 2 == 0) else self.shift_size, + shift_size=(0, 0, 0) if (i % 2 == 0) else self.shift_size, mlp_ratio=self.mlp_ratio, qkv_bias=self.qkv_bias, qk_scale=self.qk_scale, drop_rate=self.drop_rate, attn_drop_rate=self.attn_drop_rate, - drop_path_rate=self.drop_path_rate[i] if isinstance(self.drop_path_rate, list) else self.drop_path_rate, + drop_path_rate=( + self.drop_path_rate[i] + if isinstance(self.drop_path_rate, list) + else self.drop_path_rate + ), norm_layer=self.norm_layer, ) for i in range(self.depth) @@ -605,13 +622,12 @@ def build(self, input_shape): input_dim=self.input_dim, norm_layer=self.norm_layer ) self.downsample.build(input_shape) - + for i in range(self.depth): self.blocks[i].build(input_shape) - + self.built = True - - + def compute_output_shape(self, input_shape): 
window_size, _ = get_window_size( input_shape[1:-1], self.window_size, self.shift_size @@ -619,19 +635,23 @@ def compute_output_shape(self, input_shape): depth_pad = self._compute_dim_padded(input_shape[1], window_size[0]) height_pad = self._compute_dim_padded(input_shape[2], window_size[1]) width_pad = self._compute_dim_padded(input_shape[3], window_size[2]) - + if self.downsample is not None: output_shape = ( - input_shape[0], depth_pad, height_pad // 2, width_pad // 2, 2*self.input_dim + input_shape[0], + depth_pad, + height_pad // 2, + width_pad // 2, + 2 * self.input_dim, ) return output_shape - + return input_shape def call(self, x, training=None): input_shape = ops.shape(x) - batch_size, depth, height, width, channel = ( - input_shape[0], + batch_size, depth, height, width, _ = ( + input_shape[0], input_shape[1], input_shape[2], input_shape[3], @@ -639,22 +659,15 @@ def call(self, x, training=None): ) for block in self.blocks: - x = block( - x, - self.attn_mask, - training=training - ) + x = block(x, self.attn_mask, training=training) + + x = ops.reshape(x, [batch_size, depth, height, width, -1]) - x = ops.reshape( - x, [batch_size, depth, height, width, -1] - ) - if self.downsample is not None: x = self.downsample(x) - + return x - - + def get_config(self): config = super().get_config() config.update( @@ -669,11 +682,11 @@ def get_config(self): "qk_scale": self.qk_scale, "drop": self.drop, "attn_drop": self.attn_drop, - "drop_path": self.drop_path + "drop_path": self.drop_path, } ) return config - + @keras_cv_export( "keras_cv.layers.VideoSwinTransformerBlock", package="keras_cv.layers" @@ -699,22 +712,22 @@ class VideoSwinTransformerBlock(keras.Model): - [Video Swin Transformer](https://arxiv.org/abs/2106.13230) - [Video Swin Transformer GitHub](https://github.com/SwinTransformer/Video-Swin-Transformer) """ # noqa: E501 - + def __init__( - self, - input_dim, - num_heads, - window_size=(2, 7, 7), + self, + input_dim, + num_heads, + window_size=(2, 7, 7), shift_size=(0, 0, 0), - mlp_ratio=4., - qkv_bias=True, - qk_scale=None, - drop_rate=0., - attn_drop_rate=0., - drop_path_rate=0., - activation='gelu', - norm_layer=layers.LayerNormalization, - **kwargs + mlp_ratio=4.0, + qkv_bias=True, + qk_scale=None, + drop_rate=0.0, + attn_drop_rate=0.0, + drop_path_rate=0.0, + activation="gelu", + norm_layer=layers.LayerNormalization, + **kwargs, ): super().__init__(**kwargs) # variables @@ -732,55 +745,62 @@ def __init__( self.norm_layer = norm_layer self._activation_identifier = activation - for i, (shift, window) in enumerate(zip(self.shift_size, self.window_size)): + for i, (shift, window) in enumerate( + zip(self.shift_size, self.window_size) + ): if not (0 <= shift < window): # TODO: Add more description. raise ValueError( - f"shift_size[{i}] must be in the range 0 to window_size[{i}]" + f"shift_size[{i}] must be in the " + "range 0 to window_size[{i}]" ) def build(self, input_shape): self.window_size, self.shift_size = get_window_size( input_shape[1:-1], self.window_size, self.shift_size ) - + self.apply_cyclic_shift = False if any(i > 0 for i in self.shift_size): self.apply_cyclic_shift = True - + # layers - self.drop_path = DropPath(self.drop_path_rate) if self.drop_path_rate > 0. 
else layers.Identity() - + self.drop_path = ( + DropPath(self.drop_path_rate) + if self.drop_path_rate > 0.0 + else layers.Identity() + ) + self.norm1 = self.norm_layer(axis=-1, epsilon=1e-05) self.norm1.build(input_shape) - + self.attn = VideoSwinWindowAttention( - self.input_dim, - window_size=self.window_size, - num_heads=self.num_heads, - qkv_bias=self.qkv_bias, - qk_scale=self.qk_scale, + self.input_dim, + window_size=self.window_size, + num_heads=self.num_heads, + qkv_bias=self.qkv_bias, + qk_scale=self.qk_scale, attn_drop_rate=self.attn_drop_rate, - proj_drop_rate=self.drop_rate + proj_drop_rate=self.drop_rate, ) self.attn.build((None, None, self.input_dim)) - + self.norm2 = self.norm_layer(axis=-1, epsilon=1e-05) self.norm2.build((*input_shape[1:-1], self.input_dim)) - + self.mlp = MLP( - output_dim=self.input_dim, - hidden_dim=self.mlp_hidden_dim, + output_dim=self.input_dim, + hidden_dim=self.mlp_hidden_dim, activation=self._activation_identifier, - drop_rate=self.drop_rate + drop_rate=self.drop_rate, ) self.mlp.build((*input_shape[1:-1], self.input_dim)) self.built = True - + def first_forward(self, x, mask_matrix, training): input_shape = ops.shape(x) - batch_size, depth, height, width, channel = ( - input_shape[0], + batch_size, depth, height, width, _ = ( + input_shape[0], input_shape[1], input_shape[2], input_shape[3], @@ -788,66 +808,73 @@ def first_forward(self, x, mask_matrix, training): ) window_size, shift_size = self.window_size, self.shift_size x = self.norm1(x) - + # pad feature maps to multiples of window size - pad_l = pad_t = pad_d0 = 0 + pad_l = pad_t = pad_d0 = 0 pad_d1 = ops.mod(-depth + window_size[0], window_size[0]) - pad_b = ops.mod(-height + window_size[1], window_size[1]) - pad_r = ops.mod(-width + window_size[2], window_size[2]) - paddings = [[0, 0], [pad_d0, pad_d1], [pad_t, pad_b], [pad_l, pad_r], [0, 0]] + pad_b = ops.mod(-height + window_size[1], window_size[1]) + pad_r = ops.mod(-width + window_size[2], window_size[2]) + paddings = [ + [0, 0], + [pad_d0, pad_d1], + [pad_t, pad_b], + [pad_l, pad_r], + [0, 0], + ] x = ops.pad(x, paddings) - + input_shape = ops.shape(x) - depth_pad, height_pad, width_pad = ( + depth_pad, height_pad, width_pad = ( input_shape[1], input_shape[2], input_shape[3], ) - + # cyclic shift if self.apply_cyclic_shift: shifted_x = ops.roll( - x, - shift=(-shift_size[0], -shift_size[1], -shift_size[2]), - axis=(1, 2, 3) + x, + shift=(-shift_size[0], -shift_size[1], -shift_size[2]), + axis=(1, 2, 3), ) attn_mask = mask_matrix else: shifted_x = x attn_mask = None - + # partition windows - x_windows = window_partition(shifted_x, window_size) - + x_windows = window_partition(shifted_x, window_size) + # get attentions params - attn_windows = self.attn( - x_windows, mask=attn_mask, training=training - ) + attn_windows = self.attn(x_windows, mask=attn_mask, training=training) # reverse the swin windows shifted_x = window_reverse( - attn_windows, window_size, batch_size, depth_pad, height_pad, width_pad - ) + attn_windows, + window_size, + batch_size, + depth_pad, + height_pad, + width_pad, + ) # reverse cyclic shift if self.apply_cyclic_shift: x = ops.roll( - shifted_x, - shift=(shift_size[0], shift_size[1], shift_size[2]), - axis=(1, 2, 3) + shifted_x, + shift=(shift_size[0], shift_size[1], shift_size[2]), + axis=(1, 2, 3), ) else: x = shifted_x - # pad if required + # pad if required do_pad = ops.logical_or( ops.greater(pad_d1, 0), - ops.logical_or(ops.greater(pad_r, 0), ops.greater(pad_b, 0)) + ops.logical_or(ops.greater(pad_r, 0), 
ops.greater(pad_b, 0)), ) x = ops.cond( - do_pad, - lambda: x[:, :depth, :height, :width, :], - lambda: x + do_pad, lambda: x[:, :depth, :height, :width, :], lambda: x ) return x @@ -857,17 +884,14 @@ def second_forward(self, x, training): x = self.mlp(x) x = self.drop_path(x, training=training) return x - def call(self, x, mask_matrix=None, training=None): shortcut = x - x = self.first_forward( - x, mask_matrix, training - ) + x = self.first_forward(x, mask_matrix, training) x = shortcut + self.drop_path(x) x = x + self.second_forward(x, training) return x - + def get_config(self): config = super().get_config() config.update( @@ -879,12 +903,11 @@ def get_config(self): "mlp_ratio": self.mlp_ratio, "qkv_bias": self.qkv_bias, "qk_scale": self.qk_scale, - "drop_rate": self.drop_rate, + "drop_rate": self.drop_rate, "attn_drop_rate": self.attn_drop_rate, "drop_path_rate": self.drop_path_rate, "mlp_hidden_dim": self.mlp_hidden_dim, - "activation": self._activation_identifier + "activation": self._activation_identifier, } ) return config - \ No newline at end of file From d126b7c9693e6a83ffee3c3eb7e65204cf3896c4 Mon Sep 17 00:00:00 2001 From: innat Date: Sun, 3 Mar 2024 15:24:45 +0600 Subject: [PATCH 40/94] fix: video swin layer test cases" --- keras_cv/layers/video_swin_layers.py | 16 +++++++++++----- keras_cv/layers/video_swin_layers_test.py | 7 ++++--- 2 files changed, 15 insertions(+), 8 deletions(-) diff --git a/keras_cv/layers/video_swin_layers.py b/keras_cv/layers/video_swin_layers.py index bf6bcaf960..17cb92ebd3 100644 --- a/keras_cv/layers/video_swin_layers.py +++ b/keras_cv/layers/video_swin_layers.py @@ -315,10 +315,13 @@ def build(self, input_shape): self.reduction.build( (batch_size, depth, height // 2, width // 2, 4 * channel) ) - self.norm = self.norm_layer(axis=-1, epsilon=1e-5) - self.norm.build( - (batch_size, depth, height // 2, width // 2, 4 * channel) - ) + + self.norm = None + if self.norm_layer is not None: + self.norm = self.norm_layer(axis=-1, epsilon=1e-5) + self.norm.build( + (batch_size, depth, height // 2, width // 2, 4 * channel) + ) self.built = True def call(self, x): @@ -348,7 +351,10 @@ def call(self, x): x2 = x[:, :, 0::2, 1::2, :] # B D H/2 W/2 C x3 = x[:, :, 1::2, 1::2, :] # B D H/2 W/2 C x = ops.concatenate([x0, x1, x2, x3], axis=-1) # B D H/2 W/2 4*C - x = self.norm(x) + + if self.norm is not None: + x = self.norm(x) + x = self.reduction(x) return x diff --git a/keras_cv/layers/video_swin_layers_test.py b/keras_cv/layers/video_swin_layers_test.py index cfebaabb3b..862c2e3a77 100644 --- a/keras_cv/layers/video_swin_layers_test.py +++ b/keras_cv/layers/video_swin_layers_test.py @@ -44,6 +44,7 @@ class TestVideoSwinWindowAttention(TestCase): def setUp(self): self.window_attention_model = VideoSwinWindowAttention( + input_dim=32, window_size=(2, 4, 4), num_heads=8, qkv_bias=True, @@ -53,7 +54,7 @@ def setUp(self): ) def test_window_attention_output_shape(self): - input_shape = (4, 10, 256) + input_shape = (2, 16, 32) input_array = ops.ones(input_shape) output_shape = self.window_attention_model(input_array).shape expected_output_shape = input_shape @@ -77,13 +78,13 @@ def setUp(self): def test_output_shape(self): input_shape = (2, 4, 32, 32, 3) - input_tensor = ops.ones(*input_shape) + input_tensor = ops.ones(input_shape) output_shape = self.patch_merging(input_tensor).shape expected_shape = ( input_shape[0], input_shape[1], input_shape[2] // 2, input_shape[3] // 2, - input_shape[4] * 4, + 2 * 32, ) self.assertEqual(output_shape, expected_shape) From 
61303be1a21b4647e5b3735f62decd1e75aadb59 Mon Sep 17 00:00:00 2001 From: innat Date: Sun, 3 Mar 2024 16:19:02 +0600 Subject: [PATCH 41/94] add: video swin backbone test --- .../video_swin/video_swin_backbone_test.py | 60 +++++++++++++++++++ 1 file changed, 60 insertions(+) create mode 100644 keras_cv/models/backbones/video_swin/video_swin_backbone_test.py diff --git a/keras_cv/models/backbones/video_swin/video_swin_backbone_test.py b/keras_cv/models/backbones/video_swin/video_swin_backbone_test.py new file mode 100644 index 0000000000..f3b99be0b5 --- /dev/null +++ b/keras_cv/models/backbones/video_swin/video_swin_backbone_test.py @@ -0,0 +1,60 @@ +# Copyright 2023 The KerasCV Authors +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import os + +import numpy as np +import pytest + +from keras_cv.backend import keras +from keras_cv.backend import ops +from keras_cv.models.backbones.video_swin.video_swin_aliases import VideoSwinSBackbone +from keras_cv.tests.test_case import TestCase + +class TestViTDetBackbone(TestCase): + @pytest.mark.large + def test_call(self): + model = VideoSwinSBackbone() + x = np.ones((1, 32, 224, 224, 3)) + x_out = ops.convert_to_numpy(model(x)) + num_parameters = sum( + np.prod(tuple(x.shape)) for x in model.trainable_variables + ) + self.assertEqual(x_out.shape, (1, 16, 7, 7, 768)) + self.assertEqual(num_parameters, 49_509_078) + + @pytest.mark.extra_large + def teat_save(self): + # saving test + model = VideoSwinSBackbone() + x = np.ones((1, 32, 224, 224, 3)) + x_out = ops.convert_to_numpy(model(x)) + path = os.path.join(self.get_temp_dir(), "model.keras") + model.save(path) + loaded_model = keras.saving.load_model(path) + x_out_loaded = ops.convert_to_numpy(loaded_model(x)) + self.assertAllClose(x_out, x_out_loaded) + + @pytest.mark.extra_large + def test_fit(self): + model = VideoSwinSBackbone() + x = np.ones((1, 32, 224, 224, 3)) + y = np.zeros((1, 16, 7, 7, 768)) + model.compile(optimizer="adam", loss="mse", metrics=["mse"]) + model.fit(x, y, epochs=1) + + def test_pyramid_level_inputs_error(self): + model = VideoSwinSBackbone() + with self.assertRaises(NotImplementedError, msg="doesn't compute"): + model.pyramid_level_inputs From af5878cb79520ca4e18b952cc213022a50c354a3 Mon Sep 17 00:00:00 2001 From: innat Date: Sun, 3 Mar 2024 17:37:30 +0600 Subject: [PATCH 42/94] rm redundant code --- keras_cv/layers/video_swin_layers.py | 2 +- .../video_swin/video_swin_backbone_test.py | 5 ++++- .../classification/video_classifier_presets.py | 14 +++++++------- 3 files changed, 12 insertions(+), 9 deletions(-) diff --git a/keras_cv/layers/video_swin_layers.py b/keras_cv/layers/video_swin_layers.py index 17cb92ebd3..09766c95b2 100644 --- a/keras_cv/layers/video_swin_layers.py +++ b/keras_cv/layers/video_swin_layers.py @@ -97,7 +97,7 @@ def get_window_size(x_size, window_size, shift_size=None): " https://github.com/microsoft/Swin-Transformer - Args: + Args: x_size: input size. window_size: local window size. shift_size: window shifting size. 
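
The body of `get_window_size` is not shown in this hunk. In the official Video Swin implementation it cites, the helper clamps each window axis to the input size and drops the cyclic shift on any fully covered axis; the sketch below is written under that assumption and is not code from this series.

def get_window_size_sketch(x_size, window_size, shift_size=None):
    # Clamp the window to the input per axis; an axis fully covered by
    # one window cannot be cyclically shifted, so its shift becomes 0.
    use_window = list(window_size)
    use_shift = list(shift_size) if shift_size is not None else None
    for i, dim in enumerate(x_size):
        if dim <= window_size[i]:
            use_window[i] = dim
            if use_shift is not None:
                use_shift[i] = 0
    if use_shift is None:
        return tuple(use_window)
    return tuple(use_window), tuple(use_shift)

# An 8-frame clip with window (8, 7, 7): the depth axis is fully
# covered by a single window, so its shift is dropped.
assert get_window_size_sketch((8, 56, 56), (8, 7, 7), (4, 3, 3)) == (
    (8, 7, 7),
    (0, 3, 3),
)
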
diff --git a/keras_cv/models/backbones/video_swin/video_swin_backbone_test.py b/keras_cv/models/backbones/video_swin/video_swin_backbone_test.py index f3b99be0b5..9ad7fd32b6 100644 --- a/keras_cv/models/backbones/video_swin/video_swin_backbone_test.py +++ b/keras_cv/models/backbones/video_swin/video_swin_backbone_test.py @@ -19,9 +19,12 @@ from keras_cv.backend import keras from keras_cv.backend import ops -from keras_cv.models.backbones.video_swin.video_swin_aliases import VideoSwinSBackbone +from keras_cv.models.backbones.video_swin.video_swin_aliases import ( + VideoSwinSBackbone, +) from keras_cv.tests.test_case import TestCase + class TestViTDetBackbone(TestCase): @pytest.mark.large def test_call(self): diff --git a/keras_cv/models/classification/video_classifier_presets.py b/keras_cv/models/classification/video_classifier_presets.py index 384373c1f9..e4c6ca825a 100644 --- a/keras_cv/models/classification/video_classifier_presets.py +++ b/keras_cv/models/classification/video_classifier_presets.py @@ -16,7 +16,7 @@ classifier_presets = { "videoswin_tiny_kinetics_classifier": { "metadata": { - "description": ("videoswin_tiny_kinetics "), # TODO: update + "description": ("videoswin_tiny_kinetics "), "params": 25_613_800, "official_name": "VideoClassifier", "path": "video_classifier", @@ -24,16 +24,16 @@ }, "videoswin_small_kinetics_classifier": { "metadata": { - "description": ("videoswin_small_kinetics "), # TODO: update - "params": 25_613_800, # TODO: update + "description": ("videoswin_small_kinetics "), + "params": 25_613_800, "official_name": "VideoClassifier", "path": "video_classifier", }, }, "videoswin_base_kinetics_classifier": { "metadata": { - "description": ("videoswin_base_kinetics "), # TODO: update - "params": 25_613_800, # TODO: update + "description": ("videoswin_base_kinetics "), + "params": 25_613_800, "official_name": "VideoClassifier", "path": "video_classifier", }, @@ -41,9 +41,9 @@ "videoswin_base_something_something_v2_classifier": { "metadata": { "description": ( - "videoswin_base_something_something_v2 " # TODO: update + "videoswin_base_something_something_v2 " ), - "params": 25_613_800, # TODO: update + "params": 25_613_800, "official_name": "VideoClassifier", "path": "video_classifier", }, From ffe457c63e77e371eaf4611e6e52130c610fb273 Mon Sep 17 00:00:00 2001 From: innat Date: Mon, 4 Mar 2024 15:52:38 +0600 Subject: [PATCH 43/94] disable preset test temporary --- .../backbones/video_swin/video_swin_backbone_presets_test.py | 5 +++++ .../models/backbones/video_swin/video_swin_backbone_test.py | 4 ++++ 2 files changed, 9 insertions(+) diff --git a/keras_cv/models/backbones/video_swin/video_swin_backbone_presets_test.py b/keras_cv/models/backbones/video_swin/video_swin_backbone_presets_test.py index 80996fcbfa..1a3a5519c4 100644 --- a/keras_cv/models/backbones/video_swin/video_swin_backbone_presets_test.py +++ b/keras_cv/models/backbones/video_swin/video_swin_backbone_presets_test.py @@ -36,19 +36,23 @@ def setUp(self): self.input_batch = np.ones(shape=(1, 32, 224, 224, 3)) def test_applications_model_output(self): + self.skipTest("TODO: Enable after Kaggle model is public") model = VideoSwinBackbone() model(self.input_batch) def test_applications_model_output_with_preset(self): + self.skipTest("TODO: Enable after Kaggle model is public") model = VideoSwinBackbone.from_preset("videoswin_tiny") model(self.input_batch) def test_applications_model_predict(self): + self.skipTest("TODO: Enable after Kaggle model is public") model = VideoSwinTBackbone() 
model.predict(self.input_batch) def test_preset_docstring(self): """Check we did our docstring formatting correctly.""" + self.skipTest("TODO: Enable after Kaggle model is public") for name in VideoSwinBackbone.presets: self.assertRegex(VideoSwinBackbone.from_preset.__doc__, name) @@ -67,6 +71,7 @@ class VideoSwinPresetFullTest(TestCase): """ def test_load_ViTDet(self): + self.skipTest("TODO: Enable after Kaggle model is public") input_data = np.ones(shape=(1, 32, 224, 224, 3)) for preset in VideoSwinBackbone.presets: model = VideoSwinBackbone.from_preset(preset) diff --git a/keras_cv/models/backbones/video_swin/video_swin_backbone_test.py b/keras_cv/models/backbones/video_swin/video_swin_backbone_test.py index 9ad7fd32b6..4ab2787687 100644 --- a/keras_cv/models/backbones/video_swin/video_swin_backbone_test.py +++ b/keras_cv/models/backbones/video_swin/video_swin_backbone_test.py @@ -28,6 +28,7 @@ class TestViTDetBackbone(TestCase): @pytest.mark.large def test_call(self): + self.skipTest("TODO: Enable after Kaggle model is public") model = VideoSwinSBackbone() x = np.ones((1, 32, 224, 224, 3)) x_out = ops.convert_to_numpy(model(x)) @@ -39,6 +40,7 @@ def test_call(self): @pytest.mark.extra_large def teat_save(self): + self.skipTest("TODO: Enable after Kaggle model is public") # saving test model = VideoSwinSBackbone() x = np.ones((1, 32, 224, 224, 3)) @@ -51,6 +53,7 @@ def teat_save(self): @pytest.mark.extra_large def test_fit(self): + self.skipTest("TODO: Enable after Kaggle model is public") model = VideoSwinSBackbone() x = np.ones((1, 32, 224, 224, 3)) y = np.zeros((1, 16, 7, 7, 768)) @@ -58,6 +61,7 @@ def test_fit(self): model.fit(x, y, epochs=1) def test_pyramid_level_inputs_error(self): + self.skipTest("TODO: Enable after Kaggle model is public") model = VideoSwinSBackbone() with self.assertRaises(NotImplementedError, msg="doesn't compute"): model.pyramid_level_inputs From f8d3e26a4a5ebb3740beb6601bd7e310b0384685 Mon Sep 17 00:00:00 2001 From: innat Date: Mon, 4 Mar 2024 16:56:45 +0600 Subject: [PATCH 44/94] set include rescale to true --- keras_cv/models/backbones/video_swin/video_swin_aliases.py | 6 +++--- keras_cv/models/backbones/video_swin/video_swin_backbone.py | 4 ++-- keras_cv/models/classification/video_classifier_presets.py | 4 +--- 3 files changed, 6 insertions(+), 8 deletions(-) diff --git a/keras_cv/models/backbones/video_swin/video_swin_aliases.py b/keras_cv/models/backbones/video_swin/video_swin_aliases.py index 56db9ca743..57ccf227dc 100644 --- a/keras_cv/models/backbones/video_swin/video_swin_aliases.py +++ b/keras_cv/models/backbones/video_swin/video_swin_aliases.py @@ -45,7 +45,7 @@ class VideoSwinTBackbone(VideoSwinBackbone): def __new__( cls, - include_rescaling=False, + include_rescaling=True, **kwargs, ): kwargs.update( @@ -74,7 +74,7 @@ def presets_with_weights(cls): class VideoSwinSBackbone(VideoSwinBackbone): def __new__( cls, - include_rescaling=False, + include_rescaling=True, **kwargs, ): kwargs.update( @@ -103,7 +103,7 @@ def presets_with_weights(cls): class VideoSwinBBackbone(VideoSwinBackbone): def __new__( cls, - include_rescaling=False, + include_rescaling=True, **kwargs, ): kwargs.update( diff --git a/keras_cv/models/backbones/video_swin/video_swin_backbone.py b/keras_cv/models/backbones/video_swin/video_swin_backbone.py index a6c0868699..ac4e9a07ab 100644 --- a/keras_cv/models/backbones/video_swin/video_swin_backbone.py +++ b/keras_cv/models/backbones/video_swin/video_swin_backbone.py @@ -32,11 +32,11 @@ class VideoSwinBackbone(Backbone): """A Video 
Swin Transformer backbone model.
 
     Args:
-        input_shape (tuple[int], optional): The size of the input image in
+        input_shape (tuple[int], optional): The size of the input video in
             `(depth, height, width, channel)` format.
             Defaults to `(32, 224, 224, 3)`.
         input_tensor (KerasTensor, optional): Output of
-            `keras.layers.Input()`) to use as image input for the model.
+            `keras.layers.Input()`) to use as video input for the model.
             Defaults to `None`.
         include_rescaling (bool, optional): Whether to rescale the inputs. If
             set to `True`, inputs will be passed through a
diff --git a/keras_cv/models/classification/video_classifier_presets.py b/keras_cv/models/classification/video_classifier_presets.py
index e4c6ca825a..9914e13433 100644
--- a/keras_cv/models/classification/video_classifier_presets.py
+++ b/keras_cv/models/classification/video_classifier_presets.py
@@ -40,9 +40,7 @@
     },
     "videoswin_base_something_something_v2_classifier": {
         "metadata": {
-            "description": (
-                "videoswin_base_something_something_v2 "
-            ),
+            "description": ("videoswin_base_something_something_v2 "),
             "params": 25_613_800,
             "official_name": "VideoClassifier",
             "path": "video_classifier",
From 1d0ad36a76a6cdc0a86b54a9e8c47946d563b783 Mon Sep 17 00:00:00 2001
From: innat
Date: Mon, 4 Mar 2024 17:03:10 +0600
Subject: [PATCH 45/94] add video swin components to __init__

---
 keras_cv/models/__init__.py | 13 +++++++++++++
 1 file changed, 13 insertions(+)

diff --git a/keras_cv/models/__init__.py b/keras_cv/models/__init__.py
index 77c3ad33d9..0ef9b58d2d 100644
--- a/keras_cv/models/__init__.py
+++ b/keras_cv/models/__init__.py
@@ -179,11 +179,24 @@
     ResNetV2Backbone,
 )
 from keras_cv.models.backbones.vgg16.vgg16_backbone import VGG16Backbone
+from keras_cv.models.backbones.video_swin.video_swin_aliases import (
+    VideoSwinBBackbone,
+)
+from keras_cv.models.backbones.video_swin.video_swin_aliases import (
+    VideoSwinSBackbone,
+)
+from keras_cv.models.backbones.video_swin.video_swin_aliases import (
+    VideoSwinTBackbone,
+)
+from keras_cv.models.backbones.video_swin.video_swin_backbone import (
+    VideoSwinBackbone,
+)
 from keras_cv.models.backbones.vit_det.vit_det_aliases import ViTDetBBackbone
 from keras_cv.models.backbones.vit_det.vit_det_aliases import ViTDetHBackbone
 from keras_cv.models.backbones.vit_det.vit_det_aliases import ViTDetLBackbone
 from keras_cv.models.backbones.vit_det.vit_det_backbone import ViTDetBackbone
 from keras_cv.models.classification.image_classifier import ImageClassifier
+from keras_cv.models.classification.video_classifier import VideoClassifier
 from keras_cv.models.feature_extractor.clip import CLIP
 from keras_cv.models.object_detection.retinanet.retinanet import RetinaNet
 from keras_cv.models.object_detection.yolo_v8.yolo_v8_backbone import (
From 838a50608f93f9a661a56e1a822bb16f78cedf56 Mon Sep 17 00:00:00 2001
From: innat
Date: Tue, 5 Mar 2024 12:08:14 +0600
Subject: [PATCH 46/94] update docstrings: video swin layers scripts

---
 keras_cv/layers/video_swin_layers.py | 60 ++++++++++++++++++----------
 1 file changed, 40 insertions(+), 20 deletions(-)

diff --git a/keras_cv/layers/video_swin_layers.py b/keras_cv/layers/video_swin_layers.py
index 09766c95b2..d651e31e00 100644
--- a/keras_cv/layers/video_swin_layers.py
+++ b/keras_cv/layers/video_swin_layers.py
@@ -22,13 +22,17 @@
 
 def window_partition(x, window_size):
-    """
+    """Partitions the input tensor into windows of specified size.
+ Args: - x: (batch_size, depth, height, width, channel) - window_size (tuple[int]): window size + x (Tensor): Input tensor of shape `(batch_size, depth, height, width, channel)`. + window_size (tuple[int]): Size of the window in each dimension (depth, height, width). Returns: - windows: (batch_size*num_windows, window_size*window_size, channel) + Tensor: Windows of shape `(batch_size*num_windows, window_size*window_size, channel)`, + where `num_windows = ( + depth//window_size[0]) * (height//window_size[1]) * (width//window_size[2] + )`. """ # noqa: E501 input_shape = ops.shape(x) @@ -63,15 +67,18 @@ def window_partition(x, window_size): def window_reverse(windows, window_size, batch_size, depth, height, width): - """ + """Reconstructs the original tensor from windows of specified size. + Args: - windows: (batch_size*num_windows, window_size, window_size, channel) - window_size (tuple[int]): Window size - height (int): Height of image - width (int): Width of image + windows (Tensor): Windows of shape `(batch_size*num_windows, window_size, window_size, channel)`. + window_size (tuple[int]): Size of the window in each dimension `(depth, height, width)`. + batch_size (int): Batch size. + depth (int): Depth of the original tensor. + height (int): Height of the original tensor. + width (int): Width of the original tensor. Returns: - x: (batch_size, depth, height, width, channel) + Tensor: Reconstructed tensor of shape `(batch_size, depth, height, width, channel)`. """ # noqa: E501 x = ops.reshape( windows, @@ -124,6 +131,26 @@ def get_window_size(x_size, window_size, shift_size=None): def compute_mask(depth, height, width, window_size, shift_size): + """Computes attention mask for sliding window self-attention mechanism. + + Args: + depth (int): Depth of the input video. + height (int): Height of the input video. + width (int): Width of the input video. + window_size (tuple[int]): Size of the sliding window in each dimension (depth, height, width). + shift_size (tuple[int]): Size of the shifting step in each dimension (depth, height, width). + + Returns: + Tensor: Attention mask of shape `(batch_size, num_windows, num_windows)`, + where `num_windows = ( + (depth - window_size[0]) // shift_size[0] + 1 + ) * ( + (height - window_size[1]) // shift_size[1] + 1 + ) * ( + (width - window_size[2]) // shift_size[2] + 1 + )`. + + """ img_mask = np.zeros((1, depth, height, width, 1)) cnt = 0 for d in ( @@ -292,7 +319,7 @@ def get_config(self): class VideoSwinPatchMerging(layers.Layer): - """Patch Merging Layer. + """Patch Merging Layer for Video Swin Model. Args: input_dim (int): Number of input channels. @@ -325,12 +352,6 @@ def build(self, input_shape): self.built = True def call(self, x): - """The call function. - - Args: - x: Input feature, - shape: (batch_size, depth, height, width, channel). - """ input_shape = ops.shape(x) height, width = ( input_shape[2], @@ -755,10 +776,9 @@ def __init__( zip(self.shift_size, self.window_size) ): if not (0 <= shift < window): - # TODO: Add more description. raise ValueError( - f"shift_size[{i}] must be in the " - "range 0 to window_size[{i}]" + f"shift_size[{i}] must be in the range 0 to less than window_size[{i}], " + f"but got shift_size[{i}]={shift} and window_size[{i}]={window}." 
                )

From b4f1534944ca80bfcea44d6fd39300ddb7e8c81 Mon Sep 17 00:00:00 2001
From: innat
Date: Tue, 5 Mar 2024 12:10:33 +0600
Subject: [PATCH 47/94] update copyright status: video swin layers test scripts

---
 keras_cv/layers/video_swin_layers.py      | 2 +-
 keras_cv/layers/video_swin_layers_test.py | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/keras_cv/layers/video_swin_layers.py b/keras_cv/layers/video_swin_layers.py
index d651e31e00..eb733f27b8 100644
--- a/keras_cv/layers/video_swin_layers.py
+++ b/keras_cv/layers/video_swin_layers.py
@@ -1,4 +1,4 @@
-# Copyright 2023 The KerasCV Authors
+# Copyright 2024 The KerasCV Authors
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
diff --git a/keras_cv/layers/video_swin_layers_test.py b/keras_cv/layers/video_swin_layers_test.py
index 862c2e3a77..4e72cb5e9a 100644
--- a/keras_cv/layers/video_swin_layers_test.py
+++ b/keras_cv/layers/video_swin_layers_test.py
@@ -1,4 +1,4 @@
-# Copyright 2023 The KerasCV Authors
+# Copyright 2024 The KerasCV Authors
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
From 75c5b665e145761f0611ac7593f66f8f35f35992 Mon Sep 17 00:00:00 2001
From: innat
Date: Tue, 5 Mar 2024 12:26:45 +0600
Subject: [PATCH 48/94] update copyright status: video swin backbone scripts

---
 .../models/backbones/video_swin/__init__.py   |  2 +-
 .../video_swin/video_swin_aliases.py          |  2 +-
 .../video_swin/video_swin_backbone.py         | 24 +++++++++++--------
 3 files changed, 16 insertions(+), 12 deletions(-)

diff --git a/keras_cv/models/backbones/video_swin/__init__.py b/keras_cv/models/backbones/video_swin/__init__.py
index 3992ffb59a..0e9cbb5ac9 100644
--- a/keras_cv/models/backbones/video_swin/__init__.py
+++ b/keras_cv/models/backbones/video_swin/__init__.py
@@ -1,4 +1,4 @@
-# Copyright 2023 The KerasCV Authors
+# Copyright 2024 The KerasCV Authors
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
diff --git a/keras_cv/models/backbones/video_swin/video_swin_aliases.py b/keras_cv/models/backbones/video_swin/video_swin_aliases.py
index 57ccf227dc..5044c2b469 100644
--- a/keras_cv/models/backbones/video_swin/video_swin_aliases.py
+++ b/keras_cv/models/backbones/video_swin/video_swin_aliases.py
@@ -1,4 +1,4 @@
-# Copyright 2023 The KerasCV Authors
+# Copyright 2024 The KerasCV Authors
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
diff --git a/keras_cv/models/backbones/video_swin/video_swin_backbone.py b/keras_cv/models/backbones/video_swin/video_swin_backbone.py
index ac4e9a07ab..1a6b5c5b69 100644
--- a/keras_cv/models/backbones/video_swin/video_swin_backbone.py
+++ b/keras_cv/models/backbones/video_swin/video_swin_backbone.py
@@ -1,4 +1,4 @@
-# Copyright 2023 The KerasCV Authors
+# Copyright 2024 The KerasCV Authors
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -39,28 +39,32 @@ class VideoSwinBackbone(Backbone):
             `keras.layers.Input()`) to use as video input for the model.
             Defaults to `None`.
         include_rescaling (bool, optional): Whether to rescale the inputs.
If - set to `True`, inputs will be passed through a - `Rescaling(1/255.0)` layer and normalize with - mean=[0.485, 0.456, 0.406] and std=[0.229, 0.224, 0.225], + set to `True`, inputs will be passed through a `Rescaling(1/255.0)` layer + and normalize with mean=[0.485, 0.456, 0.406] and std=[0.229, 0.224, 0.225]. Defaults to `False`. - patch_size (int | tuple(int)): Patch size. Default: (2,4,4). + patch_size (int | tuple(int)): The patch size for depth, height, and width + dimensions respectively. Default: (2,4,4). embed_dim (int): Number of linear projection output channels. Default to 96. depths (tuple[int]): Depths of each Swin Transformer stage. Default to [2, 2, 6, 2] num_heads (tuple[int]): Number of attention head of each stage. Default to [3, 6, 12, 24] - window_size (int): Window size. Default to [8, 7, 7]. + window_size (int): The window size for depth, height, and width + dimensions respectively. Default to [8, 7, 7]. mlp_ratio (float): Ratio of mlp hidden dim to embedding dim. Default to 4. qkv_bias (bool): If True, add a learnable bias to query, key, value. Default to True. qk_scale (float): Override default qk scale of head_dim ** -0.5 if set. Default to None. - drop_rate (float): Dropout rate. - attn_drop_rate (float): Attention dropout rate. Default: 0. - drop_path_rate (float): Stochastic depth rate. Default: 0.2. - patch_norm (bool): If True, add normalization after patch embedding. + drop_rate (float): Float between 0 and 1. Fraction of the input units to drop. + Default: 0. + attn_drop_rate (float): Float between 0 and 1. Attention dropout rate. + Default: 0. + drop_path_rate (float): Float between 0 and 1. Stochastic depth rate. + Default: 0.2. + patch_norm (bool): If True, add layer normalization after patch embedding. Default to False. 
References: From 0b9808ba70e6ee83be8ef827a62c9a5a78d95a1f Mon Sep 17 00:00:00 2001 From: innat Date: Tue, 5 Mar 2024 15:23:22 +0600 Subject: [PATCH 49/94] bug fixes: video swin backbone layers --- keras_cv/layers/video_swin_layers.py | 19 +++++++------- .../video_swin/video_swin_aliases.py | 2 +- .../video_swin/video_swin_backbone.py | 25 +++++++++++++++---- .../video_swin/video_swin_backbone_test.py | 5 ++-- 4 files changed, 32 insertions(+), 19 deletions(-) diff --git a/keras_cv/layers/video_swin_layers.py b/keras_cv/layers/video_swin_layers.py index eb733f27b8..bea5119e30 100644 --- a/keras_cv/layers/video_swin_layers.py +++ b/keras_cv/layers/video_swin_layers.py @@ -656,14 +656,13 @@ def build(self, input_shape): self.built = True def compute_output_shape(self, input_shape): - window_size, _ = get_window_size( - input_shape[1:-1], self.window_size, self.shift_size - ) - depth_pad = self._compute_dim_padded(input_shape[1], window_size[0]) - height_pad = self._compute_dim_padded(input_shape[2], window_size[1]) - width_pad = self._compute_dim_padded(input_shape[3], window_size[2]) - if self.downsample is not None: + window_size, _ = get_window_size( + input_shape[1:-1], self.window_size, self.shift_size + ) + depth_pad = self._compute_dim_padded(input_shape[1], window_size[0]) + height_pad = self._compute_dim_padded(input_shape[2], window_size[1]) + width_pad = self._compute_dim_padded(input_shape[3], window_size[2]) output_shape = ( input_shape[0], depth_pad, @@ -677,7 +676,7 @@ def compute_output_shape(self, input_shape): def call(self, x, training=None): input_shape = ops.shape(x) - batch_size, depth, height, width, _ = ( + batch_size, depth, height, width, channel = ( input_shape[0], input_shape[1], input_shape[2], @@ -688,7 +687,7 @@ def call(self, x, training=None): for block in self.blocks: x = block(x, self.attn_mask, training=training) - x = ops.reshape(x, [batch_size, depth, height, width, -1]) + x = ops.reshape(x, [batch_size, depth, height, width, channel]) if self.downsample is not None: x = self.downsample(x) @@ -812,7 +811,7 @@ def build(self, input_shape): self.attn.build((None, None, self.input_dim)) self.norm2 = self.norm_layer(axis=-1, epsilon=1e-05) - self.norm2.build((*input_shape[1:-1], self.input_dim)) + self.norm2.build((*input_shape[:-1], self.input_dim)) self.mlp = MLP( output_dim=self.input_dim, diff --git a/keras_cv/models/backbones/video_swin/video_swin_aliases.py b/keras_cv/models/backbones/video_swin/video_swin_aliases.py index 5044c2b469..7786804c90 100644 --- a/keras_cv/models/backbones/video_swin/video_swin_aliases.py +++ b/keras_cv/models/backbones/video_swin/video_swin_aliases.py @@ -1,4 +1,4 @@ -# Copyright 2024 The KerasCV Authors +# Copyright 202 The KerasCV Authors # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. diff --git a/keras_cv/models/backbones/video_swin/video_swin_backbone.py b/keras_cv/models/backbones/video_swin/video_swin_backbone.py index 1a6b5c5b69..7de6c4bb77 100644 --- a/keras_cv/models/backbones/video_swin/video_swin_backbone.py +++ b/keras_cv/models/backbones/video_swin/video_swin_backbone.py @@ -11,7 +11,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
- +import copy from functools import partial import numpy as np @@ -26,6 +26,9 @@ from keras_cv.models import utils from keras_cv.models.backbones.backbone import Backbone +from keras_cv.models.backbones.video_swin.video_swin_backbone_presets import backbone_presets # noqa: E501 +from keras_cv.models.backbones.video_swin.video_swin_backbone_presets import backbone_presets_with_weights # noqa: E501 +from keras_cv.utils.python_utils import classproperty @keras_cv_export("keras_cv.models.VideoSwinBackbone", package="keras_cv.models") class VideoSwinBackbone(Backbone): @@ -126,9 +129,10 @@ def __init__( # std=[58.395, 57.12, 57.375] for normalization. # So, if include_rescaling is set to True, then, to match with the # official scores, following normalization should be added. - x = (x - ops.array([0.485, 0.456, 0.406], dtype=x.dtype)) / ( - ops.array([0.229, 0.224, 0.225], dtype=x.dtype) - ) + x = layers.Normalization( + mean=[0.485, 0.456, 0.406], + variance=[0.229 ** 2, 0.224 ** 2, 0.225 ** 2] + )(x) norm_layer = partial(layers.LayerNormalization, epsilon=1e-05) @@ -162,7 +166,7 @@ def __init__( ) x = layer(x) - x = norm_layer(axis=-1, epsilon=1e-05, name="norm")(x) + x = norm_layer(axis=-1, epsilon=1e-05, name="videoswin_top_norm")(x) super().__init__(inputs=input_spec, outputs=x, **kwargs) self.embed_dim = embed_dim @@ -199,3 +203,14 @@ def get_config(self): } ) return config + + @classproperty + def presets(cls): + """Dictionary of preset names and configurations.""" + return copy.deepcopy(backbone_presets) + + @classproperty + def presets_with_weights(cls): + """Dictionary of preset names and configurations that include + weights.""" + return copy.deepcopy(backbone_presets_with_weights) diff --git a/keras_cv/models/backbones/video_swin/video_swin_backbone_test.py b/keras_cv/models/backbones/video_swin/video_swin_backbone_test.py index 4ab2787687..50c100cb7f 100644 --- a/keras_cv/models/backbones/video_swin/video_swin_backbone_test.py +++ b/keras_cv/models/backbones/video_swin/video_swin_backbone_test.py @@ -1,4 +1,4 @@ -# Copyright 2023 The KerasCV Authors +# Copyright 2024 The KerasCV Authors # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
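
The `layers.Normalization` call introduced in the patch above is intended to reproduce, after `Rescaling(1.0 / 255.0)`, the official [0, 255] pipeline quoted in the backbone comments (mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375]). Keras `Normalization` expects a variance, hence the squared std values. A standalone NumPy check of that equivalence, illustrative only:

import numpy as np

x = np.random.uniform(0.0, 255.0, size=(1, 2, 4, 4, 3)).astype("float32")

# Official Video Swin preprocessing on raw [0, 255] frames.
mean255 = np.array([123.675, 116.28, 103.53], dtype="float32")
std255 = np.array([58.395, 57.12, 57.375], dtype="float32")
official = (x - mean255) / std255

# KerasCV path: Rescaling(1/255) followed by Normalization(mean, std**2),
# which divides by sqrt(variance), i.e. by std.
mean1 = np.array([0.485, 0.456, 0.406], dtype="float32")
std1 = np.array([0.229, 0.224, 0.225], dtype="float32")
rescaled = (x / 255.0 - mean1) / std1

# Identical up to float32 rounding: 0.485 * 255 == 123.675, and so on.
np.testing.assert_allclose(official, rescaled, rtol=1e-4, atol=1e-4)
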
@@ -28,7 +28,6 @@ class TestViTDetBackbone(TestCase): @pytest.mark.large def test_call(self): - self.skipTest("TODO: Enable after Kaggle model is public") model = VideoSwinSBackbone() x = np.ones((1, 32, 224, 224, 3)) x_out = ops.convert_to_numpy(model(x)) @@ -36,7 +35,7 @@ def test_call(self): np.prod(tuple(x.shape)) for x in model.trainable_variables ) self.assertEqual(x_out.shape, (1, 16, 7, 7, 768)) - self.assertEqual(num_parameters, 49_509_078) + self.assertEqual(num_parameters, 27_850_470) @pytest.mark.extra_large def teat_save(self): From 0a4e2cb5baf38b2a822a05989fdb1c19b72207b8 Mon Sep 17 00:00:00 2001 From: innat Date: Tue, 5 Mar 2024 15:31:51 +0600 Subject: [PATCH 50/94] update get config of video swin backbone --- keras_cv/models/backbones/video_swin/video_swin_backbone.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/keras_cv/models/backbones/video_swin/video_swin_backbone.py b/keras_cv/models/backbones/video_swin/video_swin_backbone.py index 7de6c4bb77..33c11e877c 100644 --- a/keras_cv/models/backbones/video_swin/video_swin_backbone.py +++ b/keras_cv/models/backbones/video_swin/video_swin_backbone.py @@ -169,6 +169,8 @@ def __init__( x = norm_layer(axis=-1, epsilon=1e-05, name="videoswin_top_norm")(x) super().__init__(inputs=input_spec, outputs=x, **kwargs) + self.include_rescaling = include_rescaling + self.input_tensor = input_tensor self.embed_dim = embed_dim self.patch_size = patch_size self.window_size = window_size @@ -188,6 +190,9 @@ def get_config(self): config = super().get_config() config.update( { + "include_rescaling": self.include_rescaling, + "input_shape": self.input_shape[1:], + "input_tensor": self.input_tensor, "embed_dim": self.embed_dim, "patch_norm": self.patch_norm, "window_size": self.window_size, From fb732d0e9cd2bcb3a97ffc64c0cf196e1a3c79c2 Mon Sep 17 00:00:00 2001 From: innat Date: Tue, 5 Mar 2024 15:36:12 +0600 Subject: [PATCH 51/94] enable: video swin backbone test cases --- .../models/backbones/video_swin/video_swin_backbone_test.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/keras_cv/models/backbones/video_swin/video_swin_backbone_test.py b/keras_cv/models/backbones/video_swin/video_swin_backbone_test.py index 50c100cb7f..cdf1160bf9 100644 --- a/keras_cv/models/backbones/video_swin/video_swin_backbone_test.py +++ b/keras_cv/models/backbones/video_swin/video_swin_backbone_test.py @@ -39,7 +39,6 @@ def test_call(self): @pytest.mark.extra_large def teat_save(self): - self.skipTest("TODO: Enable after Kaggle model is public") # saving test model = VideoSwinSBackbone() x = np.ones((1, 32, 224, 224, 3)) @@ -52,7 +51,6 @@ def teat_save(self): @pytest.mark.extra_large def test_fit(self): - self.skipTest("TODO: Enable after Kaggle model is public") model = VideoSwinSBackbone() x = np.ones((1, 32, 224, 224, 3)) y = np.zeros((1, 16, 7, 7, 768)) @@ -60,7 +58,6 @@ def test_fit(self): model.fit(x, y, epochs=1) def test_pyramid_level_inputs_error(self): - self.skipTest("TODO: Enable after Kaggle model is public") model = VideoSwinSBackbone() with self.assertRaises(NotImplementedError, msg="doesn't compute"): model.pyramid_level_inputs From 44433354d643906e066f1b62283014a7ddb8d66e Mon Sep 17 00:00:00 2001 From: innat Date: Tue, 5 Mar 2024 15:48:12 +0600 Subject: [PATCH 52/94] update: video swin backbone test cases --- .../video_swin/video_swin_backbone_test.py | 17 ++++++++++------- 1 file changed, 10 insertions(+), 7 deletions(-) diff --git a/keras_cv/models/backbones/video_swin/video_swin_backbone_test.py 
b/keras_cv/models/backbones/video_swin/video_swin_backbone_test.py
index cdf1160bf9..201cd4de22 100644
--- a/keras_cv/models/backbones/video_swin/video_swin_backbone_test.py
+++ b/keras_cv/models/backbones/video_swin/video_swin_backbone_test.py
@@ -26,21 +26,24 @@

 class TestViTDetBackbone(TestCase):
+
     @pytest.mark.large
     def test_call(self):
-        model = VideoSwinSBackbone()
-        x = np.ones((1, 32, 224, 224, 3))
+        model = VideoSwinSBackbone(
+            include_rescaling=True, input_shape=(8,256,256,3)
+        )
+        x = np.ones((1, 8, 256, 256, 3))
         x_out = ops.convert_to_numpy(model(x))
         num_parameters = sum(
             np.prod(tuple(x.shape)) for x in model.trainable_variables
         )
-        self.assertEqual(x_out.shape, (1, 16, 7, 7, 768))
-        self.assertEqual(num_parameters, 27_850_470)
+        self.assertEqual(x_out.shape, (1, 4, 8, 8, 768))
+        self.assertEqual(num_parameters, 27_663_894)

     @pytest.mark.extra_large
     def test_save(self):
         # saving test
-        model = VideoSwinSBackbone()
+        model = VideoSwinSBackbone(include_rescaling=False)
         x = np.ones((1, 32, 224, 224, 3))
         x_out = ops.convert_to_numpy(model(x))
         path = os.path.join(self.get_temp_dir(), "model.keras")
@@ -51,13 +54,13 @@ def test_save(self):

     @pytest.mark.extra_large
     def test_fit(self):
-        model = VideoSwinSBackbone()
+        model = VideoSwinSBackbone(include_rescaling=False)
         x = np.ones((1, 32, 224, 224, 3))
         y = np.zeros((1, 16, 7, 7, 768))
         model.compile(optimizer="adam", loss="mse", metrics=["mse"])
         model.fit(x, y, epochs=1)

     def test_pyramid_level_inputs_error(self):
-        model = VideoSwinSBackbone()
+        model = VideoSwinSBackbone(include_rescaling=False)
         with self.assertRaises(NotImplementedError, msg="doesn't compute"):
             model.pyramid_level_inputs

From f3411cbefb3121f7bd0ea49a7156fe2c3f183d9c Mon Sep 17 00:00:00 2001
From: innat
Date: Tue, 5 Mar 2024 15:50:23 +0600
Subject: [PATCH 53/94] update: video swin backbone preset test cases

---
 .../backbones/video_swin/video_swin_backbone_presets_test.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/keras_cv/models/backbones/video_swin/video_swin_backbone_presets_test.py b/keras_cv/models/backbones/video_swin/video_swin_backbone_presets_test.py
index 1a3a5519c4..c8abba5c11 100644
--- a/keras_cv/models/backbones/video_swin/video_swin_backbone_presets_test.py
+++ b/keras_cv/models/backbones/video_swin/video_swin_backbone_presets_test.py
@@ -36,7 +36,6 @@ def setUp(self):
         self.input_batch = np.ones(shape=(1, 32, 224, 224, 3))

     def test_applications_model_output(self):
-        self.skipTest("TODO: Enable after Kaggle model is public")
         model = VideoSwinBackbone()
         model(self.input_batch)

From 00c67ba4adf6cbb8fbe924df901851ffa33c67b7 Mon Sep 17 00:00:00 2001
From: innat
Date: Tue, 5 Mar 2024 16:04:12 +0600
Subject: [PATCH 54/94] run formatters

---
 keras_cv/layers/video_swin_layers.py | 12 ++++---
 .../video_swin/video_swin_backbone.py | 35 +++++++++++++------
 .../video_swin/video_swin_backbone_test.py | 2 +-
 3 files changed, 33 insertions(+), 16 deletions(-)

diff --git a/keras_cv/layers/video_swin_layers.py b/keras_cv/layers/video_swin_layers.py
index bea5119e30..63d3b1f464 100644
--- a/keras_cv/layers/video_swin_layers.py
+++ b/keras_cv/layers/video_swin_layers.py
@@ -150,7 +150,8 @@ def compute_mask(depth, height, width, window_size, shift_size):
             (width - window_size[2]) // shift_size[2] + 1
         )`.
-    """
+    """  # noqa: E501
+
     img_mask = np.zeros((1, depth, height, width, 1))
     cnt = 0
     for d in (
@@ -661,7 +662,9 @@ def compute_output_shape(self, input_shape):
             input_shape[1:-1], self.window_size, self.shift_size
         )
         depth_pad = self._compute_dim_padded(input_shape[1], window_size[0])
-        height_pad = self._compute_dim_padded(input_shape[2], window_size[1])
+        height_pad = self._compute_dim_padded(
+            input_shape[2], window_size[1]
+        )
         width_pad = self._compute_dim_padded(input_shape[3], window_size[2])
         output_shape = (
             input_shape[0],
@@ -776,8 +779,9 @@ def __init__(
         ):
             if not (0 <= shift < window):
                 raise ValueError(
-                    f"shift_size[{i}] must be in the range 0 to less than window_size[{i}], "
-                    f"but got shift_size[{i}]={shift} and window_size[{i}]={window}."
+                    f"shift_size[{i}] must be in the range 0 to less than "
+                    f"window_size[{i}], but got shift_size[{i}]={shift} "
+                    f"and window_size[{i}]={window}."
                 )

     def build(self, input_shape):
diff --git a/keras_cv/models/backbones/video_swin/video_swin_backbone.py b/keras_cv/models/backbones/video_swin/video_swin_backbone.py
index 33c11e877c..5a19a98372 100644
--- a/keras_cv/models/backbones/video_swin/video_swin_backbone.py
+++ b/keras_cv/models/backbones/video_swin/video_swin_backbone.py
@@ -19,17 +19,20 @@

 from keras_cv.api_export import keras_cv_export
 from keras_cv.backend import keras
-from keras_cv.backend import ops
 from keras_cv.layers.video_swin_layers import VideoSwinBasicLayer
 from keras_cv.layers.video_swin_layers import VideoSwinPatchingAndEmbedding
 from keras_cv.layers.video_swin_layers import VideoSwinPatchMerging
 from keras_cv.models import utils
 from keras_cv.models.backbones.backbone import Backbone
-
-from keras_cv.models.backbones.video_swin.video_swin_backbone_presets import backbone_presets  # noqa: E501
-from keras_cv.models.backbones.video_swin.video_swin_backbone_presets import backbone_presets_with_weights  # noqa: E501
+from keras_cv.models.backbones.video_swin.video_swin_backbone_presets import (  # noqa: E501
+    backbone_presets,
+)
+from keras_cv.models.backbones.video_swin.video_swin_backbone_presets import (  # noqa: E501
+    backbone_presets_with_weights,
+)
 from keras_cv.utils.python_utils import classproperty

+
 @keras_cv_export("keras_cv.models.VideoSwinBackbone", package="keras_cv.models")
 class VideoSwinBackbone(Backbone):
     """A Video Swin Transformer backbone model.
@@ -42,10 +45,10 @@ class VideoSwinBackbone(Backbone):
             `keras.layers.Input()`) to use as video input for the model.
             Defaults to `None`.
         include_rescaling (bool, optional): Whether to rescale the inputs. If
-            set to `True`, inputs will be passed through a `Rescaling(1/255.0)` layer
+            set to `True`, inputs will be passed through a `Rescaling(1/255.0)` layer
             and are normalized with mean=[0.485, 0.456, 0.406] and
             std=[0.229, 0.224, 0.225]. Defaults to `False`.
-        patch_size (int | tuple(int)): The patch size for depth, height, and width
+        patch_size (int | tuple(int)): The patch size for depth, height, and width
             dimensions respectively. Default: (2,4,4).
         embed_dim (int): Number of linear projection output channels.
             Default to 96.
@@ -53,7 +56,7 @@ class VideoSwinBackbone(Backbone):
             Default to [2, 2, 6, 2]
         num_heads (tuple[int]): Number of attention heads for each stage.
             Default to [3, 6, 12, 24]
-        window_size (int): The window size for depth, height, and width
+        window_size (int): The window size for depth, height, and width
             dimensions respectively. Default to [8, 7, 7].
         mlp_ratio (float): Ratio of mlp hidden dim to embedding dim.
             Default to 4.
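For context on the `window_size` docstring above: in Swin-style models the shifted-window pass conventionally uses a shift of half the window along each axis, and inputs are padded up to window multiples before partitioning. A small illustrative sketch (the values mirror the defaults documented above; this is not code from the patch):

```python
# Half-window shifts and padding-to-multiples for the default configuration.
window_size = (8, 7, 7)  # (depth, height, width) windows
shift_size = tuple(w // 2 for w in window_size)  # -> (4, 3, 3)

def pad_to_multiple(dim, win):
    # Smallest size >= dim that the window extent divides evenly.
    return ((dim + win - 1) // win) * win

dims = (32, 224, 224)  # default input_shape, channels dropped
padded = tuple(pad_to_multiple(d, w) for d, w in zip(dims, window_size))

num_windows = 1
for d, w in zip(padded, window_size):
    num_windows *= d // w

print(padded)        # (32, 224, 224): already multiples of (8, 7, 7)
print(num_windows)   # 4096 windows per video
```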
@@ -63,13 +66,23 @@ class VideoSwinBackbone(Backbone):
             Default to None.
         drop_rate (float): Float between 0 and 1. Fraction of the input units to drop.
             Default: 0.
-        attn_drop_rate (float): Float between 0 and 1. Attention dropout rate.
+        attn_drop_rate (float): Float between 0 and 1. Attention dropout rate.
             Default: 0.
-        drop_path_rate (float): Float between 0 and 1. Stochastic depth rate.
+        drop_path_rate (float): Float between 0 and 1. Stochastic depth rate.
             Default: 0.2.
         patch_norm (bool): If True, add layer normalization after patch embedding.
             Default to False.

+    Example:
+    ```python
+    # Build video swin backbone without top layer
+    model = VideoSwinSBackbone(
+        include_rescaling=True, input_shape=(8, 256, 256, 3),
+    )
+    videos = tf.ones((1, 8, 256, 256, 3))
+    outputs = model.predict(videos)
+    ```
+
     References:
         - [Video Swin Transformer](https://arxiv.org/abs/2106.13230)
         - [Official Code](https://github.com/SwinTransformer/Video-Swin-Transformer)
@@ -131,7 +144,7 @@ def __init__(
         # official scores, following normalization should be added.
         x = layers.Normalization(
             mean=[0.485, 0.456, 0.406],
-            variance=[0.229 ** 2, 0.224 ** 2, 0.225 ** 2]
+            variance=[0.229**2, 0.224**2, 0.225**2],
         )(x)

         norm_layer = partial(layers.LayerNormalization, epsilon=1e-05)
@@ -208,7 +221,7 @@ def get_config(self):
             }
         )
         return config
-
+
     @classproperty
     def presets(cls):
         """Dictionary of preset names and configurations."""
diff --git a/keras_cv/models/backbones/video_swin/video_swin_backbone_test.py b/keras_cv/models/backbones/video_swin/video_swin_backbone_test.py
index 201cd4de22..b711c9f554 100644
--- a/keras_cv/models/backbones/video_swin/video_swin_backbone_test.py
+++ b/keras_cv/models/backbones/video_swin/video_swin_backbone_test.py
@@ -30,7 +30,7 @@ class TestViTDetBackbone(TestCase):
     @pytest.mark.large
     def test_call(self):
         model = VideoSwinSBackbone(
-            include_rescaling=True, input_shape=(8,256,256,3)
+            include_rescaling=True, input_shape=(8, 256, 256, 3)
         )
         x = np.ones((1, 8, 256, 256, 3))
         x_out = ops.convert_to_numpy(model(x))

From 9d3ab2ed1908bee977e9047d6f89298efebacc41 Mon Sep 17 00:00:00 2001
From: innat
Date: Tue, 5 Mar 2024 16:16:45 +0600
Subject: [PATCH 55/94] fix typos: video swin backbone test cases

---
 .../video_swin/video_swin_backbone_test.py | 14 +++++++-------
 1 file changed, 7 insertions(+), 7 deletions(-)

diff --git a/keras_cv/models/backbones/video_swin/video_swin_backbone_test.py b/keras_cv/models/backbones/video_swin/video_swin_backbone_test.py
index b711c9f554..7af9c0b7b5 100644
--- a/keras_cv/models/backbones/video_swin/video_swin_backbone_test.py
+++ b/keras_cv/models/backbones/video_swin/video_swin_backbone_test.py
@@ -19,17 +19,17 @@

 from keras_cv.backend import keras
 from keras_cv.backend import ops
-from keras_cv.models.backbones.video_swin.video_swin_aliases import (
-    VideoSwinSBackbone,
+from keras_cv.models.backbones.video_swin.video_swin_backbone import (
+    VideoSwinBackbone,
 )
 from keras_cv.tests.test_case import TestCase


-class TestViTDetBackbone(TestCase):
+class TestVideoSwinSBackbone(TestCase):

     @pytest.mark.large
     def test_call(self):
-        model = VideoSwinSBackbone(
+        model = VideoSwinBackbone(  # TODO: replace with aliases
             include_rescaling=True, input_shape=(8, 256, 256, 3)
         )
         x = np.ones((1, 8, 256, 256, 3))
@@ -43,7 +43,7 @@ def test_call(self):

     @pytest.mark.extra_large
     def test_save(self):
         # saving test
-        model = VideoSwinSBackbone(include_rescaling=False)
+        model = VideoSwinBackbone(include_rescaling=False)
         x = np.ones((1, 32, 224, 224, 3))
         x_out =
ops.convert_to_numpy(model(x))
         path = os.path.join(self.get_temp_dir(), "model.keras")
@@ -54,13 +54,13 @@ def test_save(self):

     @pytest.mark.extra_large
     def test_fit(self):
-        model = VideoSwinSBackbone(include_rescaling=False)
+        model = VideoSwinBackbone(include_rescaling=False)
         x = np.ones((1, 32, 224, 224, 3))
         y = np.zeros((1, 16, 7, 7, 768))
         model.compile(optimizer="adam", loss="mse", metrics=["mse"])
         model.fit(x, y, epochs=1)

     def test_pyramid_level_inputs_error(self):
-        model = VideoSwinSBackbone(include_rescaling=False)
+        model = VideoSwinBackbone(include_rescaling=False)
         with self.assertRaises(NotImplementedError, msg="doesn't compute"):
             model.pyramid_level_inputs

From 5bdc8b45d46beb030a89784c4f092536af055f7b Mon Sep 17 00:00:00 2001
From: innat
Date: Tue, 5 Mar 2024 16:44:21 +0600
Subject: [PATCH 56/94] add: non implemented property for test reason

---
 .../models/backbones/video_swin/video_swin_backbone.py | 7 +++++++
 .../backbones/video_swin/video_swin_backbone_test.py | 5 -----
 2 files changed, 7 insertions(+), 5 deletions(-)

diff --git a/keras_cv/models/backbones/video_swin/video_swin_backbone.py b/keras_cv/models/backbones/video_swin/video_swin_backbone.py
index 5a19a98372..0488758372 100644
--- a/keras_cv/models/backbones/video_swin/video_swin_backbone.py
+++ b/keras_cv/models/backbones/video_swin/video_swin_backbone.py
@@ -232,3 +232,10 @@ def presets_with_weights(cls):
         """Dictionary of preset names and configurations that include
         weights."""
         return copy.deepcopy(backbone_presets_with_weights)
+
+    @property
+    def pyramid_level_inputs(self):
+        raise NotImplementedError(
+            "The `ViTDetBackbone` model doesn't compute"
+            " pyramid level features."
+        )
diff --git a/keras_cv/models/backbones/video_swin/video_swin_backbone_test.py b/keras_cv/models/backbones/video_swin/video_swin_backbone_test.py
index 7af9c0b7b5..9adc543626 100644
--- a/keras_cv/models/backbones/video_swin/video_swin_backbone_test.py
+++ b/keras_cv/models/backbones/video_swin/video_swin_backbone_test.py
@@ -59,8 +59,3 @@ def test_fit(self):
         y = np.zeros((1, 16, 7, 7, 768))
         model.compile(optimizer="adam", loss="mse", metrics=["mse"])
         model.fit(x, y, epochs=1)
-
-    def test_pyramid_level_inputs_error(self):
-        model = VideoSwinBackbone(include_rescaling=False)
-        with self.assertRaises(NotImplementedError, msg="doesn't compute"):
-            model.pyramid_level_inputs

From cb5da28dbe77c1c7d213af10b58ddec09540ad18 Mon Sep 17 00:00:00 2001
From: innat
Date: Tue, 5 Mar 2024 16:46:47 +0600
Subject: [PATCH 57/94] fix: typos

---
 keras_cv/models/backbones/video_swin/video_swin_backbone.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/keras_cv/models/backbones/video_swin/video_swin_backbone.py b/keras_cv/models/backbones/video_swin/video_swin_backbone.py
index 0488758372..07008bf0c8 100644
--- a/keras_cv/models/backbones/video_swin/video_swin_backbone.py
+++ b/keras_cv/models/backbones/video_swin/video_swin_backbone.py
@@ -236,6 +236,6 @@ def presets_with_weights(cls):
     @property
     def pyramid_level_inputs(self):
         raise NotImplementedError(
-            "The `ViTDetBackbone` model doesn't compute"
+            "The `VideoSwinBackbone` model doesn't compute"
             " pyramid level features."
) From 82a84979737c3a4c38a420bbb7b8e7cbed326413 Mon Sep 17 00:00:00 2001 From: innat Date: Wed, 6 Mar 2024 13:24:09 +0600 Subject: [PATCH 58/94] add: video classifier test --- .../classification/video_classifier_test.py | 98 +++++++++++++++++++ 1 file changed, 98 insertions(+) create mode 100644 keras_cv/models/classification/video_classifier_test.py diff --git a/keras_cv/models/classification/video_classifier_test.py b/keras_cv/models/classification/video_classifier_test.py new file mode 100644 index 0000000000..c506c993c9 --- /dev/null +++ b/keras_cv/models/classification/video_classifier_test.py @@ -0,0 +1,98 @@ +# Copyright 2024 The KerasCV Authors +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Tests for VideoClassifier.""" + + +import os + +import numpy as np +import pytest +import tensorflow as tf +from absl.testing import parameterized + +from keras_cv.backend import keras +from keras_cv.backend import ops +from keras_cv.models.backbones.video_swin.video_swin_backbone import ( + VideoSwinBackbone, # TODO: update with aliases +) +from keras_cv.models.classification.video_classifier import VideoClassifier +from keras_cv.tests.test_case import TestCase + + +class VideoClassifierTest(TestCase): + def setUp(self): + self.input_batch = np.ones(shape=(2, 8, 224, 224, 3)) + self.dataset = tf.data.Dataset.from_tensor_slices( + (self.input_batch, tf.one_hot(tf.ones((10,), dtype="int32"), 2)) + ).batch(4) + + def test_valid_call(self): + model = VideoClassifier( + backbone=VideoSwinBackbone(include_rescaling=False), + num_classes=10, + ) + model(self.input_batch) + + @parameterized.named_parameters( + ("jit_compile_false", False), ("jit_compile_true", True) + ) + @pytest.mark.large # Fit is slow, so mark these large. + def test_classifier_fit(self, jit_compile): + model = VideoClassifier( + backbone=VideoSwinBackbone(include_rescaling=False), + num_classes=10, + ) + model.compile( + loss="categorical_crossentropy", + optimizer="adam", + metrics=["accuracy"], + jit_compile=jit_compile, + ) + model.fit(self.dataset) + + @parameterized.named_parameters( + ("avg_pooling", "avg"), ("max_pooling", "max") + ) + def test_pooling_arg_call(self, pooling): + model = VideoClassifier( + backbone=VideoSwinBackbone(include_rescaling=False), + num_classes=10, + pooling=pooling, + ) + model(self.input_batch) + + @pytest.mark.large # Saving is slow, so mark these large. + def test_saved_model(self): + model = VideoClassifier( + backbone=VideoSwinBackbone(include_rescaling=False), + num_classes=2, + ) + model_output = model(self.input_batch) + save_path = os.path.join(self.get_temp_dir(), "video_classifier.keras") + model.save(save_path) + restored_model = keras.models.load_model(save_path) + + # Check we got the real object back. + self.assertIsInstance(restored_model, VideoClassifier) + + # Check that output matches. 
+ restored_output = restored_model(self.input_batch) + self.assertAllClose( + ops.convert_to_numpy(model_output), + ops.convert_to_numpy(restored_output), + ) + + +if __name__ == "__main__": + tf.test.main() From e2f5056412ce9ea0dd42cde95a2f83a725c0db80 Mon Sep 17 00:00:00 2001 From: innat Date: Wed, 6 Mar 2024 14:19:18 +0600 Subject: [PATCH 59/94] update: video classifier test --- .../classification/video_classifier_test.py | 26 ++++++++++++------- 1 file changed, 16 insertions(+), 10 deletions(-) diff --git a/keras_cv/models/classification/video_classifier_test.py b/keras_cv/models/classification/video_classifier_test.py index c506c993c9..c647ba5480 100644 --- a/keras_cv/models/classification/video_classifier_test.py +++ b/keras_cv/models/classification/video_classifier_test.py @@ -32,17 +32,19 @@ class VideoClassifierTest(TestCase): def setUp(self): - self.input_batch = np.ones(shape=(2, 8, 224, 224, 3)) + self.input_batch = np.ones(shape=(10, 8, 224, 224, 3)) self.dataset = tf.data.Dataset.from_tensor_slices( - (self.input_batch, tf.one_hot(tf.ones((10,), dtype="int32"), 2)) + (self.input_batch, tf.one_hot(tf.ones((10,), dtype="int32"), 10)) ).batch(4) def test_valid_call(self): model = VideoClassifier( - backbone=VideoSwinBackbone(include_rescaling=False), + backbone=VideoSwinBackbone( + input_shape=(8, 224, 224, 3), include_rescaling=False + ), num_classes=10, ) - model(self.input_batch) + model.predict(self.input_batch) @parameterized.named_parameters( ("jit_compile_false", False), ("jit_compile_true", True) @@ -50,7 +52,9 @@ def test_valid_call(self): @pytest.mark.large # Fit is slow, so mark these large. def test_classifier_fit(self, jit_compile): model = VideoClassifier( - backbone=VideoSwinBackbone(include_rescaling=False), + backbone=VideoSwinBackbone( + input_shape=(8, 224, 224, 3), include_rescaling=True + ), num_classes=10, ) model.compile( @@ -70,15 +74,17 @@ def test_pooling_arg_call(self, pooling): num_classes=10, pooling=pooling, ) - model(self.input_batch) + model.predict(self.input_batch) @pytest.mark.large # Saving is slow, so mark these large. def test_saved_model(self): model = VideoClassifier( - backbone=VideoSwinBackbone(include_rescaling=False), - num_classes=2, + backbone=VideoSwinBackbone( + input_shape=(8, 224, 224, 3), include_rescaling=False + ), + num_classes=10, ) - model_output = model(self.input_batch) + model_output = model.predict(self.input_batch) save_path = os.path.join(self.get_temp_dir(), "video_classifier.keras") model.save(save_path) restored_model = keras.models.load_model(save_path) @@ -87,7 +93,7 @@ def test_saved_model(self): self.assertIsInstance(restored_model, VideoClassifier) # Check that output matches. 
- restored_output = restored_model(self.input_batch) + restored_output = restored_model.predict(self.input_batch) self.assertAllClose( ops.convert_to_numpy(model_output), ops.convert_to_numpy(restored_output), From 146f32fabe1e3308062d711d4df291d6aa2955a5 Mon Sep 17 00:00:00 2001 From: innat Date: Wed, 6 Mar 2024 14:41:59 +0600 Subject: [PATCH 60/94] update: video classifier test input shape --- keras_cv/models/classification/video_classifier_test.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/keras_cv/models/classification/video_classifier_test.py b/keras_cv/models/classification/video_classifier_test.py index c647ba5480..6e8b494ac4 100644 --- a/keras_cv/models/classification/video_classifier_test.py +++ b/keras_cv/models/classification/video_classifier_test.py @@ -70,7 +70,9 @@ def test_classifier_fit(self, jit_compile): ) def test_pooling_arg_call(self, pooling): model = VideoClassifier( - backbone=VideoSwinBackbone(include_rescaling=False), + backbone=VideoSwinBackbone( + input_shape=(8, 224, 224, 3), include_rescaling=True + ), num_classes=10, pooling=pooling, ) From d25746bd5e4fde42915832119a1f2877a620d164 Mon Sep 17 00:00:00 2001 From: innat Date: Wed, 6 Mar 2024 15:06:04 +0600 Subject: [PATCH 61/94] bug fix: mlp layer build method --- keras_cv/layers/video_swin_layers.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/keras_cv/layers/video_swin_layers.py b/keras_cv/layers/video_swin_layers.py index 63d3b1f464..5eda8dd66f 100644 --- a/keras_cv/layers/video_swin_layers.py +++ b/keras_cv/layers/video_swin_layers.py @@ -212,7 +212,7 @@ def __init__( def build(self, input_shape): self.fc1.build(input_shape) - self.fc2.build((*input_shape[1:-1], self.hidden_dim)) + self.fc2.build((*input_shape[:-1], self.hidden_dim)) self.built = True def call(self, x, training=None): @@ -823,7 +823,7 @@ def build(self, input_shape): activation=self._activation_identifier, drop_rate=self.drop_rate, ) - self.mlp.build((*input_shape[1:-1], self.input_dim)) + self.mlp.build((*input_shape[:-1], self.input_dim)) self.built = True def first_forward(self, x, mask_matrix, training): From 9779ad44130403bd3be5538d72a55f79036fb8f9 Mon Sep 17 00:00:00 2001 From: innat Date: Wed, 6 Mar 2024 15:13:43 +0600 Subject: [PATCH 62/94] updated: swin back layer build method --- keras_cv/layers/video_swin_layers.py | 34 +++++++++++++++------------- 1 file changed, 18 insertions(+), 16 deletions(-) diff --git a/keras_cv/layers/video_swin_layers.py b/keras_cv/layers/video_swin_layers.py index 5eda8dd66f..429a22529b 100644 --- a/keras_cv/layers/video_swin_layers.py +++ b/keras_cv/layers/video_swin_layers.py @@ -613,14 +613,24 @@ def _compute_dim_padded(self, input_dim, window_dim_size): ) def build(self, input_shape): - window_size, shift_size = get_window_size( + self.window_size, self.shift_size = get_window_size( input_shape[1:-1], self.window_size, self.shift_size ) - depth_pad = self._compute_dim_padded(input_shape[1], window_size[0]) - height_pad = self._compute_dim_padded(input_shape[2], window_size[1]) - width_pad = self._compute_dim_padded(input_shape[3], window_size[2]) + self.depth_pad = self._compute_dim_padded( + input_shape[1], self.window_size[0] + ) + self.height_pad = self._compute_dim_padded( + input_shape[2], self.window_size[1] + ) + self.width_pad = self._compute_dim_padded( + input_shape[3], self.window_size[2] + ) self.attn_mask = compute_mask( - depth_pad, height_pad, width_pad, window_size, shift_size + self.depth_pad, + self.height_pad, + self.width_pad, + 
self.window_size, + self.shift_size, ) # build blocks @@ -658,19 +668,11 @@ def build(self, input_shape): def compute_output_shape(self, input_shape): if self.downsample is not None: - window_size, _ = get_window_size( - input_shape[1:-1], self.window_size, self.shift_size - ) - depth_pad = self._compute_dim_padded(input_shape[1], window_size[0]) - height_pad = self._compute_dim_padded( - input_shape[2], window_size[1] - ) - width_pad = self._compute_dim_padded(input_shape[3], window_size[2]) output_shape = ( input_shape[0], - depth_pad, - height_pad // 2, - width_pad // 2, + self.depth_pad, + self.height_pad // 2, + self.width_pad // 2, 2 * self.input_dim, ) return output_shape From 7fa3f83ea03f38e52619450f82e5660b509eb333 Mon Sep 17 00:00:00 2001 From: innat Date: Wed, 6 Mar 2024 17:24:02 +0600 Subject: [PATCH 63/94] bug fix: use tf.TensorShape in compute_output_shape method --- keras_cv/layers/video_swin_layers.py | 17 +++++++++++------ 1 file changed, 11 insertions(+), 6 deletions(-) diff --git a/keras_cv/layers/video_swin_layers.py b/keras_cv/layers/video_swin_layers.py index 429a22529b..9ce3375ff5 100644 --- a/keras_cv/layers/video_swin_layers.py +++ b/keras_cv/layers/video_swin_layers.py @@ -13,6 +13,7 @@ # limitations under the License. import numpy as np +import tensorflow as tf from keras import layers from keras_cv.api_export import keras_cv_export @@ -668,12 +669,16 @@ def build(self, input_shape): def compute_output_shape(self, input_shape): if self.downsample is not None: - output_shape = ( - input_shape[0], - self.depth_pad, - self.height_pad // 2, - self.width_pad // 2, - 2 * self.input_dim, + # TODO: remove tensorflow dependencies. + # GitHub issue: fix https://github.com/keras-team/keras/issues/19259 # noqa: E501 + output_shape = tf.TensorShape( + [ + input_shape[0], + self.depth_pad, + self.height_pad // 2, + self.width_pad // 2, + 2 * self.input_dim, + ] ) return output_shape From c8aea501528924789e8bbc0916a0d179c17032c1 Mon Sep 17 00:00:00 2001 From: innat Date: Wed, 6 Mar 2024 19:19:56 +0600 Subject: [PATCH 64/94] update: video_classifier_test model.predict to model.call --- keras_cv/models/classification/video_classifier_test.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/keras_cv/models/classification/video_classifier_test.py b/keras_cv/models/classification/video_classifier_test.py index 6e8b494ac4..2fcd0d5ae0 100644 --- a/keras_cv/models/classification/video_classifier_test.py +++ b/keras_cv/models/classification/video_classifier_test.py @@ -44,7 +44,7 @@ def test_valid_call(self): ), num_classes=10, ) - model.predict(self.input_batch) + model(self.input_batch) @parameterized.named_parameters( ("jit_compile_false", False), ("jit_compile_true", True) @@ -76,7 +76,7 @@ def test_pooling_arg_call(self, pooling): num_classes=10, pooling=pooling, ) - model.predict(self.input_batch) + model(self.input_batch) @pytest.mark.large # Saving is slow, so mark these large. 
    def test_saved_model(self):

From 8287395e4b89ca7479afc2a9ffbede1a69c94a8b Mon Sep 17 00:00:00 2001
From: innat
Date: Wed, 6 Mar 2024 19:22:04 +0600
Subject: [PATCH 65/94] update test cases and format the code

---
 keras_cv/models/classification/video_classifier_test.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/keras_cv/models/classification/video_classifier_test.py b/keras_cv/models/classification/video_classifier_test.py
index 2fcd0d5ae0..ac188e292c 100644
--- a/keras_cv/models/classification/video_classifier_test.py
+++ b/keras_cv/models/classification/video_classifier_test.py
@@ -86,7 +86,7 @@ def test_saved_model(self):
             ),
             num_classes=10,
         )
-        model_output = model.predict(self.input_batch)
+        model_output = model(self.input_batch)
         save_path = os.path.join(self.get_temp_dir(), "video_classifier.keras")
         model.save(save_path)
         restored_model = keras.models.load_model(save_path)
@@ -95,7 +95,7 @@ def test_saved_model(self):
         self.assertIsInstance(restored_model, VideoClassifier)

         # Check that output matches.
-        restored_output = restored_model.predict(self.input_batch)
+        restored_output = restored_model(self.input_batch)
         self.assertAllClose(
             ops.convert_to_numpy(model_output),
             ops.convert_to_numpy(restored_output),

From e9a39978e719081d820822b06d684df23276e96c Mon Sep 17 00:00:00 2001
From: innat
Date: Sat, 9 Mar 2024 18:48:01 +0600
Subject: [PATCH 66/94] update docstrings and preset config

---
 keras_cv/layers/video_swin_layers.py          | 154 ++++++++++++------
 .../video_swin/video_swin_aliases.py          |  10 ++
 .../video_swin/video_swin_backbone.py         |   2 +-
 .../video_swin/video_swin_backbone_presets.py |  39 ++++-
 .../models/classification/video_classifier.py |  12 +-
 .../video_classifier_presets.py               |  80 +++++++--
 .../classification/video_classifier_test.py   |   2 +-
 7 files changed, 222 insertions(+), 77 deletions(-)

diff --git a/keras_cv/layers/video_swin_layers.py b/keras_cv/layers/video_swin_layers.py
index 9ce3375ff5..98869a76f2 100644
--- a/keras_cv/layers/video_swin_layers.py
+++ b/keras_cv/layers/video_swin_layers.py
@@ -23,17 +23,22 @@


 def window_partition(x, window_size):
-    """Partitions the input tensor into windows of specified size.
+    """Partitions a video tensor into non-overlapping windows of a specified size.

     Args:
-        x (Tensor): Input tensor of shape `(batch_size, depth, height, width, channel)`.
-        window_size (tuple[int]): Size of the window in each dimension (depth, height, width).
+        x: A tensor with shape (B, D, H, W, C), where:
+            - B: Batch size
+            - D: Number of frames (depth) in the video
+            - H: Height of the video frames
+            - W: Width of the video frames
+            - C: Number of channels in the video (e.g., RGB for color)
+        window_size: A tuple of ints of size 3 representing the window size
+            along each dimension (depth, height, width).

     Returns:
-        Tensor: Windows of shape `(batch_size*num_windows, window_size*window_size, channel)`,
-        where `num_windows = (
-            depth//window_size[0]) * (height//window_size[1]) * (width//window_size[2]
-        )`.
+        A tensor with shape (num_windows * B, window_size[0] * window_size[1] * window_size[2], C),
+        where each window is flattened over its depth, height, and width
+        extents, matching the reshape this function performs.
     """  # noqa: E501

     input_shape = ops.shape(x)
@@ -68,18 +73,26 @@ def window_reverse(windows, window_size, batch_size, depth, height, width):
-    """Reconstructs the original tensor from windows of specified size.
+    """Reconstructs the original video tensor from its partitioned windows.
+
+    This function assumes the windows were created using the `window_partition` function
+    with the same `window_size`.

     Args:
-        windows: (batch_size*num_windows, window_size, window_size, channel)
-        window_size (tuple[int]): Window size
-        height (int): Height of image
-        width (int): Width of image
+        windows: A tensor with shape (num_windows * batch_size,
+            window_size[0] * window_size[1] * window_size[2], channels), where:
+            - num_windows: Number of windows created during partitioning
+            - channels: Number of channels in the video (same as in `window_partition`)
+        window_size: A tuple of ints of size 3 representing the window size used
+            during partitioning (same as in `window_partition`).
+        batch_size: Batch size of the original video tensor (same as in `window_partition`).
+        depth: Number of frames (depth) in the original video tensor (same as in `window_partition`).
+        height: Height of the video frames in the original tensor (same as in `window_partition`).
+        width: Width of the video frames in the original tensor (same as in `window_partition`).

     Returns:
-        x: (batch_size, depth, height, width, channel)
+        A tensor with shape (batch_size, depth, height, width, channels), representing the
+        original video reconstructed from the provided windows.
     """  # noqa: E501
     x = ops.reshape(
         windows,
         [
             batch_size,
@@ -100,18 +113,30 @@ def get_window_size(x_size, window_size, shift_size=None):
-    """Computing window size based on: "Liu et al.,
-    Swin Transformer: Hierarchical Vision Transformer using Shifted Windows
-    "
-    https://github.com/microsoft/Swin-Transformer
+    """Computes the appropriate window size and potentially shift size for Swin Transformer.
+
+    This function implements the logic from the Swin Transformer paper by Ze Liu et al.
+    (https://arxiv.org/abs/2103.14030) to determine suitable window sizes
+    based on the input size and the provided base window size.

     Args:
-        x_size: input size.
-        window_size: local window size.
-        shift_size: window shifting size.
+        x_size: A tuple of ints of size 3 representing the input size (depth, height, width)
+            of the data (e.g., video).
+        window_size: A tuple of ints of size 3 representing the base window size
+            (depth, height, width) to use for partitioning.
+        shift_size: A tuple of ints of size 3 (optional) representing the window
+            shifting size (depth, height, width) for shifted window processing
+            used in Swin Transformer. If not provided, only window size is computed.

     Returns:
-        x: window_size, shift_size
+        A tuple or a pair of tuples:
+        - If `shift_size` is None, returns a single tuple representing the adjusted
+          window size that may be smaller than the provided `window_size` to ensure
+          it doesn't exceed the input size along any dimension.
+        - If `shift_size` is provided, returns a pair of tuples. The first tuple
+          represents the adjusted window size, and the second tuple represents the
+          adjusted shift size. The adjustments ensure both window size and shift size
+          do not exceed the corresponding dimensions in the input data.
""" # noqa: E501 use_window_size = list(window_size) @@ -132,25 +157,33 @@ def get_window_size(x_size, window_size, shift_size=None): def compute_mask(depth, height, width, window_size, shift_size): - """Computes attention mask for sliding window self-attention mechanism. + """Computes an attention mask for a sliding window self-attention mechanism + used in Video Swin Transformer. + + This function creates a mask to indicate which windows can attend to each other + during the self-attention operation. It considers non-overlapping and potentially + shifted windows based on the provided window size and shift size. Args: - depth (int): Depth of the input video. - height (int): Height of the input video. - width (int): Width of the input video. - window_size (tuple[int]): Size of the sliding window in each dimension (depth, height, width). - shift_size (tuple[int]): Size of the shifting step in each dimension (depth, height, width). + depth (int): Depth (number of frames) of the input video. + height (int): Height of the video frames. + width (int): Width of the video frames. + window_size (tuple[int]): Size of the sliding window in each dimension + (depth, height, width). + shift_size (tuple[int]): Size of the shifting step in each dimension + (depth, height, width). Returns: - Tensor: Attention mask of shape `(batch_size, num_windows, num_windows)`, - where `num_windows = ( - (depth - window_size[0]) // shift_size[0] + 1 - ) * ( - (height - window_size[1]) // shift_size[1] + 1 - ) * ( - (width - window_size[2]) // shift_size[2] + 1 - )`. - + A tensor of shape (batch_size, num_windows, num_windows), where: + - batch_size: Assumed to be 1 in this function. + - num_windows: Total number of windows covering the entire input based on + the formula: + (depth - window_size[0]) // shift_size[0] + 1) * + (height - window_size[1]) // shift_size[1] + 1) * + (width - window_size[2]) // shift_size[2] + 1) + Each element (attn_mask[i, j]) represents the attention weight between + window i and window j. A value of -100.0 indicates high negative attention + (preventing information flow), 0.0 indicates no mask effect. """ # noqa: E501 img_mask = np.zeros((1, depth, height, width, 1)) @@ -241,10 +274,15 @@ def get_config(self): "keras_cv.layers.VideoSwinPatchingAndEmbedding", package="keras_cv.layers" ) class VideoSwinPatchingAndEmbedding(keras.Model): - """Video to Patch Embedding layer for Video Swin Model. + """Video to Patch Embedding layer for Video Swin Transformer models. + + This layer performs the initial step in a Video Swin Transformer architecture by + partitioning the input video into 3D patches and embedding them into a vector + dimensional space. Args: - patch_size (int): Patch token size. Default: (2,4,4). + patch_size (int): Size of the patch along each dimension + (depth, height, width). Default: (2,4,4). embed_dim (int): Number of linear projection output channels. Default: 96. norm_layer (keras.layers, optional): Normalization layer. Default: None @@ -321,10 +359,14 @@ def get_config(self): class VideoSwinPatchMerging(layers.Layer): - """Patch Merging Layer for Video Swin Model. + """Patch Merging Layer in Video Swin Transformer models. + + This layer performs a downsampling step by merging four neighboring patches + from the previous layer into a single patch in the output. It achieves this + by concatenation and linear projection. Args: - input_dim (int): Number of input channels. + input_dim (int): Number of input channels in the feature maps. 
        norm_layer (keras.layers, optional): Normalization layer.
            Default: LayerNormalization
@@ -392,10 +434,12 @@ def get_config(self):


 class VideoSwinWindowAttention(keras.Model):
-    """Window based multi-head self attention (W-MSA) module with relative position bias.
+    """It tackles long-range video dependencies by splitting features into windows
+    and using relative position bias within each window for focused attention.

     It supports both shifted and non-shifted windows.

     Args:
+        input_dim (int): The number of input channels in the feature maps.
         window_size (tuple[int]): The temporal length, height and width of the window.
         num_heads (int): Number of attention heads.
         qkv_bias (bool, optional): If True, add a learnable bias to query, key, value. Default: True
@@ -554,7 +598,7 @@ def get_config(self):


 class VideoSwinBasicLayer(keras.Model):
-    """A basic Swin Transformer layer for one stage.
+    """A basic Video Swin Transformer layer for one stage.

     Args:
         input_dim (int): Number of feature channels
@@ -670,7 +714,7 @@ def build(self, input_shape):
     def compute_output_shape(self, input_shape):
         if self.downsample is not None:
             # TODO: remove tensorflow dependencies.
-            # GitHub issue: fix https://github.com/keras-team/keras/issues/19259  # noqa: E501
+            # GitHub issue: https://github.com/keras-team/keras/issues/19259  # noqa: E501
             output_shape = tf.TensorShape(
                 [
                     input_shape[0],
@@ -716,9 +760,9 @@ def get_config(self):
                 "depth": self.depth,
                 "qkv_bias": self.qkv_bias,
                 "qk_scale": self.qk_scale,
-                "drop": self.drop,
-                "attn_drop": self.attn_drop,
-                "drop_path": self.drop_path,
+                "drop_rate": self.drop_rate,
+                "attn_drop_rate": self.attn_drop_rate,
+                "drop_path_rate": self.drop_path_rate,
             }
         )
         return config
@@ -728,21 +772,25 @@ def get_config(self):
     "keras_cv.layers.VideoSwinTransformerBlock", package="keras_cv.layers"
 )
 class VideoSwinTransformerBlock(keras.Model):
-    """Swin Transformer Block.
+    """Video Swin Transformer Block.

     Args:
         input_dim (int): Number of feature channels.
         num_heads (int): Number of attention heads.
-        window_size (tuple[int]): Window size.
-        shift_size (tuple[int]): Shift size for SW-MSA.
+        window_size (tuple[int]): Local window size. Default: (2, 7, 7)
+        shift_size (tuple[int]): Shift size for SW-MSA. Default: (0, 0, 0)
         mlp_ratio (float): Ratio of mlp hidden dim to embedding dim.
-        qkv_bias (bool, optional): If True, add a learnable bias to query, key, value. Default: True
+            Default: 4.0
+        qkv_bias (bool, optional): If True, add a learnable bias to query, key, value.
+            Default: True
         qk_scale (float | None, optional): Override default qk scale of head_dim ** -0.5 if set.
+            Default: None
         drop (float, optional): Dropout rate. Default: 0.0
         attn_drop (float, optional): Attention dropout rate. Default: 0.0
         drop_path (float, optional): Stochastic depth rate. Default: 0.0
         act_layer (keras.layers.Activation, optional): Activation layer. Default: gelu
-        norm_layer (keras.layers, optional): Normalization layer. Default: LayerNormalization
+        norm_layer (keras.layers, optional): Normalization layer.
+ Default: LayerNormalization References: - [Video Swin Transformer](https://arxiv.org/abs/2106.13230) diff --git a/keras_cv/models/backbones/video_swin/video_swin_aliases.py b/keras_cv/models/backbones/video_swin/video_swin_aliases.py index 7786804c90..482d84c0f7 100644 --- a/keras_cv/models/backbones/video_swin/video_swin_aliases.py +++ b/keras_cv/models/backbones/video_swin/video_swin_aliases.py @@ -120,6 +120,16 @@ def presets(cls): "videoswin_base_kinetics400": copy.deepcopy( backbone_presets["videoswin_base_kinetics400"] ), + # TODO: update: should these be here or separate class for each! + # "videoswin_base_kinetics400_imagenet22k": copy.deepcopy( + # backbone_presets["videoswin_base_kinetics400_imagenet22k"] + # ), + # "videoswin_base_kinetics600_imagenet22k": copy.deepcopy( + # backbone_presets["videoswin_base_kinetics600_imagenet22k"] + # ), + # "videoswin_base_something_something_v2": copy.deepcopy( + # backbone_presets["videoswin_base_something_something_v2"] + # ), } @classproperty diff --git a/keras_cv/models/backbones/video_swin/video_swin_backbone.py b/keras_cv/models/backbones/video_swin/video_swin_backbone.py index 07008bf0c8..084b5a074f 100644 --- a/keras_cv/models/backbones/video_swin/video_swin_backbone.py +++ b/keras_cv/models/backbones/video_swin/video_swin_backbone.py @@ -79,7 +79,7 @@ class VideoSwinBackbone(Backbone): model = VideoSwinSBackbone( include_rescaling=True, input_shape=(8, 256, 256, 3), ) - videos = tf.ones((1, 8, 256, 256, 3)) + videos = keras.ops.ones((1, 8, 256, 256, 3)) outputs = model.predict(videos) ``` diff --git a/keras_cv/models/backbones/video_swin/video_swin_backbone_presets.py b/keras_cv/models/backbones/video_swin/video_swin_backbone_presets.py index bd06d137c3..5b8472f890 100644 --- a/keras_cv/models/backbones/video_swin/video_swin_backbone_presets.py +++ b/keras_cv/models/backbones/video_swin/video_swin_backbone_presets.py @@ -46,7 +46,7 @@ "description": ( "A tiny Video Swin backbone architecture. " "It is pretrained on ImageNet 1K dataset, and " - "trained on Kinetics 400 dataset." + "trained on Kinetics 400 dataset. " ), "params": 27_850_470, "official_name": "VideoSwinT", @@ -58,7 +58,10 @@ "description": ( "A small Video Swin backbone architecture. " "It is pretrained on ImageNet 1K dataset, and " - "trained on Kinetics 400 dataset." + "trained on Kinetics 400 dataset. " + "Published weight is capable of scoring " + "80.6% top1 and 94.5% top5 accuracy on the " + "Kinetics 400 dataset" ), "params": 49_509_078, "official_name": "VideoSwinS", @@ -70,19 +73,40 @@ "description": ( "A base Video Swin backbone architecture. " "It is pretrained on ImageNet 1K dataset, and " - "trained on Kinetics 400 dataset." + "trained on Kinetics 400 dataset. " + "Published weight is capable of scoring " + "80.6% top1 and 94.6% top5 accuracy on the " + "Kinetics 400 dataset" ), "params": 87_638_984, "official_name": "VideoSwinB", "path": "video_swin", }, }, - "videoswin_base_kinetics600": { + "videoswin_base_kinetics400_imagenet22k": { "metadata": { "description": ( "A base Video Swin backbone architecture. " "It is pretrained on ImageNet 22K dataset, and " - "trained on Kinetics 600 dataset." + "trained on Kinetics 400 dataset. 
" + "Published weight is capable of scoring " + "82.7% top1 and 95.5% top5 accuracy on the " + "Kinetics 400 dataset" + ), + "params": 87_638_984, + "official_name": "VideoSwinB", + "path": "video_swin", + }, + }, + "videoswin_base_kinetics600_imagenet22k": { + "metadata": { + "description": ( + "A base Video Swin backbone architecture. " + "It is pretrained on ImageNet 22K dataset, and " + "trained on Kinetics 600 dataset. " + "Published weight is capable of scoring " + "84.0% top1 and 96.5% top5 accuracy on the " + "Kinetics 600 dataset" ), "params": 87_638_984, "official_name": "VideoSwinB", @@ -94,7 +118,10 @@ "description": ( "A base Video Swin backbone architecture. " "It is pretrained on Kinetics 400 dataset, and " - "trained on Something Something V2 dataset." + "trained on Something Something V2 dataset. " + "Published weight is capable of scoring " + "69.6% top1 and 92.7% top5 accuracy on the " + "Kinetics 400 dataset" ), "params": 87_638_984, "official_name": "VideoSwinB", diff --git a/keras_cv/models/classification/video_classifier.py b/keras_cv/models/classification/video_classifier.py index 6313c76977..ff0ac49dce 100644 --- a/keras_cv/models/classification/video_classifier.py +++ b/keras_cv/models/classification/video_classifier.py @@ -1,4 +1,4 @@ -# Copyright 2023 The KerasCV Authors +# Copyright 2024 The KerasCV Authors # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -51,15 +51,15 @@ class VideoClassifier(Task): ```python input_data = keras.ops.ones(shape=(1, 32, 224, 224, 3)) - # Pretrained classifier (e.g., for imagenet categories) + # Pretrained classifier (e.g., for kinetics categories) model = keras_cv.models.VideoClassifier.from_preset( - "videoswin_tiny_imagenet_classifier", + "videoswin_tiny_kinetics400_classifier", ) output = model(input_data) # Pretrained backbone backbone = keras_cv.models.VideoSwinBackbone.from_preset( - "videoswin_tiny_imagenet", + "videoswin_tiny_kinetics400", ) model = keras_cv.models.VideoClassifier( backbone=backbone, @@ -69,7 +69,9 @@ class VideoClassifier(Task): # Randomly initialized backbone with a custom config model = keras_cv.models.VideoClassifier( - backbone=keras_cv.models.VideoSwinBackbone(), + backbone=keras_cv.models.VideoSwinBackbone( + include_rescaling=True + ), num_classes=400, ) output = model(input_data) diff --git a/keras_cv/models/classification/video_classifier_presets.py b/keras_cv/models/classification/video_classifier_presets.py index 9914e13433..e27163e01a 100644 --- a/keras_cv/models/classification/video_classifier_presets.py +++ b/keras_cv/models/classification/video_classifier_presets.py @@ -14,34 +14,92 @@ """VideoClassifier Task presets.""" classifier_presets = { - "videoswin_tiny_kinetics_classifier": { + "videoswin_tiny_kinetics400_classifier": { "metadata": { - "description": ("videoswin_tiny_kinetics "), - "params": 25_613_800, + "description": ( + "A tiny Video Swin architecture. " + "It is pretrained on ImageNet 1K dataset, and " + "trained on Kinetics 400 dataset. " + "Published weight is capable of scoring " + "78.8% top1 and 93.6% top5 accuracy on the " + "Kinetics 400 dataset" + ), + "params": 28_158_070, "official_name": "VideoClassifier", "path": "video_classifier", }, }, - "videoswin_small_kinetics_classifier": { + "videoswin_small_kinetics400_classifier": { "metadata": { - "description": ("videoswin_small_kinetics "), - "params": 25_613_800, + "description": ( + "A small Video Swin architecture. 
" + "It is pretrained on ImageNet 1K dataset, and " + "trained on Kinetics 400 dataset. " + "Published weight is capable of scoring " + "80.6% top1 and 94.5% top5 accuracy on the " + "Kinetics 400 dataset" + ), + "params": 49_816_678, "official_name": "VideoClassifier", "path": "video_classifier", }, }, - "videoswin_base_kinetics_classifier": { + "videoswin_base_kinetics400_classifier": { "metadata": { - "description": ("videoswin_base_kinetics "), - "params": 25_613_800, + "description": ( + "A base Video Swin architecture. " + "It is pretrained on ImageNet 1K dataset, and " + "trained on Kinetics 400 dataset. " + "Published weight is capable of scoring " + "80.6% top1 and 94.6% top5 accuracy on the " + "Kinetics 400 dataset" + ), + "params": 89_065_688, + "official_name": "VideoClassifier", + "path": "video_classifier", + }, + }, + "videoswin_base_kinetics400_imagenet22k": { + "metadata": { + "description": ( + "A base Video Swin architecture. " + "It is pretrained on ImageNet 22K dataset, and " + "trained on Kinetics 400 dataset. " + "Published weight is capable of scoring " + "82.7% top1 and 95.5% top5 accuracy on the " + "Kinetics 400 dataset" + ), + "params": 89_065_688, + "official_name": "VideoClassifier", + "path": "video_classifier", + }, + }, + "videoswin_base_kinetics600_imagenet22k": { + "metadata": { + "description": ( + "A base Video Swin architecture. " + "It is pretrained on ImageNet 22K dataset, and " + "trained on Kinetics 600 dataset. " + "Published weight is capable of scoring " + "84.0% top1 and 96.5% top5 accuracy on the " + "Kinetics 600 dataset" + ), + "params": 89_270_688, "official_name": "VideoClassifier", "path": "video_classifier", }, }, "videoswin_base_something_something_v2_classifier": { "metadata": { - "description": ("videoswin_base_something_something_v2 "), - "params": 25_613_800, + "description": ( + "A base Video Swin architecture. " + "It is pretrained on Kinetics 400 dataset, and " + "trained on Something Something V2 dataset. 
" + "Published weight is capable of scoring " + "69.6% top1 and 92.7% top5 accuracy on the " + "Kinetics 400 dataset" + ), + "params": 88_834_038, "official_name": "VideoClassifier", "path": "video_classifier", }, diff --git a/keras_cv/models/classification/video_classifier_test.py b/keras_cv/models/classification/video_classifier_test.py index ac188e292c..7e3af58fce 100644 --- a/keras_cv/models/classification/video_classifier_test.py +++ b/keras_cv/models/classification/video_classifier_test.py @@ -24,7 +24,7 @@ from keras_cv.backend import keras from keras_cv.backend import ops from keras_cv.models.backbones.video_swin.video_swin_backbone import ( - VideoSwinBackbone, # TODO: update with aliases + VideoSwinBackbone, # TODO: update with aliases (kaggle handle) ) from keras_cv.models.classification.video_classifier import VideoClassifier from keras_cv.tests.test_case import TestCase From aab1a6c9ff3cae1022372ee35415eabfcd8d82e6 Mon Sep 17 00:00:00 2001 From: innat Date: Mon, 11 Mar 2024 00:14:53 +0600 Subject: [PATCH 67/94] fix jax DynamicJaxprTrace issue for --- keras_cv/layers/video_swin_layers.py | 71 ++++++++++++++++------------ 1 file changed, 40 insertions(+), 31 deletions(-) diff --git a/keras_cv/layers/video_swin_layers.py b/keras_cv/layers/video_swin_layers.py index 98869a76f2..77eb7a77bc 100644 --- a/keras_cv/layers/video_swin_layers.py +++ b/keras_cv/layers/video_swin_layers.py @@ -393,24 +393,20 @@ def build(self, input_shape): self.norm.build( (batch_size, depth, height // 2, width // 2, 4 * channel) ) - self.built = True - - def call(self, x): - input_shape = ops.shape(x) - height, width = ( - input_shape[2], - input_shape[3], - ) - # padding if needed - paddings = [ + # compute padding if needed + self.pads = [ [0, 0], [0, 0], [0, ops.mod(height, 2)], [0, ops.mod(width, 2)], [0, 0], ] - x = ops.pad(x, paddings) + self.built = True + + def call(self, x): + # padding if needed + x = ops.pad(x, self.pads) x0 = x[:, :, 0::2, 0::2, :] # B D H/2 W/2 C x1 = x[:, :, 1::2, 0::2, :] # B D H/2 W/2 C x2 = x[:, :, 0::2, 1::2, :] # B D H/2 W/2 C @@ -879,6 +875,21 @@ def build(self, input_shape): drop_rate=self.drop_rate, ) self.mlp.build((*input_shape[:-1], self.input_dim)) + + # compute padding if needed. + # pad input feature maps to multiples of window size. + _, depth, height, width, _ = input_shape + pad_l = pad_t = pad_d0 = 0 + self.pad_d1 = ops.mod(-depth + self.window_size[0], self.window_size[0]) + self.pad_b = ops.mod(-height + self.window_size[1], self.window_size[1]) + self.pad_r = ops.mod(-width + self.window_size[2], self.window_size[2]) + self.pads = [ + [0, 0], + [pad_d0, self.pad_d1], + [pad_t, self.pad_b], + [pad_l, self.pad_r], + [0, 0], + ] self.built = True def first_forward(self, x, mask_matrix, training): @@ -890,22 +901,10 @@ def first_forward(self, x, mask_matrix, training): input_shape[3], input_shape[4], ) - window_size, shift_size = self.window_size, self.shift_size x = self.norm1(x) - # pad feature maps to multiples of window size - pad_l = pad_t = pad_d0 = 0 - pad_d1 = ops.mod(-depth + window_size[0], window_size[0]) - pad_b = ops.mod(-height + window_size[1], window_size[1]) - pad_r = ops.mod(-width + window_size[2], window_size[2]) - paddings = [ - [0, 0], - [pad_d0, pad_d1], - [pad_t, pad_b], - [pad_l, pad_r], - [0, 0], - ] - x = ops.pad(x, paddings) + # apply padding if needed. 
+ x = ops.pad(x, self.pads) input_shape = ops.shape(x) depth_pad, height_pad, width_pad = ( @@ -918,7 +917,11 @@ def first_forward(self, x, mask_matrix, training): if self.apply_cyclic_shift: shifted_x = ops.roll( x, - shift=(-shift_size[0], -shift_size[1], -shift_size[2]), + shift=( + -self.shift_size[0], + -self.shift_size[1], + -self.shift_size[2], + ), axis=(1, 2, 3), ) attn_mask = mask_matrix @@ -927,7 +930,7 @@ def first_forward(self, x, mask_matrix, training): attn_mask = None # partition windows - x_windows = window_partition(shifted_x, window_size) + x_windows = window_partition(shifted_x, self.window_size) # get attentions params attn_windows = self.attn(x_windows, mask=attn_mask, training=training) @@ -935,7 +938,7 @@ def first_forward(self, x, mask_matrix, training): # reverse the swin windows shifted_x = window_reverse( attn_windows, - window_size, + self.window_size, batch_size, depth_pad, height_pad, @@ -946,7 +949,11 @@ def first_forward(self, x, mask_matrix, training): if self.apply_cyclic_shift: x = ops.roll( shifted_x, - shift=(shift_size[0], shift_size[1], shift_size[2]), + shift=( + self.shift_size[0], + self.shift_size[1], + self.shift_size[2], + ), axis=(1, 2, 3), ) else: @@ -954,8 +961,10 @@ def first_forward(self, x, mask_matrix, training): # pad if required do_pad = ops.logical_or( - ops.greater(pad_d1, 0), - ops.logical_or(ops.greater(pad_r, 0), ops.greater(pad_b, 0)), + ops.greater(self.pad_d1, 0), + ops.logical_or( + ops.greater(self.pad_r, 0), ops.greater(self.pad_b, 0) + ), ) x = ops.cond( do_pad, lambda: x[:, :depth, :height, :width, :], lambda: x From ac781085a5bfd73814ffbebeee8d80ed6f00c66a Mon Sep 17 00:00:00 2001 From: innat Date: Tue, 12 Mar 2024 02:32:04 +0600 Subject: [PATCH 68/94] update config of backbone aliases --- .../video_swin/video_swin_aliases.py | 28 ++++++++++++------- 1 file changed, 18 insertions(+), 10 deletions(-) diff --git a/keras_cv/models/backbones/video_swin/video_swin_aliases.py b/keras_cv/models/backbones/video_swin/video_swin_aliases.py index 482d84c0f7..d30f50315f 100644 --- a/keras_cv/models/backbones/video_swin/video_swin_aliases.py +++ b/keras_cv/models/backbones/video_swin/video_swin_aliases.py @@ -45,11 +45,17 @@ class VideoSwinTBackbone(VideoSwinBackbone): def __new__( cls, + embed_dim=96, + depths=[2, 2, 6, 2], + num_heads=[3, 6, 12, 24], include_rescaling=True, **kwargs, ): kwargs.update( { + "embed_dim": embed_dim, + "depths": depths, + "num_heads": num_heads, "include_rescaling": include_rescaling, } ) @@ -74,11 +80,17 @@ def presets_with_weights(cls): class VideoSwinSBackbone(VideoSwinBackbone): def __new__( cls, + embed_dim=96, + depths=[2, 2, 18, 2], + num_heads=[3, 6, 12, 24], include_rescaling=True, **kwargs, ): kwargs.update( { + "embed_dim": embed_dim, + "depths": depths, + "num_heads": num_heads, "include_rescaling": include_rescaling, } ) @@ -103,11 +115,17 @@ def presets_with_weights(cls): class VideoSwinBBackbone(VideoSwinBackbone): def __new__( cls, + embed_dim=128, + depths=[2, 2, 18, 2], + num_heads=[4, 8, 16, 32], include_rescaling=True, **kwargs, ): kwargs.update( { + "embed_dim": embed_dim, + "depths": depths, + "num_heads": num_heads, "include_rescaling": include_rescaling, } ) @@ -120,16 +138,6 @@ def presets(cls): "videoswin_base_kinetics400": copy.deepcopy( backbone_presets["videoswin_base_kinetics400"] ), - # TODO: update: should these be here or separate class for each! 
- # "videoswin_base_kinetics400_imagenet22k": copy.deepcopy( - # backbone_presets["videoswin_base_kinetics400_imagenet22k"] - # ), - # "videoswin_base_kinetics600_imagenet22k": copy.deepcopy( - # backbone_presets["videoswin_base_kinetics600_imagenet22k"] - # ), - # "videoswin_base_something_something_v2": copy.deepcopy( - # backbone_presets["videoswin_base_something_something_v2"] - # ), } @classproperty From 1dbded9d3f4cab48cde6e5bbfa02431afe0a2f8c Mon Sep 17 00:00:00 2001 From: innat Date: Tue, 19 Mar 2024 00:28:11 +0600 Subject: [PATCH 69/94] add can run in mixed precision test --- .../backbones/video_swin/video_swin_backbone_test.py | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/keras_cv/models/backbones/video_swin/video_swin_backbone_test.py b/keras_cv/models/backbones/video_swin/video_swin_backbone_test.py index 9adc543626..5d2ce24c1b 100644 --- a/keras_cv/models/backbones/video_swin/video_swin_backbone_test.py +++ b/keras_cv/models/backbones/video_swin/video_swin_backbone_test.py @@ -59,3 +59,14 @@ def test_fit(self): y = np.zeros((1, 16, 7, 7, 768)) model.compile(optimizer="adam", loss="mse", metrics=["mse"]) model.fit(x, y, epochs=1) + + @pytest.mark.extra_large + def test_can_run_in_mixed_precision(self): + keras.mixed_precision.set_global_policy("mixed_float16") + model = VideoSwinBackbone( + include_rescaling=False, input_shape=(8, 224, 224, 3) + ) + x = np.ones((1, 8, 224, 224, 3)) + y = np.zeros((1, 4, 7, 7, 768)) + model.compile(optimizer="adam", loss="mse", metrics=["mse"]) + model.fit(x, y, epochs=1) From 42003a2c7a784d504f76ef64ac8e1b01f26524a0 Mon Sep 17 00:00:00 2001 From: innat Date: Tue, 19 Mar 2024 00:39:08 +0600 Subject: [PATCH 70/94] add can run on gray video --- .../backbones/video_swin/video_swin_backbone_test.py | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/keras_cv/models/backbones/video_swin/video_swin_backbone_test.py b/keras_cv/models/backbones/video_swin/video_swin_backbone_test.py index 5d2ce24c1b..01e603e185 100644 --- a/keras_cv/models/backbones/video_swin/video_swin_backbone_test.py +++ b/keras_cv/models/backbones/video_swin/video_swin_backbone_test.py @@ -70,3 +70,15 @@ def test_can_run_in_mixed_precision(self): y = np.zeros((1, 4, 7, 7, 768)) model.compile(optimizer="adam", loss="mse", metrics=["mse"]) model.fit(x, y, epochs=1) + + @pytest.mark.extra_large + def test_can_run_on_gray_video(self): + model = VideoSwinBackbone( + include_rescaling=False, + input_shape=(96, 96, 96, 1), + window_size=[6, 6, 6] + ) + x = np.ones((1, 8, 224, 224, 3)) + y = np.zeros((1, 48, 3, 3, 768)) + model.compile(optimizer="adam", loss="mse", metrics=["mse"]) + model.fit(x, y, epochs=1) From e7313894921e92f09108831306e2550ee565124b Mon Sep 17 00:00:00 2001 From: innat Date: Tue, 19 Mar 2024 00:42:32 +0600 Subject: [PATCH 71/94] minor fix --- .../models/backbones/video_swin/video_swin_aliases.py | 2 +- .../backbones/video_swin/video_swin_backbone_presets.py | 2 +- .../video_swin/video_swin_backbone_presets_test.py | 2 +- .../backbones/video_swin/video_swin_backbone_test.py | 8 ++++---- .../models/classification/video_classifier_presets.py | 2 +- 5 files changed, 8 insertions(+), 8 deletions(-) diff --git a/keras_cv/models/backbones/video_swin/video_swin_aliases.py b/keras_cv/models/backbones/video_swin/video_swin_aliases.py index d30f50315f..84233b0127 100644 --- a/keras_cv/models/backbones/video_swin/video_swin_aliases.py +++ b/keras_cv/models/backbones/video_swin/video_swin_aliases.py @@ -1,4 +1,4 @@ -# Copyright 202 The KerasCV Authors 
+# Copyright 2024 The KerasCV Authors # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. diff --git a/keras_cv/models/backbones/video_swin/video_swin_backbone_presets.py b/keras_cv/models/backbones/video_swin/video_swin_backbone_presets.py index 5b8472f890..0b507274cc 100644 --- a/keras_cv/models/backbones/video_swin/video_swin_backbone_presets.py +++ b/keras_cv/models/backbones/video_swin/video_swin_backbone_presets.py @@ -1,4 +1,4 @@ -# Copyright 2023 The KerasCV Authors +# Copyright 2024 The KerasCV Authors # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. diff --git a/keras_cv/models/backbones/video_swin/video_swin_backbone_presets_test.py b/keras_cv/models/backbones/video_swin/video_swin_backbone_presets_test.py index c8abba5c11..edd30fbba0 100644 --- a/keras_cv/models/backbones/video_swin/video_swin_backbone_presets_test.py +++ b/keras_cv/models/backbones/video_swin/video_swin_backbone_presets_test.py @@ -1,4 +1,4 @@ -# Copyright 2023 The KerasCV Authors +# Copyright 2024 The KerasCV Authors # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. diff --git a/keras_cv/models/backbones/video_swin/video_swin_backbone_test.py b/keras_cv/models/backbones/video_swin/video_swin_backbone_test.py index 01e603e185..8146917837 100644 --- a/keras_cv/models/backbones/video_swin/video_swin_backbone_test.py +++ b/keras_cv/models/backbones/video_swin/video_swin_backbone_test.py @@ -74,11 +74,11 @@ def test_can_run_in_mixed_precision(self): @pytest.mark.extra_large def test_can_run_on_gray_video(self): model = VideoSwinBackbone( - include_rescaling=False, - input_shape=(96, 96, 96, 1), - window_size=[6, 6, 6] + include_rescaling=False, + input_shape=(96, 96, 96, 1), + window_size=[6, 6, 6], ) - x = np.ones((1, 8, 224, 224, 3)) + x = np.ones((1, 96, 96, 96, 1)) y = np.zeros((1, 48, 3, 3, 768)) model.compile(optimizer="adam", loss="mse", metrics=["mse"]) model.fit(x, y, epochs=1) diff --git a/keras_cv/models/classification/video_classifier_presets.py b/keras_cv/models/classification/video_classifier_presets.py index e27163e01a..6e34e43904 100644 --- a/keras_cv/models/classification/video_classifier_presets.py +++ b/keras_cv/models/classification/video_classifier_presets.py @@ -1,4 +1,4 @@ -# Copyright 2023 The KerasCV Authors +# Copyright 2024 The KerasCV Authors # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
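
PATCH 70 and PATCH 71 together pin down grayscale support: the backbone
accepts single-channel video as long as the window size tiles the input.
A minimal standalone sketch of that usage, assuming the top-level
keras_cv.models.VideoSwinBackbone export (the import path is an
assumption; the constructor arguments and shapes come from the test above):

    import numpy as np

    from keras_cv.models import VideoSwinBackbone  # assumed export path

    # Grayscale video: (batch, frames, height, width, channels=1).
    # window_size is shrunk to [6, 6, 6] so the (96, 96, 96) input tiles
    # evenly into local attention windows.
    model = VideoSwinBackbone(
        include_rescaling=False,
        input_shape=(96, 96, 96, 1),
        window_size=[6, 6, 6],
    )
    features = model(np.ones((1, 96, 96, 96, 1), dtype="float32"))
    print(features.shape)  # (1, 48, 3, 3, 768), the test's target shape
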
From 77197c2c6041dd524c0993a4b414dc0c8c4a208f Mon Sep 17 00:00:00 2001
From: innat
Date: Wed, 20 Mar 2024 12:08:35 +0600
Subject: [PATCH 72/94] specify axis in keras.ops.take to match with tf.gather

---
 keras_cv/layers/video_swin_layers.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/keras_cv/layers/video_swin_layers.py b/keras_cv/layers/video_swin_layers.py
index 77eb7a77bc..bf2adae8cf 100644
--- a/keras_cv/layers/video_swin_layers.py
+++ b/keras_cv/layers/video_swin_layers.py
@@ -545,6 +545,7 @@ def call(self, x, mask=None, training=None):
         rel_pos_bias = ops.take(
             self.relative_position_bias_table,
             self.relative_position_index[:depth, :depth],
+            axis=0,
         )
         rel_pos_bias = ops.reshape(rel_pos_bias, [depth, depth, -1])
         rel_pos_bias = ops.transpose(rel_pos_bias, [2, 0, 1])

From aa2006792ffa925b4cc1f5ef45339d6034838a94 Mon Sep 17 00:00:00 2001
From: innat
Date: Mon, 25 Mar 2024 01:44:11 +0600
Subject: [PATCH 73/94] specify include_rescaling default in backbone class

---
 keras_cv/models/backbones/video_swin/video_swin_backbone.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/keras_cv/models/backbones/video_swin/video_swin_backbone.py b/keras_cv/models/backbones/video_swin/video_swin_backbone.py
index 084b5a074f..7aaee48263 100644
--- a/keras_cv/models/backbones/video_swin/video_swin_backbone.py
+++ b/keras_cv/models/backbones/video_swin/video_swin_backbone.py
@@ -91,7 +91,7 @@ class VideoSwinBackbone(Backbone):
     def __init__(
         self,
         *,
-        include_rescaling,
+        include_rescaling=False,
         input_shape=(32, 224, 224, 3),
         input_tensor=None,
         embed_dim=96,

From 11f33d791374314899c16589d3540285518d3105 Mon Sep 17 00:00:00 2001
From: innat
Date: Mon, 25 Mar 2024 01:46:19 +0600
Subject: [PATCH 74/94] remove shift size from get config of video basic layer

---
 keras_cv/layers/video_swin_layers.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/keras_cv/layers/video_swin_layers.py b/keras_cv/layers/video_swin_layers.py
index bf2adae8cf..acba25b435 100644
--- a/keras_cv/layers/video_swin_layers.py
+++ b/keras_cv/layers/video_swin_layers.py
@@ -753,7 +753,6 @@ def get_config(self):
                 "window_size": self.window_size,
                 "num_heads": self.num_heads,
                 "mlp_ratio": self.mlp_ratio,
-                "shift_size": self.shift_size,
                 "depth": self.depth,
                 "qkv_bias": self.qkv_bias,
                 "qk_scale": self.qk_scale,

From a2961b9401ed77521ef7007406941f3189d074aa Mon Sep 17 00:00:00 2001
From: innat
Date: Mon, 25 Mar 2024 01:49:22 +0600
Subject: [PATCH 75/94] add support for arbitrary input shape

---
 keras_cv/layers/video_swin_layers.py          | 18 ++++++------------
 .../video_swin/video_swin_backbone.py         |  6 ------
 2 files changed, 6 insertions(+), 18 deletions(-)

diff --git a/keras_cv/layers/video_swin_layers.py b/keras_cv/layers/video_swin_layers.py
index acba25b435..7da58885ce 100644
--- a/keras_cv/layers/video_swin_layers.py
+++ b/keras_cv/layers/video_swin_layers.py
@@ -418,6 +418,10 @@ def call(self, x):
         x = self.reduction(x)
 
         return x
+
+    def compute_output_shape(self, input_shape):
+        batch_size, depth, height, width, _ = input_shape
+        return (batch_size, depth, height // 2, width // 2, 2 * self.input_dim)
 
     def get_config(self):
         config = super().get_config()
@@ -710,18 +714,8 @@ def build(self, input_shape):
 
     def compute_output_shape(self, input_shape):
         if self.downsample is not None:
-            # TODO: remove tensorflow dependencies.
- # GitHub issue: https://github.com/keras-team/keras/issues/19259 # noqa: E501 - output_shape = tf.TensorShape( - [ - input_shape[0], - self.depth_pad, - self.height_pad // 2, - self.width_pad // 2, - 2 * self.input_dim, - ] - ) - return output_shape + input_shape = self.downsample.compute_output_shape(input_shape) + return input_shape return input_shape diff --git a/keras_cv/models/backbones/video_swin/video_swin_backbone.py b/keras_cv/models/backbones/video_swin/video_swin_backbone.py index 7aaee48263..beff9cebee 100644 --- a/keras_cv/models/backbones/video_swin/video_swin_backbone.py +++ b/keras_cv/models/backbones/video_swin/video_swin_backbone.py @@ -123,12 +123,6 @@ def __init__( "Depth, height and width of the video must be specified" " in `input_shape`." ) - if input_spec.shape[-3] != input_spec.shape[-2]: - raise ValueError( - "Input video must be square i.e. the height must" - " be equal to the width in the `input_shape`" - " tuple/tensor." - ) x = input_spec From 49b074a94cd259a20138ed8a4d672f70b7fabe08 Mon Sep 17 00:00:00 2001 From: innat Date: Mon, 25 Mar 2024 01:58:42 +0600 Subject: [PATCH 76/94] minor updates to swin layers --- keras_cv/layers/video_swin_layers.py | 52 +++++++------------ .../video_swin/video_swin_backbone.py | 2 +- 2 files changed, 21 insertions(+), 33 deletions(-) diff --git a/keras_cv/layers/video_swin_layers.py b/keras_cv/layers/video_swin_layers.py index 7da58885ce..56037b0cd6 100644 --- a/keras_cv/layers/video_swin_layers.py +++ b/keras_cv/layers/video_swin_layers.py @@ -299,19 +299,25 @@ def __init__( self.embed_dim = embed_dim self.norm_layer = norm_layer - def _compute_padding(self, dim, patch_size): + def __compute_padding(self, dim, patch_size): pad_amount = patch_size - (dim % patch_size) return [0, pad_amount if pad_amount != patch_size else 0] def build(self, input_shape): self.pads = [ [0, 0], - self._compute_padding(input_shape[1], self.patch_size[0]), - self._compute_padding(input_shape[2], self.patch_size[1]), - self._compute_padding(input_shape[3], self.patch_size[2]), + self.__compute_padding(input_shape[1], self.patch_size[0]), + self.__compute_padding(input_shape[2], self.patch_size[1]), + self.__compute_padding(input_shape[3], self.patch_size[2]), [0, 0], ] + if self.norm_layer is not None: + self.norm = self.norm_layer( + axis=-1, epsilon=1e-5, name="embed_norm" + ) + self.norm.build((None, None, None, None, self.embed_dim)) + self.proj = layers.Conv3D( self.embed_dim, kernel_size=self.patch_size, @@ -319,13 +325,6 @@ def build(self, input_shape): name="embed_proj", ) self.proj.build((None, None, None, None, input_shape[-1])) - - self.norm = None - if self.norm_layer is not None: - self.norm = self.norm_layer( - axis=-1, epsilon=1e-5, name="embed_norm" - ) - self.norm.build((None, None, None, None, self.embed_dim)) self.built = True def call(self, x): @@ -337,16 +336,6 @@ def call(self, x): return x - def compute_output_shape(self, input_shape): - spatial_dims = [ - (dim - self.patch_size[i]) // self.patch_size[i] + 1 - for i, dim in enumerate(input_shape[1:-1]) - ] - output_shape = ( - (input_shape[0],) + tuple(spatial_dims) + (self.embed_dim,) - ) - return output_shape - def get_config(self): config = super().get_config() config.update( @@ -387,7 +376,6 @@ def build(self, input_shape): (batch_size, depth, height // 2, width // 2, 4 * channel) ) - self.norm = None if self.norm_layer is not None: self.norm = self.norm_layer(axis=-1, epsilon=1e-5) self.norm.build( @@ -633,7 +621,7 @@ def __init__( attn_drop_rate=0.0, 
drop_path_rate=0.0, norm_layer=None, - downsample=None, + downsampling_layer=None, **kwargs, ): super().__init__(**kwargs) @@ -649,9 +637,9 @@ def __init__( self.attn_drop_rate = attn_drop_rate self.drop_path_rate = drop_path_rate self.norm_layer = norm_layer - self.downsample = downsample + self.downsampling_layer = downsampling_layer - def _compute_dim_padded(self, input_dim, window_dim_size): + def __compute_dim_padded(self, input_dim, window_dim_size): input_dim = ops.cast(input_dim, dtype="float32") window_dim_size = ops.cast(window_dim_size, dtype="float32") return ops.cast( @@ -662,13 +650,13 @@ def build(self, input_shape): self.window_size, self.shift_size = get_window_size( input_shape[1:-1], self.window_size, self.shift_size ) - self.depth_pad = self._compute_dim_padded( + self.depth_pad = self.__compute_dim_padded( input_shape[1], self.window_size[0] ) - self.height_pad = self._compute_dim_padded( + self.height_pad = self.__compute_dim_padded( input_shape[2], self.window_size[1] ) - self.width_pad = self._compute_dim_padded( + self.width_pad = self.__compute_dim_padded( input_shape[3], self.window_size[2] ) self.attn_mask = compute_mask( @@ -701,8 +689,8 @@ def build(self, input_shape): for i in range(self.depth) ] - if self.downsample is not None: - self.downsample = self.downsample( + if self.downsampling_layer is not None: + self.downsample = self.downsampling_layer( input_dim=self.input_dim, norm_layer=self.norm_layer ) self.downsample.build(input_shape) @@ -713,7 +701,7 @@ def build(self, input_shape): self.built = True def compute_output_shape(self, input_shape): - if self.downsample is not None: + if self.downsampling_layer is not None: input_shape = self.downsample.compute_output_shape(input_shape) return input_shape @@ -734,7 +722,7 @@ def call(self, x, training=None): x = ops.reshape(x, [batch_size, depth, height, width, channel]) - if self.downsample is not None: + if self.downsampling_layer is not None: x = self.downsample(x) return x diff --git a/keras_cv/models/backbones/video_swin/video_swin_backbone.py b/keras_cv/models/backbones/video_swin/video_swin_backbone.py index beff9cebee..0949d76071 100644 --- a/keras_cv/models/backbones/video_swin/video_swin_backbone.py +++ b/keras_cv/models/backbones/video_swin/video_swin_backbone.py @@ -166,7 +166,7 @@ def __init__( attn_drop_rate=attn_drop_rate, drop_path_rate=dpr[sum(depths[:i]) : sum(depths[: i + 1])], norm_layer=norm_layer, - downsample=( + downsampling_layer=( VideoSwinPatchMerging if (i < num_layers - 1) else None ), name=f"videoswin_basic_layer_{i + 1}", From 204e4b1cfb338eacc55dc922f8ec803b4dc17f4c Mon Sep 17 00:00:00 2001 From: innat Date: Mon, 25 Mar 2024 02:03:38 +0600 Subject: [PATCH 77/94] test method update for swin layers --- keras_cv/layers/video_swin_layers_test.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/keras_cv/layers/video_swin_layers_test.py b/keras_cv/layers/video_swin_layers_test.py index 4e72cb5e9a..0e9e71d00f 100644 --- a/keras_cv/layers/video_swin_layers_test.py +++ b/keras_cv/layers/video_swin_layers_test.py @@ -25,9 +25,9 @@ def test_patch_embedding_compute_output_shape(self): patch_embedding_model = VideoSwinPatchingAndEmbedding( patch_size=(2, 4, 4), embed_dim=96, norm_layer=None ) - input_shape = (None, 16, 32, 32, 3) - output_shape = patch_embedding_model.compute_output_shape(input_shape) - expected_output_shape = (None, 8, 8, 8, 96) + input_array = ops.ones(shape=(1, 16, 32, 32, 3)) + output_shape = patch_embedding_model(input_array).shape + 
expected_output_shape = (1, 8, 8, 8, 96)
         self.assertEqual(output_shape, expected_output_shape)
 
     def test_patch_embedding_get_config(self):

From 251495b8469ad1e5c210a918b7ca3ce3356caa5a Mon Sep 17 00:00:00 2001
From: innat
Date: Mon, 25 Mar 2024 02:09:15 +0600
Subject: [PATCH 78/94] update test method for swin backbone

---
 .../backbones/video_swin/video_swin_backbone_test.py | 10 ++++++++++
 1 file changed, 10 insertions(+)

diff --git a/keras_cv/models/backbones/video_swin/video_swin_backbone_test.py b/keras_cv/models/backbones/video_swin/video_swin_backbone_test.py
index 8146917837..1032f2d1fe 100644
--- a/keras_cv/models/backbones/video_swin/video_swin_backbone_test.py
+++ b/keras_cv/models/backbones/video_swin/video_swin_backbone_test.py
@@ -82,3 +82,13 @@ def test_can_run_on_gray_video(self):
         y = np.zeros((1, 48, 3, 3, 768))
         model.compile(optimizer="adam", loss="mse", metrics=["mse"])
         model.fit(x, y, epochs=1)
+
+    @pytest.mark.extra_large
+    def test_can_run_non_square_shape(self):
+        input_batch = np.ones(shape=(2, 8, 224, 256, 3))
+        model = VideoSwinBackbone(
+            input_shape=(8, 224, 256, 3),
+            include_rescaling=False,
+            num_classes=10,
+        )
+        model(input_batch)

From 599d48129a81a6d90ca43290a87642a1747b4b07 Mon Sep 17 00:00:00 2001
From: innat
Date: Mon, 25 Mar 2024 02:12:12 +0600
Subject: [PATCH 79/94] remove unused code

---
 keras_cv/layers/video_swin_layers.py | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/keras_cv/layers/video_swin_layers.py b/keras_cv/layers/video_swin_layers.py
index 56037b0cd6..9821a725b7 100644
--- a/keras_cv/layers/video_swin_layers.py
+++ b/keras_cv/layers/video_swin_layers.py
@@ -13,7 +13,6 @@
 # limitations under the License.
 
 import numpy as np
-import tensorflow as tf
 from keras import layers
 
 from keras_cv.api_export import keras_cv_export
@@ -406,7 +405,7 @@ def call(self, x):
         x = self.reduction(x)
 
         return x
-    
+
     def compute_output_shape(self, input_shape):
         batch_size, depth, height, width, _ = input_shape
         return (batch_size, depth, height // 2, width // 2, 2 * self.input_dim)

From a849b387a3210cd74ae1df9a08e79d29017a6230 Mon Sep 17 00:00:00 2001
From: innat
Date: Mon, 25 Mar 2024 02:29:48 +0600
Subject: [PATCH 80/94] bug fix in call method of patch embed layer

---
 keras_cv/layers/video_swin_layers.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/keras_cv/layers/video_swin_layers.py b/keras_cv/layers/video_swin_layers.py
index 9821a725b7..c5edf7128e 100644
--- a/keras_cv/layers/video_swin_layers.py
+++ b/keras_cv/layers/video_swin_layers.py
@@ -330,7 +330,7 @@ def call(self, x):
         x = ops.pad(x, self.pads)
         x = self.proj(x)
 
-        if self.norm is not None:
+        if self.norm_layer is not None:
             x = self.norm(x)
 
         return x

From f611b0e2d5fbb386f45fc4aa7f4db24d24a3b39f Mon Sep 17 00:00:00 2001
From: innat
Date: Mon, 25 Mar 2024 02:50:21 +0600
Subject: [PATCH 81/94] fix typo in patch merging layer

---
 keras_cv/layers/video_swin_layers.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/keras_cv/layers/video_swin_layers.py b/keras_cv/layers/video_swin_layers.py
index c5edf7128e..12b94ec198 100644
--- a/keras_cv/layers/video_swin_layers.py
+++ b/keras_cv/layers/video_swin_layers.py
@@ -400,7 +400,7 @@ def call(self, x):
         x3 = x[:, :, 1::2, 1::2, :]  # B D H/2 W/2 C
         x = ops.concatenate([x0, x1, x2, x3], axis=-1)  # B D H/2 W/2 4*C
 
-        if self.norm is not None:
+        if self.norm_layer is not None:
             x = self.norm(x)
 
         x = self.reduction(x)

From b7d26e4ddf912531e0a4b12085bb0cb8b6dd5885 Mon Sep 17 00:00:00 2001
From: innat
Date: Mon, 25 Mar 2024 
04:43:19 +0600 Subject: [PATCH 82/94] minor fix --- keras_cv/models/backbones/video_swin/video_swin_backbone_test.py | 1 - 1 file changed, 1 deletion(-) diff --git a/keras_cv/models/backbones/video_swin/video_swin_backbone_test.py b/keras_cv/models/backbones/video_swin/video_swin_backbone_test.py index 1032f2d1fe..0e049ea395 100644 --- a/keras_cv/models/backbones/video_swin/video_swin_backbone_test.py +++ b/keras_cv/models/backbones/video_swin/video_swin_backbone_test.py @@ -89,6 +89,5 @@ def test_can_run_non_square_shape(self): model = VideoSwinBackbone( input_shape=(8, 224, 256, 3), include_rescaling=False, - num_classes=10, ) model(input_batch) From e3e02dc3fdc474bc25845ef940c42183cd05b252 Mon Sep 17 00:00:00 2001 From: innat Date: Mon, 25 Mar 2024 22:09:38 +0600 Subject: [PATCH 83/94] fix keras.ops.cond issue with jax --- keras_cv/layers/video_swin_layers.py | 14 +++++--------- 1 file changed, 5 insertions(+), 9 deletions(-) diff --git a/keras_cv/layers/video_swin_layers.py b/keras_cv/layers/video_swin_layers.py index 12b94ec198..23fb3e63e0 100644 --- a/keras_cv/layers/video_swin_layers.py +++ b/keras_cv/layers/video_swin_layers.py @@ -871,6 +871,9 @@ def build(self, input_shape): [pad_l, self.pad_r], [0, 0], ] + self.do_pad = any( + value > 0 for value in (self.pad_d1, self.pad_r, self.pad_b) + ) self.built = True def first_forward(self, x, mask_matrix, training): @@ -941,15 +944,8 @@ def first_forward(self, x, mask_matrix, training): x = shifted_x # pad if required - do_pad = ops.logical_or( - ops.greater(self.pad_d1, 0), - ops.logical_or( - ops.greater(self.pad_r, 0), ops.greater(self.pad_b, 0) - ), - ) - x = ops.cond( - do_pad, lambda: x[:, :depth, :height, :width, :], lambda: x - ) + if self.do_pad: + return x[:, :depth, :height, :width, :] return x From a626b1f680a4f8e5a7c8e5a45417c81c7082bd96 Mon Sep 17 00:00:00 2001 From: innat Date: Mon, 25 Mar 2024 23:07:50 +0600 Subject: [PATCH 84/94] no test for jit compile in torch --- keras_cv/models/classification/video_classifier_test.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/keras_cv/models/classification/video_classifier_test.py b/keras_cv/models/classification/video_classifier_test.py index 7e3af58fce..94843aac74 100644 --- a/keras_cv/models/classification/video_classifier_test.py +++ b/keras_cv/models/classification/video_classifier_test.py @@ -51,6 +51,9 @@ def test_valid_call(self): ) @pytest.mark.large # Fit is slow, so mark these large. 
def test_classifier_fit(self, jit_compile): + if jit_compile and keras.backend.backend() == "torch": + self.skipTest("TODO: Torch Backend `jit_compile` fails on GPU.") + self.supports_jit = False model = VideoClassifier( backbone=VideoSwinBackbone( input_shape=(8, 224, 224, 3), include_rescaling=True From c484445745a5aaf96f87e328a55ac72a5154b7b2 Mon Sep 17 00:00:00 2001 From: innat Date: Mon, 25 Mar 2024 23:25:47 +0600 Subject: [PATCH 85/94] reduce tensor size for forward test --- keras_cv/models/classification/video_classifier_test.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/keras_cv/models/classification/video_classifier_test.py b/keras_cv/models/classification/video_classifier_test.py index 94843aac74..b3d658dbef 100644 --- a/keras_cv/models/classification/video_classifier_test.py +++ b/keras_cv/models/classification/video_classifier_test.py @@ -72,6 +72,7 @@ def test_classifier_fit(self, jit_compile): ("avg_pooling", "avg"), ("max_pooling", "max") ) def test_pooling_arg_call(self, pooling): + input_batch = np.ones(shape=(2, 8, 224, 224, 3)) model = VideoClassifier( backbone=VideoSwinBackbone( input_shape=(8, 224, 224, 3), include_rescaling=True @@ -79,17 +80,18 @@ def test_pooling_arg_call(self, pooling): num_classes=10, pooling=pooling, ) - model(self.input_batch) + model(input_batch) @pytest.mark.large # Saving is slow, so mark these large. def test_saved_model(self): + input_batch = np.ones(shape=(2, 8, 224, 224, 3)) model = VideoClassifier( backbone=VideoSwinBackbone( input_shape=(8, 224, 224, 3), include_rescaling=False ), num_classes=10, ) - model_output = model(self.input_batch) + model_output = model(input_batch) save_path = os.path.join(self.get_temp_dir(), "video_classifier.keras") model.save(save_path) restored_model = keras.models.load_model(save_path) @@ -98,7 +100,7 @@ def test_saved_model(self): self.assertIsInstance(restored_model, VideoClassifier) # Check that output matches. 
- restored_output = restored_model(self.input_batch) + restored_output = restored_model(input_batch) self.assertAllClose( ops.convert_to_numpy(model_output), ops.convert_to_numpy(restored_output), From 45945c96933141766ecb62136bfd056d5c7966b6 Mon Sep 17 00:00:00 2001 From: innat Date: Thu, 28 Mar 2024 17:02:26 +0600 Subject: [PATCH 86/94] minor fix --- keras_cv/layers/video_swin_layers.py | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) diff --git a/keras_cv/layers/video_swin_layers.py b/keras_cv/layers/video_swin_layers.py index 23fb3e63e0..e6d930f91a 100644 --- a/keras_cv/layers/video_swin_layers.py +++ b/keras_cv/layers/video_swin_layers.py @@ -820,10 +820,7 @@ def build(self, input_shape): self.window_size, self.shift_size = get_window_size( input_shape[1:-1], self.window_size, self.shift_size ) - - self.apply_cyclic_shift = False - if any(i > 0 for i in self.shift_size): - self.apply_cyclic_shift = True + self.apply_cyclic_shift = any(i > 0 for i in self.shift_size) # layers self.drop_path = ( @@ -871,7 +868,7 @@ def build(self, input_shape): [pad_l, self.pad_r], [0, 0], ] - self.do_pad = any( + self.apply_pad = any( value > 0 for value in (self.pad_d1, self.pad_r, self.pad_b) ) self.built = True @@ -944,7 +941,7 @@ def first_forward(self, x, mask_matrix, training): x = shifted_x # pad if required - if self.do_pad: + if self.apply_pad: return x[:, :depth, :height, :width, :] return x From f866d12cdcae6ff38990d2e1c5b40debe8c1e05b Mon Sep 17 00:00:00 2001 From: innat Date: Sun, 31 Mar 2024 12:06:51 +0600 Subject: [PATCH 87/94] remove kcv export decorator --- keras_cv/layers/video_swin_layers.py | 6 ------ 1 file changed, 6 deletions(-) diff --git a/keras_cv/layers/video_swin_layers.py b/keras_cv/layers/video_swin_layers.py index e6d930f91a..0e89ab6e27 100644 --- a/keras_cv/layers/video_swin_layers.py +++ b/keras_cv/layers/video_swin_layers.py @@ -269,9 +269,6 @@ def get_config(self): return config -@keras_cv_export( - "keras_cv.layers.VideoSwinPatchingAndEmbedding", package="keras_cv.layers" -) class VideoSwinPatchingAndEmbedding(keras.Model): """Video to Patch Embedding layer for Video Swin Transformer models. @@ -745,9 +742,6 @@ def get_config(self): return config -@keras_cv_export( - "keras_cv.layers.VideoSwinTransformerBlock", package="keras_cv.layers" -) class VideoSwinTransformerBlock(keras.Model): """Video Swin Transformer Block. From bfb62a47cc1147097fa5fde53abcb1c428796988 Mon Sep 17 00:00:00 2001 From: innat Date: Sun, 31 Mar 2024 12:10:50 +0600 Subject: [PATCH 88/94] update keras.Layer import --- keras_cv/layers/video_swin_layers.py | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/keras_cv/layers/video_swin_layers.py b/keras_cv/layers/video_swin_layers.py index 0e89ab6e27..8f58f230b2 100644 --- a/keras_cv/layers/video_swin_layers.py +++ b/keras_cv/layers/video_swin_layers.py @@ -13,9 +13,7 @@ # limitations under the License. import numpy as np -from keras import layers -from keras_cv.api_export import keras_cv_export from keras_cv.backend import keras from keras_cv.backend import ops from keras_cv.layers import DropPath @@ -214,7 +212,7 @@ def compute_mask(depth, height, width, window_size, shift_size): return attn_mask -class MLP(layers.Layer): +class MLP(keras.Layer): """A Multilayer perceptron(MLP) layer. Args: @@ -343,7 +341,7 @@ def get_config(self): return config -class VideoSwinPatchMerging(layers.Layer): +class VideoSwinPatchMerging(keras.Layer): """Patch Merging Layer in Video Swin Transformer models. 
     This layer performs a downsampling step by merging four neighboring patches

From 57f0012786e2e92314a3a45ff56bc02b3e495d58 Mon Sep 17 00:00:00 2001
From: innat
Date: Sun, 31 Mar 2024 13:34:00 +0600
Subject: [PATCH 89/94] remove unused layer import

---
 keras_cv/layers/__init__.py | 3 ---
 1 file changed, 3 deletions(-)

diff --git a/keras_cv/layers/__init__.py b/keras_cv/layers/__init__.py
index ae4f6007f5..0bfa2aa8ec 100644
--- a/keras_cv/layers/__init__.py
+++ b/keras_cv/layers/__init__.py
@@ -135,9 +135,6 @@
 )
 from keras_cv.layers.spatial_pyramid import SpatialPyramidPooling
 from keras_cv.layers.transformer_encoder import TransformerEncoder
-from keras_cv.layers.video_swin_layers import VideoSwinBasicLayer
-from keras_cv.layers.video_swin_layers import VideoSwinPatchingAndEmbedding
-from keras_cv.layers.video_swin_layers import VideoSwinPatchMerging
 from keras_cv.layers.vit_det_layers import AddRelativePositionalEmbedding
 from keras_cv.layers.vit_det_layers import MultiHeadAttentionWithRelativePE
 from keras_cv.layers.vit_det_layers import ViTDetPatchingAndEmbedding

From 7602052986cf38508d093603fd9c702ce269a5e4 Mon Sep 17 00:00:00 2001
From: innat
Date: Sun, 31 Mar 2024 13:42:19 +0600
Subject: [PATCH 90/94] use keras.layers instead of layers

---
 keras_cv/layers/video_swin_layers.py | 26 ++++++++++++++------------
 1 file changed, 14 insertions(+), 12 deletions(-)

diff --git a/keras_cv/layers/video_swin_layers.py b/keras_cv/layers/video_swin_layers.py
index 8f58f230b2..a265052c23 100644
--- a/keras_cv/layers/video_swin_layers.py
+++ b/keras_cv/layers/video_swin_layers.py
@@ -236,10 +236,10 @@ def __init__(
         self.hidden_dim = hidden_dim
         self._activation_identifier = activation
         self.drop_rate = drop_rate
-        self.activation = layers.Activation(self._activation_identifier)
-        self.fc1 = layers.Dense(self.hidden_dim)
-        self.fc2 = layers.Dense(self.output_dim)
-        self.dropout = layers.Dropout(self.drop_rate)
+        self.activation = keras.layers.Activation(self._activation_identifier)
+        self.fc1 = keras.layers.Dense(self.hidden_dim)
+        self.fc2 = keras.layers.Dense(self.output_dim)
+        self.dropout = keras.layers.Dropout(self.drop_rate)
 
     def build(self, input_shape):
         self.fc1.build(input_shape)
@@ -312,7 +312,7 @@ def build(self, input_shape):
             )
             self.norm.build((None, None, None, None, self.embed_dim))
 
-        self.proj = layers.Conv3D(
+        self.proj = keras.layers.Conv3D(
             self.embed_dim,
             kernel_size=self.patch_size,
             strides=self.patch_size,
@@ -365,7 +365,7 @@ def __init__(self, input_dim, norm_layer=None, **kwargs):
 
     def build(self, input_shape):
         batch_size, depth, height, width, channel = input_shape
-        self.reduction = layers.Dense(2 * self.input_dim, use_bias=False)
+        self.reduction = keras.layers.Dense(2 * self.input_dim, use_bias=False)
         self.reduction.build(
             (batch_size, depth, height // 2, width // 2, 4 * channel)
         )
@@ -500,10 +500,12 @@ def build(self, input_shape):
         )
 
         # layers
-        self.qkv = layers.Dense(self.input_dim * 3, use_bias=self.qkv_bias)
-        self.attn_drop = layers.Dropout(self.attn_drop_rate)
-        self.proj = layers.Dense(self.input_dim)
-        self.proj_drop = layers.Dropout(self.proj_drop_rate)
+        self.qkv = keras.layers.Dense(
+            self.input_dim * 3, use_bias=self.qkv_bias
+        )
+        self.attn_drop = keras.layers.Dropout(self.attn_drop_rate)
+        self.proj = keras.layers.Dense(self.input_dim)
+        self.proj_drop = keras.layers.Dropout(self.proj_drop_rate)
         self.qkv.build(input_shape)
         self.proj.build(input_shape)
         self.built = True
@@ -779,7 +781,7 @@ def __init__(
         attn_drop_rate=0.0,
         drop_path_rate=0.0,
         activation="gelu",
-        norm_layer=layers.LayerNormalization,
+        norm_layer=keras.layers.LayerNormalization,
         **kwargs,
     ):
         super().__init__(**kwargs)
@@ -818,7 +820,7 @@ def build(self, input_shape):
         self.drop_path = (
             DropPath(self.drop_path_rate)
             if self.drop_path_rate > 0.0
-            else layers.Identity()
+            else keras.layers.Identity()
         )
 
         self.norm1 = self.norm_layer(axis=-1, epsilon=1e-05)

From 837286dc3098de8d6950974b34f41cfdf5a0e4c2 Mon Sep 17 00:00:00 2001
From: innat
Date: Sun, 31 Mar 2024 14:16:23 +0600
Subject: [PATCH 91/94] update keras.Layer to keras.layers.Layer for keras2

---
 keras_cv/layers/video_swin_layers.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/keras_cv/layers/video_swin_layers.py b/keras_cv/layers/video_swin_layers.py
index a265052c23..7aad7391ec 100644
--- a/keras_cv/layers/video_swin_layers.py
+++ b/keras_cv/layers/video_swin_layers.py
@@ -212,7 +212,7 @@ def compute_mask(depth, height, width, window_size, shift_size):
     return attn_mask
 
 
-class MLP(keras.Layer):
+class MLP(keras.layers.Layer):
     """A Multilayer perceptron(MLP) layer.
 
     Args:
@@ -341,7 +341,7 @@ def get_config(self):
         return config
 
 
-class VideoSwinPatchMerging(keras.Layer):
+class VideoSwinPatchMerging(keras.layers.Layer):
     """Patch Merging Layer in Video Swin Transformer models.
 
     This layer performs a downsampling step by merging four neighboring patches

From 6d44ecac284848a1bc17be6f3d8013d0278ffbbc Mon Sep 17 00:00:00 2001
From: innat
Date: Sun, 31 Mar 2024 17:36:20 +0600
Subject: [PATCH 92/94] add window_size param to aliases

---
 keras_cv/models/backbones/video_swin/video_swin_aliases.py | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/keras_cv/models/backbones/video_swin/video_swin_aliases.py b/keras_cv/models/backbones/video_swin/video_swin_aliases.py
index 84233b0127..161ba0bbb4 100644
--- a/keras_cv/models/backbones/video_swin/video_swin_aliases.py
+++ b/keras_cv/models/backbones/video_swin/video_swin_aliases.py
@@ -48,6 +48,7 @@ def __new__(
         embed_dim=96,
         depths=[2, 2, 6, 2],
         num_heads=[3, 6, 12, 24],
+        window_size=[8, 7, 7],
         include_rescaling=True,
         **kwargs,
     ):
@@ -56,6 +57,7 @@ def __new__(
             "embed_dim": embed_dim,
             "depths": depths,
             "num_heads": num_heads,
+            "window_size": window_size,
             "include_rescaling": include_rescaling,
         }
     )
@@ -83,6 +85,7 @@ def __new__(
         embed_dim=96,
         depths=[2, 2, 18, 2],
         num_heads=[3, 6, 12, 24],
+        window_size=[8, 7, 7],
         include_rescaling=True,
         **kwargs,
     ):
@@ -91,6 +94,7 @@ def __new__(
             "embed_dim": embed_dim,
             "depths": depths,
             "num_heads": num_heads,
+            "window_size": window_size,
             "include_rescaling": include_rescaling,
         }
     )
@@ -118,6 +122,7 @@ def __new__(
         embed_dim=128,
         depths=[2, 2, 18, 2],
         num_heads=[4, 8, 16, 32],
+        window_size=[8, 7, 7],
         include_rescaling=True,
         **kwargs,
     ):
@@ -126,6 +131,7 @@ def __new__(
             "embed_dim": embed_dim,
             "depths": depths,
             "num_heads": num_heads,
+            "window_size": window_size,
             "include_rescaling": include_rescaling,
         }
     )

From f5dce04d92cfeb20543a0bb69a549f0303473a42 Mon Sep 17 00:00:00 2001
From: innat
Date: Wed, 3 Apr 2024 05:30:01 +0600
Subject: [PATCH 93/94] move video swin layers to model specific directory

---
 keras_cv/models/backbones/video_swin/video_swin_backbone.py | 6 +++---
 .../backbones/video_swin}/video_swin_layers.py              | 0
 .../backbones/video_swin}/video_swin_layers_test.py         | 6 +++---
 3 files changed, 6 insertions(+), 6 deletions(-)
 rename keras_cv/{layers => models/backbones/video_swin}/video_swin_layers.py (100%)
 rename keras_cv/{layers => models/backbones/video_swin}/video_swin_layers_test.py (91%)

diff --git 
a/keras_cv/models/backbones/video_swin/video_swin_backbone.py b/keras_cv/models/backbones/video_swin/video_swin_backbone.py index 0949d76071..c456e4fb74 100644 --- a/keras_cv/models/backbones/video_swin/video_swin_backbone.py +++ b/keras_cv/models/backbones/video_swin/video_swin_backbone.py @@ -19,9 +19,9 @@ from keras_cv.api_export import keras_cv_export from keras_cv.backend import keras -from keras_cv.layers.video_swin_layers import VideoSwinBasicLayer -from keras_cv.layers.video_swin_layers import VideoSwinPatchingAndEmbedding -from keras_cv.layers.video_swin_layers import VideoSwinPatchMerging +from keras_cv.models.backbones.video_swin.video_swin_layers import VideoSwinBasicLayer +from keras_cv.models.backbones.video_swin.video_swin_layers import VideoSwinPatchingAndEmbedding +from keras_cv.models.backbones.video_swin.video_swin_layers import VideoSwinPatchMerging from keras_cv.models import utils from keras_cv.models.backbones.backbone import Backbone from keras_cv.models.backbones.video_swin.video_swin_backbone_presets import ( # noqa: E501 diff --git a/keras_cv/layers/video_swin_layers.py b/keras_cv/models/backbones/video_swin/video_swin_layers.py similarity index 100% rename from keras_cv/layers/video_swin_layers.py rename to keras_cv/models/backbones/video_swin/video_swin_layers.py diff --git a/keras_cv/layers/video_swin_layers_test.py b/keras_cv/models/backbones/video_swin/video_swin_layers_test.py similarity index 91% rename from keras_cv/layers/video_swin_layers_test.py rename to keras_cv/models/backbones/video_swin/video_swin_layers_test.py index 0e9e71d00f..b8dc784ed1 100644 --- a/keras_cv/layers/video_swin_layers_test.py +++ b/keras_cv/models/backbones/video_swin/video_swin_layers_test.py @@ -14,9 +14,9 @@ from keras_cv.backend import ops -from keras_cv.layers.video_swin_layers import VideoSwinPatchingAndEmbedding -from keras_cv.layers.video_swin_layers import VideoSwinPatchMerging -from keras_cv.layers.video_swin_layers import VideoSwinWindowAttention +from keras_cv.models.backbones.video_swin.video_swin_layers import VideoSwinPatchingAndEmbedding +from keras_cv.models.backbones.video_swin.video_swin_layers import VideoSwinPatchMerging +from keras_cv.models.backbones.video_swin.video_swin_layers import VideoSwinWindowAttention from keras_cv.tests.test_case import TestCase From 0ba9fdf79f75c4b87758e12309549a438a32e1d9 Mon Sep 17 00:00:00 2001 From: innat Date: Wed, 3 Apr 2024 05:38:46 +0600 Subject: [PATCH 94/94] minor fix --- .../backbones/video_swin/video_swin_backbone.py | 12 +++++++++--- .../backbones/video_swin/video_swin_layers_test.py | 12 +++++++++--- 2 files changed, 18 insertions(+), 6 deletions(-) diff --git a/keras_cv/models/backbones/video_swin/video_swin_backbone.py b/keras_cv/models/backbones/video_swin/video_swin_backbone.py index c456e4fb74..9bb62eb385 100644 --- a/keras_cv/models/backbones/video_swin/video_swin_backbone.py +++ b/keras_cv/models/backbones/video_swin/video_swin_backbone.py @@ -19,9 +19,6 @@ from keras_cv.api_export import keras_cv_export from keras_cv.backend import keras -from keras_cv.models.backbones.video_swin.video_swin_layers import VideoSwinBasicLayer -from keras_cv.models.backbones.video_swin.video_swin_layers import VideoSwinPatchingAndEmbedding -from keras_cv.models.backbones.video_swin.video_swin_layers import VideoSwinPatchMerging from keras_cv.models import utils from keras_cv.models.backbones.backbone import Backbone from keras_cv.models.backbones.video_swin.video_swin_backbone_presets import ( # noqa: E501 @@ -30,6 +27,15 @@ 
from keras_cv.models.backbones.video_swin.video_swin_backbone_presets import ( # noqa: E501 backbone_presets_with_weights, ) +from keras_cv.models.backbones.video_swin.video_swin_layers import ( + VideoSwinBasicLayer, +) +from keras_cv.models.backbones.video_swin.video_swin_layers import ( + VideoSwinPatchingAndEmbedding, +) +from keras_cv.models.backbones.video_swin.video_swin_layers import ( + VideoSwinPatchMerging, +) from keras_cv.utils.python_utils import classproperty diff --git a/keras_cv/models/backbones/video_swin/video_swin_layers_test.py b/keras_cv/models/backbones/video_swin/video_swin_layers_test.py index b8dc784ed1..c0b540d1c0 100644 --- a/keras_cv/models/backbones/video_swin/video_swin_layers_test.py +++ b/keras_cv/models/backbones/video_swin/video_swin_layers_test.py @@ -14,9 +14,15 @@ from keras_cv.backend import ops -from keras_cv.models.backbones.video_swin.video_swin_layers import VideoSwinPatchingAndEmbedding -from keras_cv.models.backbones.video_swin.video_swin_layers import VideoSwinPatchMerging -from keras_cv.models.backbones.video_swin.video_swin_layers import VideoSwinWindowAttention +from keras_cv.models.backbones.video_swin.video_swin_layers import ( + VideoSwinPatchingAndEmbedding, +) +from keras_cv.models.backbones.video_swin.video_swin_layers import ( + VideoSwinPatchMerging, +) +from keras_cv.models.backbones.video_swin.video_swin_layers import ( + VideoSwinWindowAttention, +) from keras_cv.tests.test_case import TestCase
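
With PATCH 93 and PATCH 94 applied, the Video Swin layers are imported
from the backbone's own package rather than keras_cv.layers. A minimal
sketch of the relocated patch-embedding layer in isolation, mirroring
the shape expectation of test_patch_embedding_compute_output_shape
(the NumPy input is illustrative):

    import numpy as np

    from keras_cv.models.backbones.video_swin.video_swin_layers import (
        VideoSwinPatchingAndEmbedding,
    )

    # A (2, 4, 4) patch size tiles the clip along (depth, height, width),
    # so a (1, 16, 32, 32, 3) video becomes a (1, 8, 8, 8, 96) grid of
    # 96-dimensional tokens.
    embed = VideoSwinPatchingAndEmbedding(
        patch_size=(2, 4, 4), embed_dim=96, norm_layer=None
    )
    tokens = embed(np.ones((1, 16, 32, 32, 3), dtype="float32"))
    print(tokens.shape)  # (1, 8, 8, 8, 96)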