From f961e75cd755d3c6536359976386f0a950148654 Mon Sep 17 00:00:00 2001 From: innat Date: Fri, 1 Mar 2024 16:02:39 +0600 Subject: [PATCH 01/94] init video swin --- keras_cv/layers/video_swin_transformer_layers.py | 13 +++++++++++++ 1 file changed, 13 insertions(+) create mode 100644 keras_cv/layers/video_swin_transformer_layers.py diff --git a/keras_cv/layers/video_swin_transformer_layers.py b/keras_cv/layers/video_swin_transformer_layers.py new file mode 100644 index 0000000000..1756010b15 --- /dev/null +++ b/keras_cv/layers/video_swin_transformer_layers.py @@ -0,0 +1,13 @@ +# Copyright 2023 The KerasCV Authors +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. \ No newline at end of file From 578205ad780a89f06890485e10bec4e7706a7f91 Mon Sep 17 00:00:00 2001 From: innat Date: Fri, 1 Mar 2024 16:08:46 +0600 Subject: [PATCH 02/94] add: 3d window size computation --- .../layers/video_swin_transformer_layers.py | 144 +++++++++++++++++- 1 file changed, 143 insertions(+), 1 deletion(-) diff --git a/keras_cv/layers/video_swin_transformer_layers.py b/keras_cv/layers/video_swin_transformer_layers.py index 1756010b15..0b81362451 100644 --- a/keras_cv/layers/video_swin_transformer_layers.py +++ b/keras_cv/layers/video_swin_transformer_layers.py @@ -10,4 +10,146 @@ # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and -# limitations under the License. \ No newline at end of file +# limitations under the License. 
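+
+# The layers in this module implement the building blocks of the Video Swin
+# Transformer: a video feature map is split into non-overlapping 3D
+# (depth, height, width) windows, self-attention is computed within each
+# window, and the windows are then merged back into the original layout.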
+ +from functools import partial + +import numpy as np +from keras import layers + +from keras_cv.api_export import keras_cv_export +from keras_cv.backend import keras +from keras_cv.backend import ops +from keras_cv.layers import DropPath + + +def window_partition(x, window_size): + """ + Args: + x: (batch_size, depth, height, width, channel) + window_size (tuple[int]): window size + + Returns: + windows: (batch_size*num_windows, window_size*window_size, channel) + """ # noqa: E501 + + input_shape = ops.shape(x) + batch_size, depth, height, width, channel = ( + input_shape[0], + input_shape[1], + input_shape[2], + input_shape[3], + input_shape[4], + ) + + x = ops.reshape( + x, + [ + batch_size, + depth // window_size[0], + window_size[0], + height // window_size[1], + window_size[1], + width // window_size[2], + window_size[2], + channel, + ], + ) + + x = ops.transpose(x, [0, 1, 3, 5, 2, 4, 6, 7]) + windows = ops.reshape( + x, [-1, window_size[0] * window_size[1] * window_size[2], channel] + ) + + return windows + + +def window_reverse(windows, window_size, batch_size, depth, height, width): + """ + Args: + windows: (batch_size*num_windows, window_size, window_size, channel) + window_size (tuple[int]): Window size + height (int): Height of image + width (int): Width of image + + Returns: + x: (batch_size, depth, height, width, channel) + """ # noqa: E501 + x = ops.reshape( + windows, + [ + batch_size, + depth // window_size[0], + height // window_size[1], + width // window_size[2], + window_size[0], + window_size[1], + window_size[2], + -1, + ], + ) + x = ops.transpose(x, [0, 1, 4, 2, 5, 3, 6, 7]) + x = ops.reshape(x, [batch_size, depth, height, width, -1]) + return x + + +def get_window_size(x_size, window_size, shift_size=None): + """Computing window size based on: "Liu et al., + Swin Transformer: Hierarchical Vision Transformer using Shifted Windows + " + https://github.com/microsoft/Swin-Transformer + + Args: + x_size: input size. + window_size: local window size. + shift_size: window shifting size. 
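+            If an input dimension is smaller than or equal to its window
+            dimension, the window is clamped to the input size and the
+            corresponding shift size is set to 0.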
+ + Returns: + x: window_size, shift_size + """ # noqa: E501 + + use_window_size = list(window_size) + + if shift_size is not None: + use_shift_size = list(shift_size) + + for i in range(len(x_size)): + if x_size[i] <= window_size[i]: + use_window_size[i] = x_size[i] + if shift_size is not None: + use_shift_size[i] = 0 + + if shift_size is None: + return tuple(use_window_size) + else: + return tuple(use_window_size), tuple(use_shift_size) + + +def compute_mask(depth, height, width, window_size, shift_size): + img_mask = np.zeros((1, depth, height, width, 1)) + cnt = 0 + for d in ( + slice(-window_size[0]), + slice(-window_size[0], -shift_size[0]), + slice(-shift_size[0], None), + ): + for h in ( + slice(-window_size[1]), + slice(-window_size[1], -shift_size[1]), + slice(-shift_size[1], None), + ): + for w in ( + slice(-window_size[2]), + slice(-window_size[2], -shift_size[2]), + slice(-shift_size[2], None), + ): + img_mask[:, d, h, w, :] = cnt + cnt = cnt + 1 + mask_windows = window_partition(img_mask, window_size) + mask_windows = ops.squeeze(mask_windows, axis=-1) + attn_mask = ops.expand_dims(mask_windows, axis=1) - ops.expand_dims( + mask_windows, axis=2 + ) + attn_mask = ops.where(attn_mask != 0, -100.0, attn_mask) + attn_mask = ops.where(attn_mask == 0, 0.0, attn_mask) + return attn_mask \ No newline at end of file From 9817025257dc2b30ea1e7b90c3ed2d791bc6bd8f Mon Sep 17 00:00:00 2001 From: innat Date: Fri, 1 Mar 2024 16:10:26 +0600 Subject: [PATCH 03/94] add: mlp layer --- .../layers/video_swin_transformer_layers.py | 62 ++++++++++++++++++- 1 file changed, 61 insertions(+), 1 deletion(-) diff --git a/keras_cv/layers/video_swin_transformer_layers.py b/keras_cv/layers/video_swin_transformer_layers.py index 0b81362451..dfaac439ce 100644 --- a/keras_cv/layers/video_swin_transformer_layers.py +++ b/keras_cv/layers/video_swin_transformer_layers.py @@ -152,4 +152,64 @@ def compute_mask(depth, height, width, window_size, shift_size): ) attn_mask = ops.where(attn_mask != 0, -100.0, attn_mask) attn_mask = ops.where(attn_mask == 0, 0.0, attn_mask) - return attn_mask \ No newline at end of file + return attn_mask + + +class MLP(layers.Layer): + """A Multilayer perceptron(MLP) layer. + + Args: + hidden_dim (int): The number of units in the hidden layer. + output_dim (int): The number of units in the output layer. + drop_rate (float): Float between 0 and 1. Fraction of the + input units to drop. + activation (str): Activation to use in the hidden layers. + Default is `"gelu"`. 
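+
+    Example (a usage sketch; shapes are illustrative):
+
+    ```python
+    mlp = MLP(hidden_dim=384, output_dim=96, drop_rate=0.1)
+    y = mlp(np.ones((2, 8, 96)))  # (batch, tokens, channels) -> (2, 8, 96)
+    ```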
+ + References: + - [Video Swin Transformer](https://arxiv.org/abs/2106.13230) + - [Video Swin Transformer GitHub](https://github.com/SwinTransformer/Video-Swin-Transformer) + """ # noqa: E501 + + def __init__( + self, + hidden_dim, + output_dim, + drop_rate=0.0, + activation='gelu', + **kwargs + ): + super().__init__(**kwargs) + self.output_dim = output_dim + self.hidden_dim = hidden_dim + self._activation_identifier = activation + self.drop_rate = drop_rate + self.activation = layers.Activation(self._activation_identifier) + self.fc1 = layers.Dense(self.hidden_dim) + self.fc2 = layers.Dense(self.output_dim) + self.dropout = layers.Dropout(self.drop_rate) + + def build(self, input_shape): + self.fc1.build(input_shape) + self.fc2.build((*input_shape[1:-1], self.hidden_dim)) + self.built = True + + def call(self, x, training=None): + x = self.fc1(x) + x = self.activation(x) + x = self.dropout(x, training=training) + x = self.fc2(x) + x = self.dropout(x, training=training) + return x + + def get_config(self): + config = super().get_config() + config.update( + { + "output_dim": self.output_dim, + "hidden_dim": self.hidden_dim, + "drop_rate": self.drop_rate, + 'activation': self._activation_identifier + } + ) + return config \ No newline at end of file From 3343db1a8f17a9206d715c685c615a0b788dbbed Mon Sep 17 00:00:00 2001 From: innat Date: Fri, 1 Mar 2024 16:18:17 +0600 Subject: [PATCH 04/94] add: patch embedding layer --- .../layers/video_swin_transformer_layers.py | 88 ++++++++++++++++++- 1 file changed, 87 insertions(+), 1 deletion(-) diff --git a/keras_cv/layers/video_swin_transformer_layers.py b/keras_cv/layers/video_swin_transformer_layers.py index dfaac439ce..08a3bef735 100644 --- a/keras_cv/layers/video_swin_transformer_layers.py +++ b/keras_cv/layers/video_swin_transformer_layers.py @@ -170,7 +170,7 @@ class MLP(layers.Layer): - [Video Swin Transformer](https://arxiv.org/abs/2106.13230) - [Video Swin Transformer GitHub](https://github.com/SwinTransformer/Video-Swin-Transformer) """ # noqa: E501 - + def __init__( self, hidden_dim, @@ -212,4 +212,90 @@ def get_config(self): 'activation': self._activation_identifier } ) + return config + + +class PatchEmbed3D(keras.Model): + """Video to Patch Embedding layer. + + Args: + patch_size (int): Patch token size. Default: (2,4,4). + embed_dim (int): Number of linear projection output channels. Default: 96. + norm_layer (keras.layers, optional): Normalization layer. 
Default: None + + References: + - [Video Swin Transformer](https://arxiv.org/abs/2106.13230) + - [Video Swin Transformer GitHub](https://github.com/SwinTransformer/Video-Swin-Transformer) + """ # noqa: E501 + + def __init__( + self, + patch_size=(2, 4, 4), + embed_dim=96, + norm_layer=None, + **kwargs + ): + super().__init__(**kwargs) + self.patch_size = patch_size + self.embed_dim = embed_dim + self.norm_layer = norm_layer + + def _compute_padding(self, dim, patch_size): + pad_amount = patch_size - (dim % patch_size) + return [ + 0, pad_amount if pad_amount != patch_size else 0 + ] + + def build(self, input_shape): + self.pads = [ + [0, 0], + self._compute_padding(input_shape[1], self.patch_size[0]), + self._compute_padding(input_shape[2], self.patch_size[1]), + self._compute_padding(input_shape[3], self.patch_size[2]), + [0, 0] + ] + + self.proj = layers.Conv3D( + self.embed_dim, + kernel_size=self.patch_size, + strides=self.patch_size, + name='embed_proj' + ) + self.proj.build((None, None, None, None, input_shape[-1])) + + self.norm = None + if self.norm_layer is not None: + self.norm = self.norm_layer( + axis=-1, epsilon=1e-5, name='embed_norm' + ) + self.norm.build( + (None, None, None, None, self.embed_dim) + ) + self.built = True + + def call(self, x): + x = ops.pad(x, self.pads) + x = self.proj(x) + + if self.norm is not None: + x = self.norm(x) + + return x + + def compute_output_shape(self, input_shape): + spatial_dims = [ + (dim - self.patch_size[i]) // self.patch_size[i] + 1 + for i, dim in enumerate(input_shape[1:-1]) + ] + output_shape = (input_shape[0],) + tuple(spatial_dims) + (self.embed_dim,) + return output_shape + + def get_config(self): + config = super().get_config() + config.update( + { + "patch_size": self.patch_size, + "embed_dim": self.embed_dim, + } + ) return config \ No newline at end of file From 7ab5cab4a78c3e3046246bff2db09b77c963bf75 Mon Sep 17 00:00:00 2001 From: innat Date: Fri, 1 Mar 2024 16:22:42 +0600 Subject: [PATCH 05/94] add: patch merging layer --- .../layers/video_swin_transformer_layers.py | 72 +++++++++++++++++++ 1 file changed, 72 insertions(+) diff --git a/keras_cv/layers/video_swin_transformer_layers.py b/keras_cv/layers/video_swin_transformer_layers.py index 08a3bef735..66656a7f06 100644 --- a/keras_cv/layers/video_swin_transformer_layers.py +++ b/keras_cv/layers/video_swin_transformer_layers.py @@ -298,4 +298,76 @@ def get_config(self): "embed_dim": self.embed_dim, } ) + return config + + +class PatchMerging(layers.Layer): + """Patch Merging Layer. + + Args: + input_dim (int): Number of input channels. + norm_layer (keras.layers, optional): Normalization layer. + Default: LayerNormalization + + References: + - [Video Swin Transformer](https://arxiv.org/abs/2106.13230) + - [Video Swin Transformer GitHub](https://github.com/SwinTransformer/Video-Swin-Transformer) + """ # noqa: E501 + + def __init__( + self, + input_dim, + norm_layer=layers.LayerNormalization, + **kwargs + ): + super().__init__(**kwargs) + self.input_dim = input_dim + self.norm_layer = norm_layer + + def build(self, input_shape): + batch_size, depth, height, width, channel = input_shape + self.reduction = layers.Dense(2 * self.input_dim, use_bias=False) + self.reduction.build((batch_size, depth, height // 2, width // 2, 4 * channel)) + self.norm = self.norm_layer(axis=-1, epsilon=1e-5) + self.norm.build((batch_size, depth, height // 2, width // 2, 4 * channel)) + self.built=True + + def call(self, x): + """ The call function. 
+
+        Args:
+            x: Input feature, shape: (batch_size, depth, height, width, channel).
+        """
+        input_shape = ops.shape(x)
+        height, width = (
+            input_shape[2],
+            input_shape[3],
+        )
+
+        # pad height and width to even numbers if needed
+        paddings = [
+            [0, 0],
+            [0, 0],
+            [0, ops.mod(height, 2)],
+            [0, ops.mod(width, 2)],
+            [0, 0]
+        ]
+        x = ops.pad(x, paddings)
+
+        x0 = x[:, :, 0::2, 0::2, :]  # B D H/2 W/2 C
+        x1 = x[:, :, 1::2, 0::2, :]  # B D H/2 W/2 C
+        x2 = x[:, :, 0::2, 1::2, :]  # B D H/2 W/2 C
+        x3 = x[:, :, 1::2, 1::2, :]  # B D H/2 W/2 C
+        x = ops.concatenate([x0, x1, x2, x3], axis=-1)  # B D H/2 W/2 4*C
+        x = self.norm(x)
+        x = self.reduction(x)
+        return x
+
+    def get_config(self):
+        config = super().get_config()
+        config.update(
+            {
+                "input_dim": self.input_dim,
+            }
+        )
+        return config
\ No newline at end of file

From f70a61bb3546b77d0056273faf076c2133b3ca69 Mon Sep 17 00:00:00 2001
From: innat
Date: Fri, 1 Mar 2024 16:25:34 +0600
Subject: [PATCH 06/94] add: window attention layer

---
 .../layers/video_swin_transformer_layers.py  | 138 ++++++++++++++++++
 1 file changed, 138 insertions(+)

diff --git a/keras_cv/layers/video_swin_transformer_layers.py b/keras_cv/layers/video_swin_transformer_layers.py
index 66656a7f06..9ea37900a6 100644
--- a/keras_cv/layers/video_swin_transformer_layers.py
+++ b/keras_cv/layers/video_swin_transformer_layers.py
@@ -370,4 +370,142 @@ def get_config(self):
                 "input_dim": self.input_dim,
             }
         )
+        return config
+
+
+class WindowAttention3D(keras.Model):
+    """Window based multi-head self attention (W-MSA) module with relative position bias.
+    It supports both shifted and non-shifted windows.
+
+    Args:
+        input_dim (int): Number of input feature channels.
+        window_size (tuple[int]): The temporal length, height and width of the window.
+        num_heads (int): Number of attention heads.
+        qkv_bias (bool, optional): If True, add a learnable bias to query, key, value. Default: True
+        qk_scale (float | None, optional): Override default qk scale of head_dim ** -0.5 if set.
+        attn_drop_rate (float, optional): Dropout ratio of attention weight. Default: 0.0
+        proj_drop_rate (float, optional): Dropout ratio of output.
Default: 0.0 + + References: + - [Video Swin Transformer](https://arxiv.org/abs/2106.13230) + - [Video Swin Transformer GitHub](https://github.com/SwinTransformer/Video-Swin-Transformer) + """ # noqa: E501 + + def __init__( + self, + input_dim, + window_size, + num_heads, + qkv_bias=True, + qk_scale=None, + attn_drop_rate=0., + proj_drop_rate=0., + **kwargs + ): + super().__init__(**kwargs) + # variables + self.input_dim = input_dim + self.window_size = window_size + self.num_heads = num_heads + head_dim = input_dim // num_heads + self.qk_scale = qk_scale + self.scale = qk_scale or head_dim ** -0.5 + self.qkv_bias = qkv_bias + self.attn_drop_rate = attn_drop_rate + self.proj_drop_rate = proj_drop_rate + + def get_relative_position_index(self, window_depth, window_height, window_width): + y_y, z_z, x_x = ops.meshgrid( + ops.arange(window_width), ops.arange(window_depth), ops.arange(window_height) + ) + coords = ops.stack([z_z, y_y, x_x], axis=0) + coords_flatten = ops.reshape(coords, [3, -1]) + relative_coords = coords_flatten[:, :, None] - coords_flatten[:, None, :] + relative_coords = ops.transpose(relative_coords, axes=[1, 2, 0]) + z_z = (relative_coords[:, :, 0] + window_depth - 1) * (2 * window_height - 1) * (2 * window_width - 1) + x_x = (relative_coords[:, :, 1] + window_height - 1) * (2 * window_width - 1) + y_y = (relative_coords[:, :, 2] + window_width - 1) + relative_coords = ops.stack([z_z, x_x, y_y], axis=-1) + return ops.sum(relative_coords, axis=-1) + + def build(self, input_shape): + self.relative_position_bias_table = self.add_weight( + shape=( + (2 * self.window_size[0] - 1) * + (2 * self.window_size[1] - 1) * + (2 * self.window_size[2] - 1), + self.num_heads, + ), + initializer="zeros", + trainable=True, + name="relative_position_bias_table", + ) + self.relative_position_index = self.get_relative_position_index( + self.window_size[0], self.window_size[1], self.window_size[2] + ) + + # layers + self.qkv = layers.Dense(self.input_dim * 3, use_bias=self.qkv_bias) + self.attn_drop = layers.Dropout(self.attn_drop_rate) + self.proj = layers.Dense(self.input_dim) + self.proj_drop = layers.Dropout(self.proj_drop_rate) + self.qkv.build(input_shape) + self.proj.build(input_shape) + self.built = True + + def call(self, x, mask=None, training=None): + input_shape = ops.shape(x) + batch_size, depth, channel = ( + input_shape[0], + input_shape[1], + input_shape[2], + ) + + qkv = self.qkv(x) + qkv = ops.reshape(qkv, [batch_size, depth, 3, self.num_heads, channel // self.num_heads]) + qkv = ops.transpose(qkv, [2, 0, 3, 1, 4]) + q, k, v = ops.split(qkv, 3, axis=0) + + q = ops.squeeze(q, axis=0) * self.scale + k = ops.squeeze(k, axis=0) + v = ops.squeeze(v, axis=0) + attn = ops.matmul(q, ops.transpose(k, [0, 1, 3, 2])) + + rel_pos_bias = ops.take( + self.relative_position_bias_table, self.relative_position_index[:depth, :depth] + ) + rel_pos_bias = ops.reshape(rel_pos_bias, [depth, depth, -1]) + rel_pos_bias = ops.transpose(rel_pos_bias, [2, 0, 1]) + attn = attn + rel_pos_bias[None, ...] 
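+        # relative_position_bias_table holds one learned scalar per head for
+        # each possible (depth, height, width) offset between two tokens in a
+        # window; relative_position_index picks the right entry for every
+        # token pair, so the bias is shared across windows and batch elements.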
+ + if mask is not None: + mask_size = ops.shape(mask)[0] + mask = ops.cast(mask, dtype=attn.dtype) + attn = ops.reshape( + attn, + [batch_size // mask_size, mask_size, self.num_heads, depth, depth] + ) + mask[:, None, :, :] + attn = ops.reshape(attn, [-1, self.num_heads, depth, depth]) + + attn = keras.activations.softmax(attn, axis=-1) + attn = self.attn_drop(attn, training=training) + x = ops.matmul(attn, v) + x = ops.transpose(x, [0, 2, 1, 3]) + x = ops.reshape(x, [batch_size, depth, channel]) + x = self.proj(x) + x = self.proj_drop(x, training=training) + return x + + def get_config(self): + config = super().get_config() + config.update( + { + "input_dim": self.input_dim, + "window_size": self.window_size, + "num_heads": self.num_heads, + "qk_scale": self.qk_scale, + "qkv_bias": self.qkv_bias, + "attn_drop_rate": self.attn_drop_rate, + "proj_drop_rate": self.proj_drop_rate, + } + ) return config \ No newline at end of file From 5472fc655c18a681573aaeb23a65e357a97b561f Mon Sep 17 00:00:00 2001 From: innat Date: Fri, 1 Mar 2024 16:28:56 +0600 Subject: [PATCH 07/94] add: basic layer for video swin --- .../layers/video_swin_transformer_layers.py | 163 ++++++++++++++++++ 1 file changed, 163 insertions(+) diff --git a/keras_cv/layers/video_swin_transformer_layers.py b/keras_cv/layers/video_swin_transformer_layers.py index 9ea37900a6..f5c93f18a6 100644 --- a/keras_cv/layers/video_swin_transformer_layers.py +++ b/keras_cv/layers/video_swin_transformer_layers.py @@ -508,4 +508,167 @@ def get_config(self): "proj_drop_rate": self.proj_drop_rate, } ) + return config + + +class BasicLayer(keras.Model): + """A basic Swin Transformer layer for one stage. + + Args: + input_dim (int): Number of feature channels + depth (int): Depths of this stage. + num_heads (int): Number of attention head. + window_size (tuple[int]): Local window size. Default: (1,7,7). + mlp_ratio (float): Ratio of mlp hidden dim to embedding dim. Default: 4. + qkv_bias (bool, optional): If True, add a learnable bias to query, key, value. Default: True + qk_scale (float | None, optional): Override default qk scale of head_dim ** -0.5 if set. + drop (float, optional): Dropout rate. Default: 0.0 + attn_drop (float, optional): Attention dropout rate. Default: 0.0 + drop_path (float | tuple[float], optional): Stochastic depth rate. Default: 0.0 + norm_layer (keras.layers, optional): Normalization layer. Default: LayerNormalization + downsample (keras.layers | None, optional): Downsample layer at the end of the layer. 
Default: None + + References: + - [Video Swin Transformer](https://arxiv.org/abs/2106.13230) + - [Video Swin Transformer GitHub](https://github.com/SwinTransformer/Video-Swin-Transformer) + """ # noqa: E501 + + def __init__( + self, + input_dim, + depth, + num_heads, + window_size=(1,7,7), + mlp_ratio=4., + qkv_bias=False, + qk_scale=None, + drop_rate=0., + attn_drop_rate=0., + drop_path_rate=0., + norm_layer=partial(layers.LayerNormalization, epsilon=1e-05), + downsample=None, + **kwargs + ): + super().__init__(**kwargs) + self.input_dim = input_dim + self.num_heads = num_heads + self.window_size = window_size + self.mlp_ratio = mlp_ratio + self.shift_size = tuple([i // 2 for i in window_size]) + self.depth = depth + self.qkv_bias = qkv_bias + self.qk_scale = qk_scale + self.drop_rate = drop_rate + self.attn_drop_rate = attn_drop_rate + self.drop_path_rate = drop_path_rate + self.norm_layer = norm_layer + self.downsample = downsample + + def _compute_dim_padded(self, input_dim, window_dim_size): + input_dim = ops.cast(input_dim, dtype="float32") + window_dim_size = ops.cast(window_dim_size, dtype="float32") + return ops.cast( + ops.ceil(input_dim / window_dim_size) * window_dim_size, + "int32" + ) + + def build(self, input_shape): + window_size, shift_size = get_window_size( + input_shape[1:-1], self.window_size, self.shift_size + ) + Dp = self._compute_dim_padded(input_shape[1], window_size[0]) + Hp = self._compute_dim_padded(input_shape[2], window_size[1]) + Wp = self._compute_dim_padded(input_shape[3], window_size[2]) + self.attn_mask = compute_mask( + Dp, Hp, Wp, window_size, shift_size + ) + + # build blocks + self.blocks = [ + SwinTransformerBlock3D( + self.input_dim, + num_heads=self.num_heads, + window_size=self.window_size, + shift_size=(0,0,0) if (i % 2 == 0) else self.shift_size, + mlp_ratio=self.mlp_ratio, + qkv_bias=self.qkv_bias, + qk_scale=self.qk_scale, + drop_rate=self.drop_rate, + attn_drop_rate=self.attn_drop_rate, + drop_path_rate=self.drop_path_rate[i] if isinstance(self.drop_path_rate, list) else self.drop_path_rate, + norm_layer=self.norm_layer, + ) + for i in range(self.depth) + ] + + if self.downsample is not None: + self.downsample = self.downsample(input_dim=self.input_dim, norm_layer=self.norm_layer) + self.downsample.build(input_shape) + + for i in range(self.depth): + self.blocks[i].build(input_shape) + + self.built = True + + + def compute_output_shape(self, input_shape): + window_size, _ = get_window_size( + input_shape[1:-1], self.window_size, self.shift_size + ) + depth_p = self.compute_dim_padded(input_shape[1], window_size[0]) + height_p = self.compute_dim_padded(input_shape[2], window_size[1]) + width_p = self.compute_dim_padded(input_shape[3], window_size[2]) + + if self.downsample is not None: + output_shape = ( + input_shape[0], depth_p, height_p // 2, width_p // 2, 2*self.input_dim + ) + return output_shape + + return input_shape + + def call(self, x, training=None): + input_shape = ops.shape(x) + B,D,H,W,C = ( + input_shape[0], + input_shape[1], + input_shape[2], + input_shape[3], + input_shape[4], + ) + + for blk in self.blocks: + x = blk( + x, + self.attn_mask, + training=training + ) + + x = ops.reshape( + x, [B, D, H, W, -1] + ) + + if self.downsample is not None: + x = self.downsample(x) + + return x + + + def get_config(self): + config = super().get_config() + config.update( + { + "input_dim": self.input_dim, + "window_size": self.window_size, + "num_heads": self.num_heads, + "mlp_ratio": self.mlp_ratio, + "shift_size": self.shift_size, + 
"depth": self.depth, + "qkv_bias": self.qkv_bias, + "qk_scale": self.qk_scale, + "drop": self.drop, + "attn_drop": self.attn_drop, + "drop_path": self.drop_path + } + ) return config \ No newline at end of file From 76d444bbbd8bca3c52967b8096b557ee1c2bcd26 Mon Sep 17 00:00:00 2001 From: innat Date: Fri, 1 Mar 2024 16:32:12 +0600 Subject: [PATCH 08/94] update: basic layer for video swin --- .../layers/video_swin_transformer_layers.py | 20 ++++++++++--------- 1 file changed, 11 insertions(+), 9 deletions(-) diff --git a/keras_cv/layers/video_swin_transformer_layers.py b/keras_cv/layers/video_swin_transformer_layers.py index f5c93f18a6..34315f2fd4 100644 --- a/keras_cv/layers/video_swin_transformer_layers.py +++ b/keras_cv/layers/video_swin_transformer_layers.py @@ -576,11 +576,11 @@ def build(self, input_shape): window_size, shift_size = get_window_size( input_shape[1:-1], self.window_size, self.shift_size ) - Dp = self._compute_dim_padded(input_shape[1], window_size[0]) - Hp = self._compute_dim_padded(input_shape[2], window_size[1]) - Wp = self._compute_dim_padded(input_shape[3], window_size[2]) + depth_pad = self._compute_dim_padded(input_shape[1], window_size[0]) + height_pad = self._compute_dim_padded(input_shape[2], window_size[1]) + width_pad = self._compute_dim_padded(input_shape[3], window_size[2]) self.attn_mask = compute_mask( - Dp, Hp, Wp, window_size, shift_size + depth_pad, height_pad, width_pad, window_size, shift_size ) # build blocks @@ -602,7 +602,9 @@ def build(self, input_shape): ] if self.downsample is not None: - self.downsample = self.downsample(input_dim=self.input_dim, norm_layer=self.norm_layer) + self.downsample = self.downsample( + input_dim=self.input_dim, norm_layer=self.norm_layer + ) self.downsample.build(input_shape) for i in range(self.depth): @@ -629,7 +631,7 @@ def compute_output_shape(self, input_shape): def call(self, x, training=None): input_shape = ops.shape(x) - B,D,H,W,C = ( + batch_size, depth, height, width, channel = ( input_shape[0], input_shape[1], input_shape[2], @@ -637,15 +639,15 @@ def call(self, x, training=None): input_shape[4], ) - for blk in self.blocks: - x = blk( + for block in self.blocks: + x = block( x, self.attn_mask, training=training ) x = ops.reshape( - x, [B, D, H, W, -1] + x, [batch_size, depth, height, width, -1] ) if self.downsample is not None: From 715b8a39fdc0cef8f0beb3faf9d0ea1101601ff9 Mon Sep 17 00:00:00 2001 From: innat Date: Fri, 1 Mar 2024 16:37:10 +0600 Subject: [PATCH 09/94] add: swin blocks for video swin --- .../layers/video_swin_transformer_layers.py | 210 ++++++++++++++++++ 1 file changed, 210 insertions(+) diff --git a/keras_cv/layers/video_swin_transformer_layers.py b/keras_cv/layers/video_swin_transformer_layers.py index 34315f2fd4..9c78397736 100644 --- a/keras_cv/layers/video_swin_transformer_layers.py +++ b/keras_cv/layers/video_swin_transformer_layers.py @@ -673,4 +673,214 @@ def get_config(self): "drop_path": self.drop_path } ) + return config + + +class SwinTransformerBlock3D(keras.Model): + """Swin Transformer Block. + + Args: + input_dim (int): Number of feature channels. + num_heads (int): Number of attention heads. + window_size (tuple[int]): Window size. + shift_size (tuple[int]): Shift size for SW-MSA. + mlp_ratio (float): Ratio of mlp hidden dim to embedding dim. + qkv_bias (bool, optional): If True, add a learnable bias to query, key, value. Default: True + qk_scale (float | None, optional): Override default qk scale of head_dim ** -0.5 if set. + drop (float, optional): Dropout rate. 
Default: 0.0
+        attn_drop (float, optional): Attention dropout rate. Default: 0.0
+        drop_path (float, optional): Stochastic depth rate. Default: 0.0
+        activation (str, optional): Activation to use in the MLP. Default: "gelu"
+        norm_layer (keras.layers, optional): Normalization layer. Default: LayerNormalization
+
+    References:
+        - [Video Swin Transformer](https://arxiv.org/abs/2106.13230)
+        - [Video Swin Transformer GitHub](https://github.com/SwinTransformer/Video-Swin-Transformer)
+    """  # noqa: E501
+
+    def __init__(
+        self,
+        input_dim,
+        num_heads,
+        window_size=(2, 7, 7),
+        shift_size=(0, 0, 0),
+        mlp_ratio=4.,
+        qkv_bias=True,
+        qk_scale=None,
+        drop_rate=0.,
+        attn_drop_rate=0.,
+        drop_path_rate=0.,
+        activation='gelu',
+        norm_layer=layers.LayerNormalization,
+        **kwargs
+    ):
+        super().__init__(**kwargs)
+        # variables
+        self.input_dim = input_dim
+        self.num_heads = num_heads
+        self.window_size = window_size
+        self.shift_size = shift_size
+        self.mlp_ratio = mlp_ratio
+        self.qkv_bias = qkv_bias
+        self.qk_scale = qk_scale
+        self.drop_rate = drop_rate
+        self.attn_drop_rate = attn_drop_rate
+        self.drop_path_rate = drop_path_rate
+        self.mlp_hidden_dim = int(input_dim * mlp_ratio)
+        self.norm_layer = norm_layer
+        self._activation_identifier = activation
+
+        for i, (shift, window) in enumerate(zip(self.shift_size, self.window_size)):
+            if not (0 <= shift < window):
+                raise ValueError(
+                    f"shift_size[{i}] must be in the range 0 to window_size[{i}]"
+                )
+
+    def build(self, input_shape):
+        self.window_size, self.shift_size = get_window_size(
+            input_shape[1:-1], self.window_size, self.shift_size
+        )
+
+        self.apply_cyclic_shift = False
+        if any(i > 0 for i in self.shift_size):
+            self.apply_cyclic_shift = True
+
+        # layers
+        self.drop_path = (
+            DropPath(self.drop_path_rate)
+            if self.drop_path_rate > 0.
+            else layers.Identity()
+        )
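+        # Stochastic depth: DropPath randomly skips this block's residual
+        # branch for some samples during training; at rate 0.0 it reduces to
+        # an identity op.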
+
+        self.norm1 = self.norm_layer(axis=-1, epsilon=1e-05)
+        self.norm1.build(input_shape)
+
+        self.attn = WindowAttention3D(
+            self.input_dim,
+            window_size=self.window_size,
+            num_heads=self.num_heads,
+            qkv_bias=self.qkv_bias,
+            qk_scale=self.qk_scale,
+            attn_drop_rate=self.attn_drop_rate,
+            proj_drop_rate=self.drop_rate
+        )
+        self.attn.build((None, None, self.input_dim))
+
+        self.norm2 = self.norm_layer(axis=-1, epsilon=1e-05)
+        self.norm2.build((*input_shape[1:-1], self.input_dim))
+
+        self.mlp = MLP(
+            output_dim=self.input_dim,
+            hidden_dim=self.mlp_hidden_dim,
+            activation=self._activation_identifier,
+            drop_rate=self.drop_rate
+        )
+        self.mlp.build((*input_shape[1:-1], self.input_dim))
+        self.built = True
+
+    def first_forward(self, x, mask_matrix, training):
+        input_shape = ops.shape(x)
+        batch_size, depth, height, width, channel = (
+            input_shape[0],
+            input_shape[1],
+            input_shape[2],
+            input_shape[3],
+            input_shape[4],
+        )
+        window_size, shift_size = self.window_size, self.shift_size
+        x = self.norm1(x)
+
+        # pad feature maps to multiples of window size
+        pad_l = pad_t = pad_d0 = 0
+        pad_d1 = ops.mod(-depth + window_size[0], window_size[0])
+        pad_b = ops.mod(-height + window_size[1], window_size[1])
+        pad_r = ops.mod(-width + window_size[2], window_size[2])
+        paddings = [[0, 0], [pad_d0, pad_d1], [pad_t, pad_b], [pad_l, pad_r], [0, 0]]
+        x = ops.pad(x, paddings)
+
+        input_shape = ops.shape(x)
+        depth_p, height_p, width_p = (
+            input_shape[1],
+            input_shape[2],
+            input_shape[3],
+        )
+
+        # cyclic shift
+        if self.apply_cyclic_shift:
+            shifted_x = ops.roll(
+                x,
+                shift=(-shift_size[0], -shift_size[1], -shift_size[2]),
+                axis=(1, 2, 3)
+            )
+            attn_mask = mask_matrix
+        else:
+            shifted_x = x
+            attn_mask = None
+
+        # partition windows
+        x_windows = window_partition(shifted_x, window_size)
+
+        # run attention within each window
+        attn_windows = self.attn(
+            x_windows, mask=attn_mask, training=training
+        )
+
+        # reverse the swin windows
+        shifted_x = window_reverse(
+            attn_windows, window_size, batch_size, depth_p, height_p, width_p
+        )
+
+        # reverse cyclic shift
+        if self.apply_cyclic_shift:
+            x = ops.roll(
+                shifted_x,
+                shift=(shift_size[0], shift_size[1], shift_size[2]),
+                axis=(1, 2, 3)
+            )
+        else:
+            x = shifted_x
+
+        # crop back to the original size if padding was applied
+        do_pad = ops.logical_or(
+            ops.greater(pad_d1, 0),
+            ops.logical_or(ops.greater(pad_r, 0), ops.greater(pad_b, 0))
+        )
+        x = ops.cond(
+            do_pad,
+            lambda: x[:, :depth, :height, :width, :],
+            lambda: x
+        )
+
+        return x
+
+    def second_forward(self, x, training):
+        x = self.norm2(x)
+        x = self.mlp(x)
+        x = self.drop_path(x, training=training)
+        return x
+
+    def call(self, x, mask_matrix=None, training=None):
+        shortcut = x
+        x = self.first_forward(
+            x, mask_matrix, training
+        )
+        x = shortcut + self.drop_path(x)
+        x = x + self.second_forward(x, training)
+        return x
+
+    def get_config(self):
+        config = super().get_config()
+        config.update(
+            {
+                "input_dim": self.input_dim,
+                "window_size": self.window_size,
+                "num_heads": self.num_heads,
+                "shift_size": self.shift_size,
+                "mlp_ratio": self.mlp_ratio,
+                "qkv_bias": self.qkv_bias,
+                "qk_scale": self.qk_scale,
+                "drop_rate": self.drop_rate,
+                "attn_drop_rate": self.attn_drop_rate,
+                "drop_path_rate": self.drop_path_rate,
+                "mlp_hidden_dim": self.mlp_hidden_dim,
+                "activation": self._activation_identifier
+            }
+        )
+        return config
\ No newline at end of file

From 3ca00424aefc9a6dbf8a6b6dff76db84348666ee Mon Sep 17 00:00:00 2001
From: innat
Date: Fri, 1 Mar 2024 17:58:26 +0600
Subject: [PATCH 10/94]
create and add: video swin backbone --- .../video_swin/video_swin_backbone.py | 164 ++++++++++++++++++ 1 file changed, 164 insertions(+) create mode 100644 keras_cv/models/backbones/video_swin/video_swin_backbone.py diff --git a/keras_cv/models/backbones/video_swin/video_swin_backbone.py b/keras_cv/models/backbones/video_swin/video_swin_backbone.py new file mode 100644 index 0000000000..8d40787afc --- /dev/null +++ b/keras_cv/models/backbones/video_swin/video_swin_backbone.py @@ -0,0 +1,164 @@ +# Copyright 2023 The KerasCV Authors +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import copy +import numpy as np +from functools import partial + +from keras import layers +from keras_cv.api_export import keras_cv_export +from keras_cv.backend import keras +from keras_cv.backend import ops +from keras_cv.models import utils +from keras_cv.models.backbones.backbone import Backbone +from keras_cv.models.backbones.vit_det.vit_det_backbone_presets import ( + backbone_presets, +) +from keras_cv.models.backbones.vit_det.vit_det_backbone_presets import ( + backbone_presets_with_weights, +) +from keras_cv.utils.python_utils import classproperty + + +@keras_cv_export("keras_cv.models.VideoSwinBackbone", package="keras_cv.models") +class VideoSwinBackbone(Backbone): + def __init__( + self, + *, + include_rescaling, + input_shape, + input_tensor, + embed_dim, + patch_size, + window_size, + mlp_ratio, + patch_norm, + drop_rate, + attn_drop_rate, + drop_path_rate, + depths, + num_heads, + qkv_bias, + qk_scale, + num_classes, + **kwargs + ): + + input_spec = utils.parse_model_inputs( + input_shape, input_tensor, name="videos" + ) + + # Check that the input video is well specified. + if input_spec.shape[-3] is None or input_spec.shape[-2] is None: + raise ValueError( + "Height and width of the video must be specified" + " in `input_shape`." + ) + if input_spec.shape[-3] != input_spec.shape[-2]: + raise ValueError( + "Input video must be square i.e. the height must" + " be equal to the width in the `input_shape`" + " tuple/tensor." 
+ ) + + x = input_spec + + if include_rescaling: + # Use common rescaling strategy across keras_cv + x = keras.layers.Rescaling(1.0 / 255.0)(x) + + norm_layer = partial(layers.LayerNormalization, epsilon=1e-05) + + x = PatchEmbed3D( + patch_size=patch_size, + embed_dim=embed_dim, + norm_layer=norm_layer if patch_norm else None, + name='PatchEmbed3D' + )(x) + + x = layers.Dropout(drop_rate, name='pos_drop')(x) + dpr = np.linspace(0., drop_path_rate, sum(depths)).tolist() + + num_layers = len(depths) + + for i in range(num_layers): + layer = BasicLayer( + input_dim=int(embed_dim * 2 ** i), + depth=depths[i], + num_heads=num_heads[i], + window_size=window_size, + mlp_ratio=mlp_ratio, + qkv_bias=qkv_bias, + qk_scale=qk_scale, + drop_rate=drop_rate, + attn_drop_rate=attn_drop_rate, + drop_path_rate=dpr[sum(depths[:i]):sum(depths[:i + 1])], + norm_layer=norm_layer, + downsample=PatchMerging if (i < num_layers - 1) else None, + name=f'BasicLayer{i + 1}' + ) + x = layer(x) + + x = norm_layer(axis=-1, epsilon=1e-05, name='norm')(x) + x = layers.GlobalAveragePooling3D(name='gap3d')(x) + output = layers.Dense( + num_classes, use_bias=True, name='head', dtype='float32' + )(x) + super().__init__(inputs=input_spec, outputs=output, **kwargs) + + self.embed_dim = embed_dim + self.patch_size = patch_size + self.window_size = window_size + self.mlp_ratio = mlp_ratio + self.norm_layer = norm_layer + self.patch_norm = patch_norm + self.drop_rate = drop_rate + self.attn_drop_rate = attn_drop_rate + self.drop_path_rate = drop_path_rate + self.num_layers = len(depths) + self.num_heads = num_heads + self.qkv_bias = qkv_bias + self.qk_scale = qk_scale + self.num_classes = num_classes + self.depths = depths + + def get_config(self): + config = super().get_config() + config.update({ + "embed_dim": self.embed_dim, + "patch_norm": self.patch_norm, + "window_size": self.window_size, + "patch_size": self.patch_size, + "mlp_ratio": self.mlp_ratio, + "drop_rate": self.drop_rate, + "drop_path_rate": self.drop_path_rate, + "attn_drop_rate": self.attn_drop_rate, + "depths": self.depths, + "num_heads": self.num_heads, + "qkv_bias": self.qkv_bias, + "qk_scale": self.qk_scale, + "num_classes": self.num_classes, + }) + return config + + @classproperty + def presets(cls): + """Dictionary of preset names and configurations.""" + return copy.deepcopy(backbone_presets) + + @classproperty + def presets_with_weights(cls): + """Dictionary of preset names and configurations that include + weights.""" + return copy.deepcopy(backbone_presets_with_weights) \ No newline at end of file From 3d845c5f8560d3dddc2887439ec5c1cdcd92379c Mon Sep 17 00:00:00 2001 From: innat Date: Fri, 1 Mar 2024 18:02:50 +0600 Subject: [PATCH 11/94] rename: video swin layers to model specific --- keras_cv/layers/video_swin_transformer_layers.py | 15 +++++++++------ 1 file changed, 9 insertions(+), 6 deletions(-) diff --git a/keras_cv/layers/video_swin_transformer_layers.py b/keras_cv/layers/video_swin_transformer_layers.py index 9c78397736..7e87d27f1f 100644 --- a/keras_cv/layers/video_swin_transformer_layers.py +++ b/keras_cv/layers/video_swin_transformer_layers.py @@ -215,8 +215,11 @@ def get_config(self): return config -class PatchEmbed3D(keras.Model): - """Video to Patch Embedding layer. +@keras_cv_export( + "keras_cv.layers.VideoSwinPatchingAndEmbedding", package="keras_cv.layers" +) +class VideoSwinPatchingAndEmbedding(keras.Model): + """Video to Patch Embedding layer for Video Swin Model. Args: patch_size (int): Patch token size. Default: (2,4,4). 
@@ -301,7 +304,7 @@ def get_config(self): return config -class PatchMerging(layers.Layer): +class VideoSwinPatchMerging(layers.Layer): """Patch Merging Layer. Args: @@ -373,7 +376,7 @@ def get_config(self): return config -class WindowAttention3D(keras.Model): +class VideoSwinWindowAttention(keras.Model): """Window based multi-head self attention (W-MSA) module with relative position bias. It supports both of shifted and non-shifted window. @@ -511,7 +514,7 @@ def get_config(self): return config -class BasicLayer(keras.Model): +class VideoSwinBasicLayer(keras.Model): """A basic Swin Transformer layer for one stage. Args: @@ -676,7 +679,7 @@ def get_config(self): return config -class SwinTransformerBlock3D(keras.Model): +class VideoSwinTransformerBlock(keras.Model): """Swin Transformer Block. Args: From 1af8bd4fdccf5065dde1921f428c641ad3dd8a31 Mon Sep 17 00:00:00 2001 From: innat Date: Fri, 1 Mar 2024 18:08:44 +0600 Subject: [PATCH 12/94] update module import --- keras_cv/layers/__init__.py | 3 +++ ...swin_transformer_layers.py => video_swin_layers.py} | 4 ++-- .../models/backbones/video_swin/video_swin_backbone.py | 10 +++++++--- 3 files changed, 12 insertions(+), 5 deletions(-) rename keras_cv/layers/{video_swin_transformer_layers.py => video_swin_layers.py} (99%) diff --git a/keras_cv/layers/__init__.py b/keras_cv/layers/__init__.py index 0bfa2aa8ec..957f5eda3c 100644 --- a/keras_cv/layers/__init__.py +++ b/keras_cv/layers/__init__.py @@ -141,3 +141,6 @@ from keras_cv.layers.vit_det_layers import WindowedTransformerEncoder from keras_cv.layers.vit_det_layers import WindowPartitioning from keras_cv.layers.vit_layers import PatchingAndEmbedding +from keras_cv.layers.video_swin_layers import VideoSwinPatchingAndEmbedding +from keras_cv.layers.video_swin_layers import VideoSwinBasicLayer +from keras_cv.layers.video_swin_layers import VideoSwinPatchMerging \ No newline at end of file diff --git a/keras_cv/layers/video_swin_transformer_layers.py b/keras_cv/layers/video_swin_layers.py similarity index 99% rename from keras_cv/layers/video_swin_transformer_layers.py rename to keras_cv/layers/video_swin_layers.py index 7e87d27f1f..4f30cb5c32 100644 --- a/keras_cv/layers/video_swin_transformer_layers.py +++ b/keras_cv/layers/video_swin_layers.py @@ -320,7 +320,7 @@ class VideoSwinPatchMerging(layers.Layer): def __init__( self, input_dim, - norm_layer=layers.LayerNormalization, + norm_layer=None, **kwargs ): super().__init__(**kwargs) @@ -548,7 +548,7 @@ def __init__( drop_rate=0., attn_drop_rate=0., drop_path_rate=0., - norm_layer=partial(layers.LayerNormalization, epsilon=1e-05), + norm_layer=None, downsample=None, **kwargs ): diff --git a/keras_cv/models/backbones/video_swin/video_swin_backbone.py b/keras_cv/models/backbones/video_swin/video_swin_backbone.py index 8d40787afc..26cfd10bba 100644 --- a/keras_cv/models/backbones/video_swin/video_swin_backbone.py +++ b/keras_cv/models/backbones/video_swin/video_swin_backbone.py @@ -30,6 +30,10 @@ ) from keras_cv.utils.python_utils import classproperty +from keras_cv.layers.video_swin_layers import VideoSwinBasicLayer +from keras_cv.layers.video_swin_layers import VideoSwinPatchingAndEmbedding +from keras_cv.layers.video_swin_layers import VideoSwinPatchMerging + @keras_cv_export("keras_cv.models.VideoSwinBackbone", package="keras_cv.models") class VideoSwinBackbone(Backbone): @@ -80,7 +84,7 @@ def __init__( norm_layer = partial(layers.LayerNormalization, epsilon=1e-05) - x = PatchEmbed3D( + x = VideoSwinPatchingAndEmbedding( patch_size=patch_size, 
embed_dim=embed_dim, norm_layer=norm_layer if patch_norm else None, @@ -93,7 +97,7 @@ def __init__( num_layers = len(depths) for i in range(num_layers): - layer = BasicLayer( + layer = VideoSwinBasicLayer( input_dim=int(embed_dim * 2 ** i), depth=depths[i], num_heads=num_heads[i], @@ -105,7 +109,7 @@ def __init__( attn_drop_rate=attn_drop_rate, drop_path_rate=dpr[sum(depths[:i]):sum(depths[:i + 1])], norm_layer=norm_layer, - downsample=PatchMerging if (i < num_layers - 1) else None, + downsample=VideoSwinPatchMerging if (i < num_layers - 1) else None, name=f'BasicLayer{i + 1}' ) x = layer(x) From ed2864d5f93898d6e29d150cbda06c74cf1ac19f Mon Sep 17 00:00:00 2001 From: innat Date: Fri, 1 Mar 2024 18:10:02 +0600 Subject: [PATCH 13/94] update module import --- keras_cv/layers/video_swin_layers.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/keras_cv/layers/video_swin_layers.py b/keras_cv/layers/video_swin_layers.py index 4f30cb5c32..3373f1bfcd 100644 --- a/keras_cv/layers/video_swin_layers.py +++ b/keras_cv/layers/video_swin_layers.py @@ -588,7 +588,7 @@ def build(self, input_shape): # build blocks self.blocks = [ - SwinTransformerBlock3D( + VideoSwinTransformerBlock( self.input_dim, num_heads=self.num_heads, window_size=self.window_size, From bf70fa92a091f6a6ecb74abde698bc82cd76c876 Mon Sep 17 00:00:00 2001 From: innat Date: Fri, 1 Mar 2024 18:20:19 +0600 Subject: [PATCH 14/94] set class method to private usage --- keras_cv/layers/video_swin_layers.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/keras_cv/layers/video_swin_layers.py b/keras_cv/layers/video_swin_layers.py index 3373f1bfcd..739fb2cf7d 100644 --- a/keras_cv/layers/video_swin_layers.py +++ b/keras_cv/layers/video_swin_layers.py @@ -620,9 +620,9 @@ def compute_output_shape(self, input_shape): window_size, _ = get_window_size( input_shape[1:-1], self.window_size, self.shift_size ) - depth_p = self.compute_dim_padded(input_shape[1], window_size[0]) - height_p = self.compute_dim_padded(input_shape[2], window_size[1]) - width_p = self.compute_dim_padded(input_shape[3], window_size[2]) + depth_p = self._compute_dim_padded(input_shape[1], window_size[0]) + height_p = self._compute_dim_padded(input_shape[2], window_size[1]) + width_p = self._compute_dim_padded(input_shape[3], window_size[2]) if self.downsample is not None: output_shape = ( @@ -754,7 +754,7 @@ def build(self, input_shape): self.norm1 = self.norm_layer(axis=-1, epsilon=1e-05) self.norm1.build(input_shape) - self.attn = WindowAttention3D( + self.attn = VideoSwinWindowAttention( self.input_dim, window_size=self.window_size, num_heads=self.num_heads, From eca5023cff8c834f51e440b0fe05464edee8d4d2 Mon Sep 17 00:00:00 2001 From: innat Date: Fri, 1 Mar 2024 18:26:15 +0600 Subject: [PATCH 15/94] set init params for backbone --- .../video_swin/video_swin_backbone.py | 38 ++++++++----------- 1 file changed, 15 insertions(+), 23 deletions(-) diff --git a/keras_cv/models/backbones/video_swin/video_swin_backbone.py b/keras_cv/models/backbones/video_swin/video_swin_backbone.py index 26cfd10bba..0b10f5d9db 100644 --- a/keras_cv/models/backbones/video_swin/video_swin_backbone.py +++ b/keras_cv/models/backbones/video_swin/video_swin_backbone.py @@ -19,7 +19,6 @@ from keras import layers from keras_cv.api_export import keras_cv_export from keras_cv.backend import keras -from keras_cv.backend import ops from keras_cv.models import utils from keras_cv.models.backbones.backbone import Backbone from 
keras_cv.models.backbones.vit_det.vit_det_backbone_presets import ( @@ -41,21 +40,20 @@ def __init__( self, *, include_rescaling, - input_shape, - input_tensor, - embed_dim, - patch_size, - window_size, - mlp_ratio, - patch_norm, - drop_rate, - attn_drop_rate, - drop_path_rate, - depths, - num_heads, - qkv_bias, - qk_scale, - num_classes, + input_shape=(32, 224, 224, 3), + input_tensor=None, + embed_dim=96, + patch_size=[2, 4, 4], + window_size=[8, 7, 7], + mlp_ratio=4.0, + patch_norm=True, + drop_rate=0.0, + attn_drop_rate=0.0, + drop_path_rate=0.2, + depths=[2, 2, 6, 2], + num_heads=[3, 6, 12, 24], + qkv_bias=True, + qk_scale=None, **kwargs ): @@ -115,11 +113,7 @@ def __init__( x = layer(x) x = norm_layer(axis=-1, epsilon=1e-05, name='norm')(x) - x = layers.GlobalAveragePooling3D(name='gap3d')(x) - output = layers.Dense( - num_classes, use_bias=True, name='head', dtype='float32' - )(x) - super().__init__(inputs=input_spec, outputs=output, **kwargs) + super().__init__(inputs=input_spec, outputs=x, **kwargs) self.embed_dim = embed_dim self.patch_size = patch_size @@ -134,7 +128,6 @@ def __init__( self.num_heads = num_heads self.qkv_bias = qkv_bias self.qk_scale = qk_scale - self.num_classes = num_classes self.depths = depths def get_config(self): @@ -152,7 +145,6 @@ def get_config(self): "num_heads": self.num_heads, "qkv_bias": self.qkv_bias, "qk_scale": self.qk_scale, - "num_classes": self.num_classes, }) return config From 420e2291e8afb0985aa951fd7578b49af9407368 Mon Sep 17 00:00:00 2001 From: innat Date: Fri, 1 Mar 2024 18:31:31 +0600 Subject: [PATCH 16/94] rm redundant imports --- .../video_swin/video_swin_backbone.py | 19 +------------------ 1 file changed, 1 insertion(+), 18 deletions(-) diff --git a/keras_cv/models/backbones/video_swin/video_swin_backbone.py b/keras_cv/models/backbones/video_swin/video_swin_backbone.py index 0b10f5d9db..4a069b78b7 100644 --- a/keras_cv/models/backbones/video_swin/video_swin_backbone.py +++ b/keras_cv/models/backbones/video_swin/video_swin_backbone.py @@ -21,12 +21,6 @@ from keras_cv.backend import keras from keras_cv.models import utils from keras_cv.models.backbones.backbone import Backbone -from keras_cv.models.backbones.vit_det.vit_det_backbone_presets import ( - backbone_presets, -) -from keras_cv.models.backbones.vit_det.vit_det_backbone_presets import ( - backbone_presets_with_weights, -) from keras_cv.utils.python_utils import classproperty from keras_cv.layers.video_swin_layers import VideoSwinBasicLayer @@ -146,15 +140,4 @@ def get_config(self): "qkv_bias": self.qkv_bias, "qk_scale": self.qk_scale, }) - return config - - @classproperty - def presets(cls): - """Dictionary of preset names and configurations.""" - return copy.deepcopy(backbone_presets) - - @classproperty - def presets_with_weights(cls): - """Dictionary of preset names and configurations that include - weights.""" - return copy.deepcopy(backbone_presets_with_weights) \ No newline at end of file + return config \ No newline at end of file From f73e25b35614a83112b51118419f622ad206b15d Mon Sep 17 00:00:00 2001 From: innat Date: Fri, 1 Mar 2024 18:36:42 +0600 Subject: [PATCH 17/94] add video swin layer test cases --- keras_cv/layers/video_swin_layers_test.py | 71 +++++++++++++++++++++++ 1 file changed, 71 insertions(+) create mode 100644 keras_cv/layers/video_swin_layers_test.py diff --git a/keras_cv/layers/video_swin_layers_test.py b/keras_cv/layers/video_swin_layers_test.py new file mode 100644 index 0000000000..81bedcaa1b --- /dev/null +++ 
b/keras_cv/layers/video_swin_layers_test.py @@ -0,0 +1,71 @@ +# Copyright 2023 The KerasCV Authors +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import pytest + +from keras_cv.backend import ops +from keras_cv.layers.video_swin_layers import VideoSwinPatchingAndEmbedding +from keras_cv.layers.video_swin_layers import VideoSwinWindowAttention +from keras_cv.tests.test_case import TestCase + + +class TestVideoSwinPatchingAndEmbedding(TestCase): + def test_patch_embedding_compute_output_shape(self): + patch_embedding_model = VideoSwinPatchingAndEmbedding( + patch_size=(2, 4, 4), embed_dim=96, norm_layer=None + ) + input_shape = (None, 16, 32, 32, 3) + output_shape = patch_embedding_model.compute_output_shape(input_shape) + expected_output_shape = (None, 8, 8, 8, 96) + self.assertEqual(output_shape, expected_output_shape) + + def test_patch_embedding_get_config(self): + patch_embedding_model = VideoSwinPatchingAndEmbedding( + patch_size=(4, 4, 4), embed_dim=96 + ) + config = patch_embedding_model.get_config() + assert isinstance(config, dict) + assert config["patch_size"] == (4, 4, 4) + assert config["embed_dim"] == 96 + + +class TestVideoSwinWindowAttention(TestCase): + @pytest.fixture + def window_attention_model(self): + return VideoSwinWindowAttention( + window_size=(2, 4, 4), + num_heads=8, + qkv_bias=True, + qk_scale=None, + attn_drop_rate=0.1, + proj_drop_rate=0.1, + ) + + def test_window_attention_output_shape(self, window_attention_model): + input_shape = (4, 10, 256) + input_array = ops.ones(input_shape) + output_shape = window_attention_model(input_array).shape + expected_output_shape = input_shape + self.assertEqual(output_shape, expected_output_shape) + + def test_window_attention_get_config(self, window_attention_model): + config = window_attention_model.get_config() + # Add assertions based on your specific requirements + assert isinstance(config, dict) + assert config["window_size"] == (2, 4, 4) + assert config["num_heads"] == 8 + assert config["qkv_bias"] is True + assert config["qk_scale"] is None + assert config["attn_drop_rate"] == 0.1 + assert config["proj_drop_rate"] == 0.1 \ No newline at end of file From 1ccf7ee04486c33747417363119ca37b833449c9 Mon Sep 17 00:00:00 2001 From: innat Date: Fri, 1 Mar 2024 19:13:06 +0600 Subject: [PATCH 18/94] add: videoswin backbone aliases --- .../models/backbones/video_swin/__init__.py | 13 +++ .../video_swin/video_swin_aliases.py | 82 +++++++++++++++++++ 2 files changed, 95 insertions(+) create mode 100644 keras_cv/models/backbones/video_swin/__init__.py create mode 100644 keras_cv/models/backbones/video_swin/video_swin_aliases.py diff --git a/keras_cv/models/backbones/video_swin/__init__.py b/keras_cv/models/backbones/video_swin/__init__.py new file mode 100644 index 0000000000..1756010b15 --- /dev/null +++ b/keras_cv/models/backbones/video_swin/__init__.py @@ -0,0 +1,13 @@ +# Copyright 2023 The KerasCV Authors +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with 
the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. \ No newline at end of file diff --git a/keras_cv/models/backbones/video_swin/video_swin_aliases.py b/keras_cv/models/backbones/video_swin/video_swin_aliases.py new file mode 100644 index 0000000000..2595011007 --- /dev/null +++ b/keras_cv/models/backbones/video_swin/video_swin_aliases.py @@ -0,0 +1,82 @@ +# Copyright 2023 The KerasCV Authors +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import copy + +from keras_cv.models.backbones.video_swin.video_swin_backbone import VideoSwinBackbone +from keras_cv.utils.python_utils import classproperty + + +ALIAS_DOCSTRING = """VideoSwin{size}Backbone model. + + Reference: + - [Video Swin Transformer](https://arxiv.org/abs/2106.13230) + - [Video Swin Transformer GitHub](https://github.com/SwinTransformer/Video-Swin-Transformer) + + For transfer learning use cases, make sure to read the + [guide to transfer learning & fine-tuning](https://keras.io/guides/transfer_learning/). 
+ + Examples: + ```python + input_data = np.ones(shape=(1, 32, 224, 224, 3)) + + # Randomly initialized backbone + model = VideoSwin{size}Backbone() + output = model(input_data) + ``` +""" # noqa: E501 + +class VideoSwinTBackbone(VideoSwinBackbone): + def __new__( + cls, + **kwargs, + ): + return VideoSwinBackbone.from_preset("videoswin_tiny", **kwargs) + + @classproperty + def presets_with_weights(cls): + """Dictionary of preset names and configurations that include + weights.""" + return cls.presets + +class VideoSwinSBackbone(VideoSwinBackbone): + def __new__( + cls, + **kwargs, + ): + return VideoSwinBackbone.from_preset("videoswin_small", **kwargs) + + @classproperty + def presets_with_weights(cls): + """Dictionary of preset names and configurations that include + weights.""" + return cls.presets + +class VideoSwinBBackbone(VideoSwinBackbone): + def __new__( + cls, + **kwargs, + ): + return VideoSwinBackbone.from_preset("videoswin_base", **kwargs) + + @classproperty + def presets_with_weights(cls): + """Dictionary of preset names and configurations that include + weights.""" + return cls.presets + + +setattr(VideoSwinTBackbone, "__doc__", ALIAS_DOCSTRING.format(size="T")) +setattr(VideoSwinSBackbone, "__doc__", ALIAS_DOCSTRING.format(size="S")) +setattr(VideoSwinBBackbone, "__doc__", ALIAS_DOCSTRING.format(size="B")) \ No newline at end of file From c5d5fa2590e97cf20546de8dfd53051103b0b985 Mon Sep 17 00:00:00 2001 From: innat Date: Fri, 1 Mar 2024 19:32:49 +0600 Subject: [PATCH 19/94] add: video swin backbone presets --- .../video_swin/video_swin_backbone_presets.py | 49 +++++++++++++++++++ 1 file changed, 49 insertions(+) create mode 100644 keras_cv/models/backbones/video_swin/video_swin_backbone_presets.py diff --git a/keras_cv/models/backbones/video_swin/video_swin_backbone_presets.py b/keras_cv/models/backbones/video_swin/video_swin_backbone_presets.py new file mode 100644 index 0000000000..7d7ebec73f --- /dev/null +++ b/keras_cv/models/backbones/video_swin/video_swin_backbone_presets.py @@ -0,0 +1,49 @@ +# Copyright 2023 The KerasCV Authors +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+"""Video Swin model preset configurations.""" + +backbone_presets_no_weights = { + "videoswin_tiny": { + "metadata": { + "description": ( + "Video Swin backbone " # TODO: update + ), + "params": 27_850_470, + "official_name": "VideoSwinT", + "path": "video_swin", + }, + }, + + "videoswin_small": { + "metadata": { + "description": ( + "Video Swin backbone " # TODO: update + ), + "params": 49_509_078, + "official_name": "VideoSwinS", + "path": "video_swin", + }, + }, + + "videoswin_base": { + "metadata": { + "description": ( + "Video Swin backbone " # TODO: update + ), + "params": 87_638_984, + "official_name": "VideoSwinB", + "path": "video_swin", + }, + }, +} \ No newline at end of file From 27b65967cd6d46940602f2f91a5b0f5e18430b9d Mon Sep 17 00:00:00 2001 From: innat Date: Fri, 1 Mar 2024 19:40:51 +0600 Subject: [PATCH 20/94] add: video swin backbone presets test --- .../video_swin_backbone_presets_test.py | 54 +++++++++++++++++++ 1 file changed, 54 insertions(+) create mode 100644 keras_cv/models/backbones/video_swin/video_swin_backbone_presets_test.py diff --git a/keras_cv/models/backbones/video_swin/video_swin_backbone_presets_test.py b/keras_cv/models/backbones/video_swin/video_swin_backbone_presets_test.py new file mode 100644 index 0000000000..9d48b475ca --- /dev/null +++ b/keras_cv/models/backbones/video_swin/video_swin_backbone_presets_test.py @@ -0,0 +1,54 @@ +# Copyright 2023 The KerasCV Authors +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Tests for loading pretrained model presets.""" + +import numpy as np +import pytest + +from keras_cv.models.backbones.video_swin.video_swin_backbone import VideoSwinBackbone +from keras_cv.models.backbones.video_swin.video_swin_aliases import VideoSwinTBackbone +from keras_cv.tests.test_case import TestCase + +@pytest.mark.large +class VideoSwinPresetSmokeTest(TestCase): + """ + A smoke test for VideoSwin presets we run continuously. + This only tests the smallest weights we have available. 
Run with: + `pytest keras_cv/models/backbones/video_swin/video_swin_backbone_presets_test.py --run_large` # noqa: E501 + """ + + def setUp(self): + self.input_batch = np.ones(shape=(1, 32, 224, 224, 3)) + + def test_applications_model_output(self): + model = VideoSwinBackbone() + model(self.input_batch) + + def test_applications_model_output_with_preset(self): + model = VideoSwinBackbone.from_preset("videoswin_tiny") + model(self.input_batch) + + def test_applications_model_predict(self): + model = VideoSwinTBackbone() + model.predict(self.input_batch) + + def test_preset_docstring(self): + """Check we did our docstring formatting correctly.""" + for name in VideoSwinBackbone.presets: + self.assertRegex(VideoSwinBackbone.from_preset.__doc__, name) + + def test_unknown_preset_error(self): + # Not a preset name + with self.assertRaises(ValueError): + VideoSwinBackbone.from_preset("videoswin_nonexistant") \ No newline at end of file From 814db52f5aeaa6982aa5c86ea6c987957970284c Mon Sep 17 00:00:00 2001 From: innat Date: Fri, 1 Mar 2024 19:42:18 +0600 Subject: [PATCH 21/94] update: video swin backbone presets test --- .../video_swin_backbone_presets_test.py | 20 ++++++++++++++++--- 1 file changed, 17 insertions(+), 3 deletions(-) diff --git a/keras_cv/models/backbones/video_swin/video_swin_backbone_presets_test.py b/keras_cv/models/backbones/video_swin/video_swin_backbone_presets_test.py index 9d48b475ca..77d80bb4d3 100644 --- a/keras_cv/models/backbones/video_swin/video_swin_backbone_presets_test.py +++ b/keras_cv/models/backbones/video_swin/video_swin_backbone_presets_test.py @@ -22,8 +22,7 @@ @pytest.mark.large class VideoSwinPresetSmokeTest(TestCase): - """ - A smoke test for VideoSwin presets we run continuously. + """A smoke test for VideoSwin presets we run continuously. This only tests the smallest weights we have available. Run with: `pytest keras_cv/models/backbones/video_swin/video_swin_backbone_presets_test.py --run_large` # noqa: E501 """ @@ -51,4 +50,19 @@ def test_preset_docstring(self): def test_unknown_preset_error(self): # Not a preset name with self.assertRaises(ValueError): - VideoSwinBackbone.from_preset("videoswin_nonexistant") \ No newline at end of file + VideoSwinBackbone.from_preset("videoswin_nonexistant") + + +@pytest.mark.extra_large +class VideoSwinPresetFullTest(TestCase): + """Test the full enumeration of our preset. + This tests every preset for VideoSwin and is only run manually. 
+ Run with: + `pytest keras_cv/models/backbones/video_swin/video_swin_backbone_presets_test.py --run_extra_large` # noqa: E501 + """ + + def test_load_ViTDet(self): + input_data = np.ones(shape=(1, 1024, 1024, 3)) + for preset in VideoSwinBackbone.presets: + model = VideoSwinBackbone.from_preset(preset) + model(input_data) \ No newline at end of file From cc6ac2126d06b1b77ab0c8879346243b877cbe29 Mon Sep 17 00:00:00 2001 From: innat Date: Fri, 1 Mar 2024 19:51:03 +0600 Subject: [PATCH 22/94] add: video classifier task --- .../models/classification/video_classifier.py | 146 ++++++++++++++++++ .../video_classifier_presets.py | 14 ++ 2 files changed, 160 insertions(+) create mode 100644 keras_cv/models/classification/video_classifier.py create mode 100644 keras_cv/models/classification/video_classifier_presets.py diff --git a/keras_cv/models/classification/video_classifier.py b/keras_cv/models/classification/video_classifier.py new file mode 100644 index 0000000000..2d5b7f61ea --- /dev/null +++ b/keras_cv/models/classification/video_classifier.py @@ -0,0 +1,146 @@ +# Copyright 2023 The KerasCV Authors +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Video classifier model using pooling and dense layers.""" + +import copy + +from keras_cv.api_export import keras_cv_export +from keras_cv.backend import keras +from keras_cv.models.task import Task +from keras_cv.utils.python_utils import classproperty +from keras_cv.models.backbones.backbone_presets import backbone_presets +from keras_cv.models.backbones.backbone_presets import ( + backbone_presets_with_weights, +) +from keras_cv.models.classification.video_classifier_presets import ( + classifier_presets, +) + +@keras_cv_export( + [ + "keras_cv.models.VideoClassifier", + "keras_cv.models.classification.VideoClassifier", + ] +) +class VideoClassifier(Task): + """Video classifier with pooling and dense layer prediction head. + + Args: + backbone: `keras.Model` instance, the backbone architecture of the + classifier called on the inputs. Pooling will be called on the last + dimension of the backbone output. + num_classes: int, number of classes to predict. + pooling: str, type of pooling layer. Must be one of "avg", "max". + activation: Optional `str` or callable, defaults to "softmax". The + activation function to use on the Dense layer. Set `activation=None` + to return the output logits. 
+ + Example: + ```python + input_data = keras.ops.ones(shape=(1, 32, 224, 224, 3)) + + # Pretrained classifier (e.g., for imagenet categories) + model = keras_cv.models.VideoClassifier.from_preset( + "videoswin_tiny_imagenet_classifier", + ) + output = model(input_data) + + # Pretrained backbone + backbone = keras_cv.models.VideoSwinBackbone.from_preset( + "videoswin_tiny_imagenet", + ) + model = keras_cv.models.VideoClassifier( + backbone=backbone, + num_classes=400, + ) + output = model(input_data) + + # Randomly initialized backbone with a custom config + model = keras_cv.models.VideoClassifier( + backbone=keras_cv.models.VideoSwinBackbone(), + num_classes=400, + ) + output = model(input_data) + ``` + """ + + def __init__( + self, + backbone, + num_classes, + pooling="avg", + activation="softmax", + **kwargs, + ): + if pooling == "avg": + pooling_layer = keras.layers.GlobalAveragePooling3D(name="avg_pool") + elif pooling == "max": + pooling_layer = keras.layers.GlobalMaxPooling3D(name="max_pool") + else: + raise ValueError( + f'`pooling` must be one of "avg", "max". Received: {pooling}.' + ) + inputs = backbone.input + x = backbone(inputs) + x = pooling_layer(x) + outputs = keras.layers.Dense( + num_classes, + activation=activation, + name="predictions", + dtype='float32' + )(x) + + # Instantiate using Functional API Model constructor + super().__init__( + inputs=inputs, + outputs=outputs, + **kwargs, + ) + # All references to `self` below this line + self.backbone = backbone + self.num_classes = num_classes + self.pooling = pooling + self.activation = activation + + def get_config(self): + # Backbone serialized in `super` + config = super().get_config() + config.update( + { + "backbone": keras.layers.serialize(self.backbone), + "num_classes": self.num_classes, + "pooling": self.pooling, + "activation": self.activation, + } + ) + return config + + @classproperty + def presets(cls): + """Dictionary of preset names and configurations.""" + return copy.deepcopy({**backbone_presets, **classifier_presets}) + + @classproperty + def presets_with_weights(cls): + """Dictionary of preset names and configurations that include + weights.""" + return copy.deepcopy( + {**backbone_presets_with_weights, **classifier_presets} + ) + + @classproperty + def backbone_presets(cls): + """Dictionary of preset names and configurations of compatible + backbones.""" + return copy.deepcopy(backbone_presets) \ No newline at end of file diff --git a/keras_cv/models/classification/video_classifier_presets.py b/keras_cv/models/classification/video_classifier_presets.py new file mode 100644 index 0000000000..8c3d9a2d71 --- /dev/null +++ b/keras_cv/models/classification/video_classifier_presets.py @@ -0,0 +1,14 @@ +# Copyright 2023 The KerasCV Authors +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
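Because `VideoClassifier` is assembled with the functional API on top of a backbone, it compiles and trains like any other Keras model. A hedged end-to-end sketch (the optimizer, loss, and `train_ds`/`val_ds` datasets below are illustrative placeholders, not part of this PR):

```python
import keras_cv

# Build the task from a randomly initialized backbone; 400 classes
# matches the Kinetics-400 label space used by the presets.
model = keras_cv.models.VideoClassifier(
    backbone=keras_cv.models.VideoSwinBackbone(include_rescaling=True),
    num_classes=400,
    pooling="avg",
)
model.compile(
    optimizer="adam",
    loss="sparse_categorical_crossentropy",
    metrics=["accuracy"],
)
# model.fit(train_ds, validation_data=val_ds, epochs=3)
```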
+"""VideoClassifier Task presets.""" \ No newline at end of file From d2d883d741063dc9fb12b36b981a635d5859e227 Mon Sep 17 00:00:00 2001 From: innat Date: Fri, 1 Mar 2024 19:57:07 +0600 Subject: [PATCH 23/94] add: video swin classifier presets --- .../video_classifier_presets.py | 45 ++++++++++++++++++- 1 file changed, 44 insertions(+), 1 deletion(-) diff --git a/keras_cv/models/classification/video_classifier_presets.py b/keras_cv/models/classification/video_classifier_presets.py index 8c3d9a2d71..2a8447bd20 100644 --- a/keras_cv/models/classification/video_classifier_presets.py +++ b/keras_cv/models/classification/video_classifier_presets.py @@ -11,4 +11,47 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -"""VideoClassifier Task presets.""" \ No newline at end of file +"""VideoClassifier Task presets.""" + +classifier_presets = { + "videoswin_tiny_kinetics_classifier": { + "metadata": { + "description": ( + "videoswin_tiny_kinetics " # TODO: update + ), + "params": 25_613_800, + "official_name": "VideoClassifier", + "path": "video_classifier", + }, + }, + "videoswin_small_kinetics_classifier": { + "metadata": { + "description": ( + "videoswin_small_kinetics " # TODO: update + ), + "params": 25_613_800, # TODO: update + "official_name": "VideoClassifier", + "path": "video_classifier", + }, + }, + "videoswin_base_kinetics_classifier": { + "metadata": { + "description": ( + "videoswin_base_kinetics " # TODO: update + ), + "params": 25_613_800, # TODO: update + "official_name": "VideoClassifier", + "path": "video_classifier", + }, + }, + "videoswin_base_something_something_v2_classifier": { + "metadata": { + "description": ( + "videoswin_base_something_something_v2 " # TODO: update + ), + "params": 25_613_800, # TODO: update + "official_name": "VideoClassifier", + "path": "video_classifier", + }, + }, +} \ No newline at end of file From 125b2dc7fee2d7daeafb74a7a56670fcc06e48a8 Mon Sep 17 00:00:00 2001 From: innat Date: Fri, 1 Mar 2024 20:00:38 +0600 Subject: [PATCH 24/94] run formatters --- .../models/backbones/video_swin/__init__.py | 2 +- .../video_swin/video_swin_aliases.py | 17 ++-- .../video_swin/video_swin_backbone.py | 77 ++++++++++--------- .../video_swin/video_swin_backbone_presets.py | 16 +--- .../video_swin_backbone_presets_test.py | 11 ++- 5 files changed, 62 insertions(+), 61 deletions(-) diff --git a/keras_cv/models/backbones/video_swin/__init__.py b/keras_cv/models/backbones/video_swin/__init__.py index 1756010b15..3992ffb59a 100644 --- a/keras_cv/models/backbones/video_swin/__init__.py +++ b/keras_cv/models/backbones/video_swin/__init__.py @@ -10,4 +10,4 @@ # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and -# limitations under the License. \ No newline at end of file +# limitations under the License. diff --git a/keras_cv/models/backbones/video_swin/video_swin_aliases.py b/keras_cv/models/backbones/video_swin/video_swin_aliases.py index 2595011007..e18ca41f57 100644 --- a/keras_cv/models/backbones/video_swin/video_swin_aliases.py +++ b/keras_cv/models/backbones/video_swin/video_swin_aliases.py @@ -12,12 +12,12 @@ # See the License for the specific language governing permissions and # limitations under the License. 
-import copy -from keras_cv.models.backbones.video_swin.video_swin_backbone import VideoSwinBackbone +from keras_cv.models.backbones.video_swin.video_swin_backbone import ( + VideoSwinBackbone, +) from keras_cv.utils.python_utils import classproperty - ALIAS_DOCSTRING = """VideoSwin{size}Backbone model. Reference: @@ -37,6 +37,7 @@ ``` """ # noqa: E501 + class VideoSwinTBackbone(VideoSwinBackbone): def __new__( cls, @@ -49,7 +50,8 @@ def presets_with_weights(cls): """Dictionary of preset names and configurations that include weights.""" return cls.presets - + + class VideoSwinSBackbone(VideoSwinBackbone): def __new__( cls, @@ -62,7 +64,8 @@ def presets_with_weights(cls): """Dictionary of preset names and configurations that include weights.""" return cls.presets - + + class VideoSwinBBackbone(VideoSwinBackbone): def __new__( cls, @@ -75,8 +78,8 @@ def presets_with_weights(cls): """Dictionary of preset names and configurations that include weights.""" return cls.presets - + setattr(VideoSwinTBackbone, "__doc__", ALIAS_DOCSTRING.format(size="T")) setattr(VideoSwinSBackbone, "__doc__", ALIAS_DOCSTRING.format(size="S")) -setattr(VideoSwinBBackbone, "__doc__", ALIAS_DOCSTRING.format(size="B")) \ No newline at end of file +setattr(VideoSwinBBackbone, "__doc__", ALIAS_DOCSTRING.format(size="B")) diff --git a/keras_cv/models/backbones/video_swin/video_swin_backbone.py b/keras_cv/models/backbones/video_swin/video_swin_backbone.py index 4a069b78b7..014b39fc30 100644 --- a/keras_cv/models/backbones/video_swin/video_swin_backbone.py +++ b/keras_cv/models/backbones/video_swin/video_swin_backbone.py @@ -12,20 +12,18 @@ # See the License for the specific language governing permissions and # limitations under the License. -import copy -import numpy as np from functools import partial +import numpy as np from keras import layers + from keras_cv.api_export import keras_cv_export from keras_cv.backend import keras -from keras_cv.models import utils -from keras_cv.models.backbones.backbone import Backbone -from keras_cv.utils.python_utils import classproperty - from keras_cv.layers.video_swin_layers import VideoSwinBasicLayer from keras_cv.layers.video_swin_layers import VideoSwinPatchingAndEmbedding from keras_cv.layers.video_swin_layers import VideoSwinPatchMerging +from keras_cv.models import utils +from keras_cv.models.backbones.backbone import Backbone @keras_cv_export("keras_cv.models.VideoSwinBackbone", package="keras_cv.models") @@ -35,12 +33,12 @@ def __init__( *, include_rescaling, input_shape=(32, 224, 224, 3), - input_tensor=None, - embed_dim=96, - patch_size=[2, 4, 4], + input_tensor=None, + embed_dim=96, + patch_size=[2, 4, 4], window_size=[8, 7, 7], mlp_ratio=4.0, - patch_norm=True, + patch_norm=True, drop_rate=0.0, attn_drop_rate=0.0, drop_path_rate=0.2, @@ -48,9 +46,8 @@ def __init__( num_heads=[3, 6, 12, 24], qkv_bias=True, qk_scale=None, - **kwargs + **kwargs, ): - input_spec = utils.parse_model_inputs( input_shape, input_tensor, name="videos" ) @@ -67,7 +64,7 @@ def __init__( " be equal to the width in the `input_shape`" " tuple/tensor." 
) - + x = input_spec if include_rescaling: @@ -80,17 +77,17 @@ def __init__( patch_size=patch_size, embed_dim=embed_dim, norm_layer=norm_layer if patch_norm else None, - name='PatchEmbed3D' + name="PatchEmbed3D", )(x) - x = layers.Dropout(drop_rate, name='pos_drop')(x) - dpr = np.linspace(0., drop_path_rate, sum(depths)).tolist() + x = layers.Dropout(drop_rate, name="pos_drop")(x) + dpr = np.linspace(0.0, drop_path_rate, sum(depths)).tolist() num_layers = len(depths) - + for i in range(num_layers): layer = VideoSwinBasicLayer( - input_dim=int(embed_dim * 2 ** i), + input_dim=int(embed_dim * 2**i), depth=depths[i], num_heads=num_heads[i], window_size=window_size, @@ -99,16 +96,18 @@ def __init__( qk_scale=qk_scale, drop_rate=drop_rate, attn_drop_rate=attn_drop_rate, - drop_path_rate=dpr[sum(depths[:i]):sum(depths[:i + 1])], + drop_path_rate=dpr[sum(depths[:i]) : sum(depths[: i + 1])], norm_layer=norm_layer, - downsample=VideoSwinPatchMerging if (i < num_layers - 1) else None, - name=f'BasicLayer{i + 1}' + downsample=VideoSwinPatchMerging + if (i < num_layers - 1) + else None, + name=f"BasicLayer{i + 1}", ) x = layer(x) - x = norm_layer(axis=-1, epsilon=1e-05, name='norm')(x) + x = norm_layer(axis=-1, epsilon=1e-05, name="norm")(x) super().__init__(inputs=input_spec, outputs=x, **kwargs) - + self.embed_dim = embed_dim self.patch_size = patch_size self.window_size = window_size @@ -126,18 +125,20 @@ def __init__( def get_config(self): config = super().get_config() - config.update({ - "embed_dim": self.embed_dim, - "patch_norm": self.patch_norm, - "window_size": self.window_size, - "patch_size": self.patch_size, - "mlp_ratio": self.mlp_ratio, - "drop_rate": self.drop_rate, - "drop_path_rate": self.drop_path_rate, - "attn_drop_rate": self.attn_drop_rate, - "depths": self.depths, - "num_heads": self.num_heads, - "qkv_bias": self.qkv_bias, - "qk_scale": self.qk_scale, - }) - return config \ No newline at end of file + config.update( + { + "embed_dim": self.embed_dim, + "patch_norm": self.patch_norm, + "window_size": self.window_size, + "patch_size": self.patch_size, + "mlp_ratio": self.mlp_ratio, + "drop_rate": self.drop_rate, + "drop_path_rate": self.drop_path_rate, + "attn_drop_rate": self.attn_drop_rate, + "depths": self.depths, + "num_heads": self.num_heads, + "qkv_bias": self.qkv_bias, + "qk_scale": self.qk_scale, + } + ) + return config diff --git a/keras_cv/models/backbones/video_swin/video_swin_backbone_presets.py b/keras_cv/models/backbones/video_swin/video_swin_backbone_presets.py index 7d7ebec73f..ff7827f3b8 100644 --- a/keras_cv/models/backbones/video_swin/video_swin_backbone_presets.py +++ b/keras_cv/models/backbones/video_swin/video_swin_backbone_presets.py @@ -16,34 +16,26 @@ backbone_presets_no_weights = { "videoswin_tiny": { "metadata": { - "description": ( - "Video Swin backbone " # TODO: update - ), + "description": ("Video Swin backbone "), # TODO: update "params": 27_850_470, "official_name": "VideoSwinT", "path": "video_swin", }, }, - "videoswin_small": { "metadata": { - "description": ( - "Video Swin backbone " # TODO: update - ), + "description": ("Video Swin backbone "), # TODO: update "params": 49_509_078, "official_name": "VideoSwinS", "path": "video_swin", }, }, - "videoswin_base": { "metadata": { - "description": ( - "Video Swin backbone " # TODO: update - ), + "description": ("Video Swin backbone "), # TODO: update "params": 87_638_984, "official_name": "VideoSwinB", "path": "video_swin", }, }, -} \ No newline at end of file +} diff --git 
a/keras_cv/models/backbones/video_swin/video_swin_backbone_presets_test.py b/keras_cv/models/backbones/video_swin/video_swin_backbone_presets_test.py index 77d80bb4d3..88b4763204 100644 --- a/keras_cv/models/backbones/video_swin/video_swin_backbone_presets_test.py +++ b/keras_cv/models/backbones/video_swin/video_swin_backbone_presets_test.py @@ -16,10 +16,15 @@ import numpy as np import pytest -from keras_cv.models.backbones.video_swin.video_swin_backbone import VideoSwinBackbone -from keras_cv.models.backbones.video_swin.video_swin_aliases import VideoSwinTBackbone +from keras_cv.models.backbones.video_swin.video_swin_aliases import ( + VideoSwinTBackbone, +) +from keras_cv.models.backbones.video_swin.video_swin_backbone import ( + VideoSwinBackbone, +) from keras_cv.tests.test_case import TestCase + @pytest.mark.large class VideoSwinPresetSmokeTest(TestCase): """A smoke test for VideoSwin presets we run continuously. @@ -65,4 +70,4 @@ def test_load_ViTDet(self): input_data = np.ones(shape=(1, 1024, 1024, 3)) for preset in VideoSwinBackbone.presets: model = VideoSwinBackbone.from_preset(preset) - model(input_data) \ No newline at end of file + model(input_data) From 98273022524781ef33f08614ff597e82d64674db Mon Sep 17 00:00:00 2001 From: innat Date: Sat, 2 Mar 2024 19:12:25 +0600 Subject: [PATCH 25/94] rename module name/id" --- keras_cv/layers/video_swin_layers.py | 5 +++-- .../models/backbones/video_swin/video_swin_backbone.py | 8 +++----- 2 files changed, 6 insertions(+), 7 deletions(-) diff --git a/keras_cv/layers/video_swin_layers.py b/keras_cv/layers/video_swin_layers.py index 739fb2cf7d..af682ea27c 100644 --- a/keras_cv/layers/video_swin_layers.py +++ b/keras_cv/layers/video_swin_layers.py @@ -12,8 +12,6 @@ # See the License for the specific language governing permissions and # limitations under the License. -from functools import partial - import numpy as np from keras import layers @@ -679,6 +677,9 @@ def get_config(self): return config +@keras_cv_export( + "keras_cv.layers.VideoSwinTransformerBlock", package="keras_cv.layers" +) class VideoSwinTransformerBlock(keras.Model): """Swin Transformer Block. 
diff --git a/keras_cv/models/backbones/video_swin/video_swin_backbone.py b/keras_cv/models/backbones/video_swin/video_swin_backbone.py
index 014b39fc30..2d06a48a43 100644
--- a/keras_cv/models/backbones/video_swin/video_swin_backbone.py
+++ b/keras_cv/models/backbones/video_swin/video_swin_backbone.py
@@ -77,14 +77,12 @@ def __init__(
             patch_size=patch_size,
             embed_dim=embed_dim,
             norm_layer=norm_layer if patch_norm else None,
-            name="PatchEmbed3D",
+            name="videoswin_patching_and_embedding",
         )(x)
-        x = layers.Dropout(drop_rate, name="pos_drop")(x)
 
-        dpr = np.linspace(0.0, drop_path_rate, sum(depths)).tolist()
+        x = layers.Dropout(drop_rate, name="pos_drop")(x)
 
+        dpr = np.linspace(0.0, drop_path_rate, sum(depths)).tolist()
         num_layers = len(depths)
-
         for i in range(num_layers):
             layer = VideoSwinBasicLayer(
                 input_dim=int(embed_dim * 2**i),
@@ -101,7 +99,7 @@ def __init__(
                 downsample=VideoSwinPatchMerging
                 if (i < num_layers - 1)
                 else None,
-                name=f"BasicLayer{i + 1}",
+                name=f"videoswin_basic_layer_{i + 1}",
             )
             x = layer(x)

From 89a715aaaf44e5be4ad8a3a03d4123c46df6efc0 Mon Sep 17 00:00:00 2001
From: innat
Date: Sat, 2 Mar 2024 19:22:18 +0600
Subject: [PATCH 26/94] add hard-coded normalization for include rescaling=true

---
 .../backbones/video_swin/video_swin_backbone.py | 11 +++++++++++
 1 file changed, 11 insertions(+)

diff --git a/keras_cv/models/backbones/video_swin/video_swin_backbone.py b/keras_cv/models/backbones/video_swin/video_swin_backbone.py
index 2d06a48a43..48d66cead0 100644
--- a/keras_cv/models/backbones/video_swin/video_swin_backbone.py
+++ b/keras_cv/models/backbones/video_swin/video_swin_backbone.py
@@ -16,6 +16,7 @@
 import numpy as np
 from keras import layers
+from keras_cv.backend import ops
 
 from keras_cv.api_export import keras_cv_export
 from keras_cv.backend import keras
@@ -71,6 +72,16 @@ def __init__(
             # Use common rescaling strategy across keras_cv
             x = keras.layers.Rescaling(1.0 / 255.0)(x)
 
+            # Video Swin scales inputs based on the standard ImageNet mean/stddev.
+            # Officially, Video Swin takes tensors in the [0-255] range.
+            # It uses mean=[123.675, 116.28, 103.53] and
+            # std=[58.395, 57.12, 57.375] for normalization.
+            # So, if include_rescaling is set to True, then, to match with the
+            # official scores, the following normalization should be added.
+            x = (x - ops.array([0.485, 0.456, 0.406], dtype=x.dtype)) / (
+                ops.array([0.229, 0.224, 0.225], dtype=x.dtype)
+            )
+
         norm_layer = partial(layers.LayerNormalization, epsilon=1e-05)
 
         x = VideoSwinPatchingAndEmbedding(

From 36db030b4398f4051553f6b2f084069e1b1e730c Mon Sep 17 00:00:00 2001
From: innat
Date: Sat, 2 Mar 2024 19:37:17 +0600
Subject: [PATCH 27/94] add docstring for videoswin backbone

---
 .../video_swin/video_swin_backbone.py | 43 ++++++++++++++++++-
 1 file changed, 41 insertions(+), 2 deletions(-)

diff --git a/keras_cv/models/backbones/video_swin/video_swin_backbone.py b/keras_cv/models/backbones/video_swin/video_swin_backbone.py
index 48d66cead0..6efee5d5d6 100644
--- a/keras_cv/models/backbones/video_swin/video_swin_backbone.py
+++ b/keras_cv/models/backbones/video_swin/video_swin_backbone.py
@@ -29,6 +29,45 @@
 @keras_cv_export("keras_cv.models.VideoSwinBackbone", package="keras_cv.models")
 class VideoSwinBackbone(Backbone):
+    """A Video Swin Transformer backbone model.
+
+    Args:
+        input_shape (tuple[int], optional): The size of the input video in
+            `(depth, height, width, channel)` format.
+            Defaults to `(32, 224, 224, 3)`.
+        input_tensor (KerasTensor, optional): Output of
+            `keras.layers.Input()` to use as video input for the model.
+            Defaults to `None`.
+ include_rescaling (bool, optional): Whether to rescale the inputs. If + set to `True`, inputs will be passed through a + `Rescaling(1/255.0)` layer and normalize with + mean=[0.485, 0.456, 0.406] and std=[0.229, 0.224, 0.225], + Defaults to `False`. + patch_size (int | tuple(int)): Patch size. Default: (2,4,4). + embed_dim (int): Number of linear projection output channels. + Default to 96. + depths (tuple[int]): Depths of each Swin Transformer stage. + Default to [2, 2, 6, 2] + num_heads (tuple[int]): Number of attention head of each stage. + Default to [3, 6, 12, 24] + window_size (int): Window size. Default to [8, 7, 7]. + mlp_ratio (float): Ratio of mlp hidden dim to embedding dim. + Default to 4. + qkv_bias (bool): If True, add a learnable bias to query, key, value. + Default to True. + qk_scale (float): Override default qk scale of head_dim ** -0.5 if set. + Default to None. + drop_rate (float): Dropout rate. + attn_drop_rate (float): Attention dropout rate. Default: 0. + drop_path_rate (float): Stochastic depth rate. Default: 0.2. + patch_norm (bool): If True, add normalization after patch embedding. + Default to False. + + References: + - [Video Swin Transformer](https://arxiv.org/abs/2106.13230) + - [Official Code](https://github.com/SwinTransformer/Video-Swin-Transformer) + """ # noqa: E501 + def __init__( self, *, @@ -54,9 +93,9 @@ def __init__( ) # Check that the input video is well specified. - if input_spec.shape[-3] is None or input_spec.shape[-2] is None: + if input_spec.shape[-4] is None or input_spec.shape[-3] is None or input_spec.shape[-2] is None: raise ValueError( - "Height and width of the video must be specified" + "Depth, Height and width of the video must be specified" " in `input_shape`." ) if input_spec.shape[-3] != input_spec.shape[-2]: From 7aa27a4aa881f906c8c00c89a59d86053d43ef7d Mon Sep 17 00:00:00 2001 From: innat Date: Sat, 2 Mar 2024 21:51:22 +0600 Subject: [PATCH 28/94] update metadata: backbone presets no weights --- .../backbones/video_swin/video_swin_backbone_presets.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/keras_cv/models/backbones/video_swin/video_swin_backbone_presets.py b/keras_cv/models/backbones/video_swin/video_swin_backbone_presets.py index ff7827f3b8..f1e330ea33 100644 --- a/keras_cv/models/backbones/video_swin/video_swin_backbone_presets.py +++ b/keras_cv/models/backbones/video_swin/video_swin_backbone_presets.py @@ -16,7 +16,7 @@ backbone_presets_no_weights = { "videoswin_tiny": { "metadata": { - "description": ("Video Swin backbone "), # TODO: update + "description": ("A tiny Video Swin backbone architecture."), "params": 27_850_470, "official_name": "VideoSwinT", "path": "video_swin", @@ -24,7 +24,7 @@ }, "videoswin_small": { "metadata": { - "description": ("Video Swin backbone "), # TODO: update + "description": ("A small Video Swin backbone architecture."), "params": 49_509_078, "official_name": "VideoSwinS", "path": "video_swin", @@ -32,7 +32,7 @@ }, "videoswin_base": { "metadata": { - "description": ("Video Swin backbone "), # TODO: update + "description": ("A base Video Swin backbone architecture."), "params": 87_638_984, "official_name": "VideoSwinB", "path": "video_swin", From 62a87032d4272580b1539478d96e3a4db0ea80b4 Mon Sep 17 00:00:00 2001 From: innat Date: Sat, 2 Mar 2024 22:10:56 +0600 Subject: [PATCH 29/94] update: backbone presets no weights test --- .../backbones/video_swin/video_swin_backbone_presets_test.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git 
a/keras_cv/models/backbones/video_swin/video_swin_backbone_presets_test.py b/keras_cv/models/backbones/video_swin/video_swin_backbone_presets_test.py index 88b4763204..80996fcbfa 100644 --- a/keras_cv/models/backbones/video_swin/video_swin_backbone_presets_test.py +++ b/keras_cv/models/backbones/video_swin/video_swin_backbone_presets_test.py @@ -28,7 +28,7 @@ @pytest.mark.large class VideoSwinPresetSmokeTest(TestCase): """A smoke test for VideoSwin presets we run continuously. - This only tests the smallest weights we have available. Run with: + Run with: `pytest keras_cv/models/backbones/video_swin/video_swin_backbone_presets_test.py --run_large` # noqa: E501 """ @@ -67,7 +67,7 @@ class VideoSwinPresetFullTest(TestCase): """ def test_load_ViTDet(self): - input_data = np.ones(shape=(1, 1024, 1024, 3)) + input_data = np.ones(shape=(1, 32, 224, 224, 3)) for preset in VideoSwinBackbone.presets: model = VideoSwinBackbone.from_preset(preset) model(input_data) From aad56618b101b91dada422627618508f6a521292 Mon Sep 17 00:00:00 2001 From: innat Date: Sat, 2 Mar 2024 22:38:18 +0600 Subject: [PATCH 30/94] update video swin aliases for no weights --- .../video_swin/video_swin_aliases.py | 39 +++++++++++++------ 1 file changed, 27 insertions(+), 12 deletions(-) diff --git a/keras_cv/models/backbones/video_swin/video_swin_aliases.py b/keras_cv/models/backbones/video_swin/video_swin_aliases.py index e18ca41f57..0ffeaa7ab1 100644 --- a/keras_cv/models/backbones/video_swin/video_swin_aliases.py +++ b/keras_cv/models/backbones/video_swin/video_swin_aliases.py @@ -41,43 +41,58 @@ class VideoSwinTBackbone(VideoSwinBackbone): def __new__( cls, + include_rescaling=False, **kwargs, ): + kwargs.update( + { + "include_rescaling": include_rescaling, + } + ) return VideoSwinBackbone.from_preset("videoswin_tiny", **kwargs) @classproperty - def presets_with_weights(cls): - """Dictionary of preset names and configurations that include - weights.""" - return cls.presets + def presets(cls): + """Dictionary of preset names and configurations.""" + return {} class VideoSwinSBackbone(VideoSwinBackbone): def __new__( cls, + include_rescaling=False, **kwargs, ): + kwargs.update( + { + "include_rescaling": include_rescaling, + } + ) return VideoSwinBackbone.from_preset("videoswin_small", **kwargs) @classproperty - def presets_with_weights(cls): - """Dictionary of preset names and configurations that include - weights.""" - return cls.presets + def presets(cls): + """Dictionary of preset names and configurations.""" + return {} class VideoSwinBBackbone(VideoSwinBackbone): def __new__( cls, + include_rescaling=False, **kwargs, ): + kwargs.update( + { + "include_rescaling": include_rescaling, + } + ) return VideoSwinBackbone.from_preset("videoswin_base", **kwargs) @classproperty - def presets_with_weights(cls): - """Dictionary of preset names and configurations that include - weights.""" - return cls.presets + def presets(cls): + """Dictionary of preset names and configurations.""" + return {} setattr(VideoSwinTBackbone, "__doc__", ALIAS_DOCSTRING.format(size="T")) From 048d85ad3b4c527ab93abb9bbfca60f67ba36b09 Mon Sep 17 00:00:00 2001 From: innat Date: Sat, 2 Mar 2024 22:53:46 +0600 Subject: [PATCH 31/94] add: video swin backbone presets with weights --- .../video_swin/video_swin_backbone_presets.py | 63 +++++++++++++++++++ 1 file changed, 63 insertions(+) diff --git a/keras_cv/models/backbones/video_swin/video_swin_backbone_presets.py b/keras_cv/models/backbones/video_swin/video_swin_backbone_presets.py index 
f1e330ea33..801c5de906 100644 --- a/keras_cv/models/backbones/video_swin/video_swin_backbone_presets.py +++ b/keras_cv/models/backbones/video_swin/video_swin_backbone_presets.py @@ -39,3 +39,66 @@ }, }, } + +backbone_presets_with_weights = { + "videoswin_tiny_kinetics400": { + "metadata": { + "description": ( + "A tiny Video Swin backbone architecture. " + "It is pretrained on ImageNet 1K dataset, and " + "trained on Kinetics 400 dataset." + ), + "params": 27_850_470, + "official_name": "VideoSwinT", + "path": "video_swin", + }, + }, + "videoswin_small_kinetics400": { + "metadata": { + "description": ( + "A small Video Swin backbone architecture. " + "It is pretrained on ImageNet 1K dataset, and " + "trained on Kinetics 400 dataset." + ), + "params": 49_509_078, + "official_name": "VideoSwinS", + "path": "video_swin", + }, + }, + "videoswin_base_kinetics400": { + "metadata": { + "description": ( + "A base Video Swin backbone architecture. " + "It is pretrained on ImageNet 1K dataset, and " + "trained on Kinetics 400 dataset." + ), + "params": 87_638_984, + "official_name": "VideoSwinB", + "path": "video_swin", + }, + }, + "videoswin_base_kinetics600": { + "metadata": { + "description": ( + "A base Video Swin backbone architecture. " + "It is pretrained on ImageNet 22K dataset, and " + "trained on Kinetics 600 dataset." + ), + "params": 87_638_984, + "official_name": "VideoSwinB", + "path": "video_swin", + }, + }, + "videoswin_base_something_something_v2": { + "metadata": { + "description": ( + "A base Video Swin backbone architecture. " + "It is pretrained on Kinetics 400 dataset, and " + "trained on Something Something V2 dataset." + ), + "params": 87_638_984, + "official_name": "VideoSwinB", + "path": "video_swin", + }, + }, +} From 1423e838e71486ba73d5a155c013009aa7ff9b9a Mon Sep 17 00:00:00 2001 From: innat Date: Sat, 2 Mar 2024 22:59:10 +0600 Subject: [PATCH 32/94] update: video swin aliases with weights presets --- .../video_swin/video_swin_aliases.py | 39 +++++++++++++++++-- .../video_swin/video_swin_backbone_presets.py | 5 +++ 2 files changed, 40 insertions(+), 4 deletions(-) diff --git a/keras_cv/models/backbones/video_swin/video_swin_aliases.py b/keras_cv/models/backbones/video_swin/video_swin_aliases.py index 0ffeaa7ab1..ca31cac84e 100644 --- a/keras_cv/models/backbones/video_swin/video_swin_aliases.py +++ b/keras_cv/models/backbones/video_swin/video_swin_aliases.py @@ -12,11 +12,12 @@ # See the License for the specific language governing permissions and # limitations under the License. - +import copy from keras_cv.models.backbones.video_swin.video_swin_backbone import ( VideoSwinBackbone, ) from keras_cv.utils.python_utils import classproperty +from keras_cv.models.backbones.video_swin.video_swin_backbone_presets import backbone_presets ALIAS_DOCSTRING = """VideoSwin{size}Backbone model. 
@@ -54,7 +55,17 @@ def __new__( @classproperty def presets(cls): """Dictionary of preset names and configurations.""" - return {} + return { + "videoswin_tiny_kinetics400": copy.deepcopy( + backbone_presets["videoswin_tiny_kinetics400"] + ), + } + + @classproperty + def presets_with_weights(cls): + """Dictionary of preset names and configurations that include + weights.""" + return cls.presets class VideoSwinSBackbone(VideoSwinBackbone): @@ -73,7 +84,17 @@ def __new__( @classproperty def presets(cls): """Dictionary of preset names and configurations.""" - return {} + return { + "videoswin_small_kinetics400": copy.deepcopy( + backbone_presets["videoswin_small_kinetics400"] + ), + } + + @classproperty + def presets_with_weights(cls): + """Dictionary of preset names and configurations that include + weights.""" + return cls.presets class VideoSwinBBackbone(VideoSwinBackbone): @@ -92,7 +113,17 @@ def __new__( @classproperty def presets(cls): """Dictionary of preset names and configurations.""" - return {} + return { + "videoswin_base_kinetics400": copy.deepcopy( + backbone_presets["videoswin_base_kinetics400"] + ), + } + + @classproperty + def presets_with_weights(cls): + """Dictionary of preset names and configurations that include + weights.""" + return cls.presets setattr(VideoSwinTBackbone, "__doc__", ALIAS_DOCSTRING.format(size="T")) diff --git a/keras_cv/models/backbones/video_swin/video_swin_backbone_presets.py b/keras_cv/models/backbones/video_swin/video_swin_backbone_presets.py index 801c5de906..d76054f1fb 100644 --- a/keras_cv/models/backbones/video_swin/video_swin_backbone_presets.py +++ b/keras_cv/models/backbones/video_swin/video_swin_backbone_presets.py @@ -102,3 +102,8 @@ }, }, } + +backbone_presets = { + **backbone_presets_no_weights, + **backbone_presets_with_weights, +} \ No newline at end of file From 2eaf8b08d5ca60719450e93b722ccec950f027e0 Mon Sep 17 00:00:00 2001 From: innat Date: Sat, 2 Mar 2024 23:10:51 +0600 Subject: [PATCH 33/94] update video swin layer test cases --- keras_cv/layers/video_swin_layers_test.py | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/keras_cv/layers/video_swin_layers_test.py b/keras_cv/layers/video_swin_layers_test.py index 81bedcaa1b..eaae3c1fa2 100644 --- a/keras_cv/layers/video_swin_layers_test.py +++ b/keras_cv/layers/video_swin_layers_test.py @@ -41,9 +41,9 @@ def test_patch_embedding_get_config(self): class TestVideoSwinWindowAttention(TestCase): - @pytest.fixture - def window_attention_model(self): - return VideoSwinWindowAttention( + + def setUp(self): + self.window_attention_model = VideoSwinWindowAttention( window_size=(2, 4, 4), num_heads=8, qkv_bias=True, @@ -52,16 +52,16 @@ def window_attention_model(self): proj_drop_rate=0.1, ) - def test_window_attention_output_shape(self, window_attention_model): + def test_window_attention_output_shape(self): input_shape = (4, 10, 256) input_array = ops.ones(input_shape) - output_shape = window_attention_model(input_array).shape + output_shape = self.window_attention_model(input_array).shape expected_output_shape = input_shape self.assertEqual(output_shape, expected_output_shape) - def test_window_attention_get_config(self, window_attention_model): - config = window_attention_model.get_config() - # Add assertions based on your specific requirements + def test_window_attention_get_config(self): + config = self.window_attention_model.get_config() + # Add assertions based on the specific requirements assert isinstance(config, dict) assert config["window_size"] == 
(2, 4, 4)
+        assert config["num_heads"] == 8

From f713304465316d4219630bea75c84a4f12beb012 Mon Sep 17 00:00:00 2001
From: innat
Date: Sat, 2 Mar 2024 23:32:11 +0600
Subject: [PATCH 34/94] added patch merging test

---
 keras_cv/layers/video_swin_layers_test.py | 22 ++++++++++++++++++--
 1 file changed, 20 insertions(+), 2 deletions(-)

diff --git a/keras_cv/layers/video_swin_layers_test.py b/keras_cv/layers/video_swin_layers_test.py
index eaae3c1fa2..951d46d159 100644
--- a/keras_cv/layers/video_swin_layers_test.py
+++ b/keras_cv/layers/video_swin_layers_test.py
@@ -12,11 +12,11 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-import pytest
 
 from keras_cv.backend import ops
 from keras_cv.layers.video_swin_layers import VideoSwinPatchingAndEmbedding
 from keras_cv.layers.video_swin_layers import VideoSwinWindowAttention
+from keras_cv.layers.video_swin_layers import VideoSwinPatchMerging
 from keras_cv.tests.test_case import TestCase
 
@@ -68,4 +68,22 @@ def test_window_attention_get_config(self):
         assert config["qkv_bias"] is True
         assert config["qk_scale"] is None
         assert config["attn_drop_rate"] == 0.1
-        assert config["proj_drop_rate"] == 0.1
\ No newline at end of file
+        assert config["proj_drop_rate"] == 0.1
+
+
+class TestVideoSwinPatchMerging(TestCase):
+    def setUp(self):
+        self.patch_merging = VideoSwinPatchMerging(input_dim=32)
+
+    def test_output_shape(self):
+        input_shape = (2, 4, 32, 32, 3)
+        input_tensor = ops.ones(input_shape)
+        output_shape = self.patch_merging(input_tensor).shape
+        expected_shape = (
+            input_shape[0],
+            input_shape[1],
+            input_shape[2] // 2,
+            input_shape[3] // 2,
+            input_shape[4] * 4
+        )
+        self.assertEqual(output_shape, expected_shape)

From 44dae81a76469b5842d7d5ee5790db314b3951dd Mon Sep 17 00:00:00 2001
From: innat
Date: Sat, 2 Mar 2024 23:35:49 +0600
Subject: [PATCH 35/94] imported video swin presets to backbone presets list

---
 keras_cv/layers/video_swin_layers.py | 3 ++-
 keras_cv/models/backbones/backbone_presets.py | 3 +++
 2 files changed, 5 insertions(+), 1 deletion(-)

diff --git a/keras_cv/layers/video_swin_layers.py b/keras_cv/layers/video_swin_layers.py
index af682ea27c..ff4dd212dc 100644
--- a/keras_cv/layers/video_swin_layers.py
+++ b/keras_cv/layers/video_swin_layers.py
@@ -887,4 +887,5 @@ def get_config(self):
                 "activation": self._activation_identifier
             }
         )
-        return config
\ No newline at end of file
+        return config
+
\ No newline at end of file
diff --git a/keras_cv/models/backbones/backbone_presets.py b/keras_cv/models/backbones/backbone_presets.py
index 95d3ccd522..93d9595c6f 100644
--- a/keras_cv/models/backbones/backbone_presets.py
+++ b/keras_cv/models/backbones/backbone_presets.py
@@ -30,6 +30,7 @@
 from keras_cv.models.backbones.resnet_v2 import resnet_v2_backbone_presets
 from keras_cv.models.backbones.vit_det import vit_det_backbone_presets
 from keras_cv.models.object_detection.yolo_v8 import yolo_v8_backbone_presets
+from keras_cv.models.backbones.video_swin import video_swin_backbone_presets
 
 backbone_presets_no_weights = {
     **resnet_v1_backbone_presets.backbone_presets_no_weights,
@@ -42,6 +43,7 @@
     **efficientnet_lite_backbone_presets.backbone_presets_no_weights,
     **yolo_v8_backbone_presets.backbone_presets_no_weights,
     **vit_det_backbone_presets.backbone_presets_no_weights,
+    **video_swin_backbone_presets.backbone_presets_no_weights,
 }
 
 backbone_presets_with_weights = {
@@ -55,6 +57,7 @@
     **efficientnet_lite_backbone_presets.backbone_presets_with_weights,
**yolo_v8_backbone_presets.backbone_presets_with_weights, **vit_det_backbone_presets.backbone_presets_with_weights, + **video_swin_backbone_presets.backbone_presets_with_weights, } backbone_presets = { From daca84f121a516f80f36412e2a08c38f6a89e1af Mon Sep 17 00:00:00 2001 From: innat Date: Sat, 2 Mar 2024 23:45:17 +0600 Subject: [PATCH 36/94] fix: typos" --- keras_cv/layers/video_swin_layers.py | 15 +++++++-------- .../backbones/video_swin/video_swin_backbone.py | 3 ++- .../video_swin/video_swin_backbone_presets.py | 2 +- 3 files changed, 10 insertions(+), 10 deletions(-) diff --git a/keras_cv/layers/video_swin_layers.py b/keras_cv/layers/video_swin_layers.py index ff4dd212dc..b3eb10d5ed 100644 --- a/keras_cv/layers/video_swin_layers.py +++ b/keras_cv/layers/video_swin_layers.py @@ -354,7 +354,6 @@ def call(self, x): [0, 0] ] x = ops.pad(x, paddings) - x0 = x[:, :, 0::2, 0::2, :] # B D H/2 W/2 C x1 = x[:, :, 1::2, 0::2, :] # B D H/2 W/2 C x2 = x[:, :, 0::2, 1::2, :] # B D H/2 W/2 C @@ -465,7 +464,6 @@ def call(self, x, mask=None, training=None): qkv = ops.reshape(qkv, [batch_size, depth, 3, self.num_heads, channel // self.num_heads]) qkv = ops.transpose(qkv, [2, 0, 3, 1, 4]) q, k, v = ops.split(qkv, 3, axis=0) - q = ops.squeeze(q, axis=0) * self.scale k = ops.squeeze(k, axis=0) v = ops.squeeze(v, axis=0) @@ -618,13 +616,13 @@ def compute_output_shape(self, input_shape): window_size, _ = get_window_size( input_shape[1:-1], self.window_size, self.shift_size ) - depth_p = self._compute_dim_padded(input_shape[1], window_size[0]) - height_p = self._compute_dim_padded(input_shape[2], window_size[1]) - width_p = self._compute_dim_padded(input_shape[3], window_size[2]) + depth_pad = self._compute_dim_padded(input_shape[1], window_size[0]) + height_pad = self._compute_dim_padded(input_shape[2], window_size[1]) + width_pad = self._compute_dim_padded(input_shape[3], window_size[2]) if self.downsample is not None: output_shape = ( - input_shape[0], depth_p, height_p // 2, width_p // 2, 2*self.input_dim + input_shape[0], depth_pad, height_pad // 2, width_pad // 2, 2*self.input_dim ) return output_shape @@ -736,6 +734,7 @@ def __init__( for i, (shift, window) in enumerate(zip(self.shift_size, self.window_size)): if not (0 <= shift < window): + # TODO: Add more description. raise ValueError( f"shift_size[{i}] must be in the range 0 to window_size[{i}]" ) @@ -799,7 +798,7 @@ def first_forward(self, x, mask_matrix, training): x = ops.pad(x, paddings) input_shape = ops.shape(x) - depth_p, height_p, width_p = ( + depth_pad, height_pad, width_pad = ( input_shape[1], input_shape[2], input_shape[3], @@ -827,7 +826,7 @@ def first_forward(self, x, mask_matrix, training): # reverse the swin windows shifted_x = window_reverse( - attn_windows, window_size, batch_size, depth_p, height_p, width_p + attn_windows, window_size, batch_size, depth_pad, height_pad, width_pad ) # reverse cyclic shift diff --git a/keras_cv/models/backbones/video_swin/video_swin_backbone.py b/keras_cv/models/backbones/video_swin/video_swin_backbone.py index 6efee5d5d6..baf44161cc 100644 --- a/keras_cv/models/backbones/video_swin/video_swin_backbone.py +++ b/keras_cv/models/backbones/video_swin/video_swin_backbone.py @@ -88,6 +88,7 @@ def __init__( qk_scale=None, **kwargs, ): + # Parse input specification. input_spec = utils.parse_model_inputs( input_shape, input_tensor, name="videos" ) @@ -95,7 +96,7 @@ def __init__( # Check that the input video is well specified. 
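The `compute_output_shape` hunk above first pads each spatio-temporal dimension up to a multiple of the window size (the `depth_pad`/`height_pad`/`width_pad` variables renamed in this commit) before halving height and width in the downsample branch. A standalone sketch of that arithmetic; the body of `_compute_dim_padded` is not shown in this diff, so the ceil-to-multiple behavior is an assumption:

```python
import math

def compute_dim_padded(input_dim, window_dim_size):
    # Assumed behavior: round the dimension up to the nearest multiple
    # of the window size so windows tile the padded volume exactly.
    return int(math.ceil(input_dim / window_dim_size) * window_dim_size)

print(compute_dim_padded(30, 8))  # 32: depth padded to a window multiple
print(compute_dim_padded(56, 7))  # 56: already a multiple, unchanged
```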
if input_spec.shape[-4] is None or input_spec.shape[-3] is None or input_spec.shape[-2] is None: raise ValueError( - "Depth, Height and width of the video must be specified" + "Depth, height and width of the video must be specified" " in `input_shape`." ) if input_spec.shape[-3] != input_spec.shape[-2]: diff --git a/keras_cv/models/backbones/video_swin/video_swin_backbone_presets.py b/keras_cv/models/backbones/video_swin/video_swin_backbone_presets.py index d76054f1fb..bd06d137c3 100644 --- a/keras_cv/models/backbones/video_swin/video_swin_backbone_presets.py +++ b/keras_cv/models/backbones/video_swin/video_swin_backbone_presets.py @@ -106,4 +106,4 @@ backbone_presets = { **backbone_presets_no_weights, **backbone_presets_with_weights, -} \ No newline at end of file +} From b1a5427771dca159ac4a4e8fa054b076bb1d90d8 Mon Sep 17 00:00:00 2001 From: innat Date: Sat, 2 Mar 2024 23:51:46 +0600 Subject: [PATCH 37/94] run formatters" --- keras_cv/models/backbones/backbone_presets.py | 2 +- .../video_swin/video_swin_aliases.py | 5 ++- .../video_swin/video_swin_backbone.py | 34 +++++++++++-------- .../video_classifier_presets.py | 22 +++++------- 4 files changed, 32 insertions(+), 31 deletions(-) diff --git a/keras_cv/models/backbones/backbone_presets.py b/keras_cv/models/backbones/backbone_presets.py index 93d9595c6f..b77163aa8f 100644 --- a/keras_cv/models/backbones/backbone_presets.py +++ b/keras_cv/models/backbones/backbone_presets.py @@ -28,9 +28,9 @@ from keras_cv.models.backbones.mobilenet_v3 import mobilenet_v3_backbone_presets from keras_cv.models.backbones.resnet_v1 import resnet_v1_backbone_presets from keras_cv.models.backbones.resnet_v2 import resnet_v2_backbone_presets +from keras_cv.models.backbones.video_swin import video_swin_backbone_presets from keras_cv.models.backbones.vit_det import vit_det_backbone_presets from keras_cv.models.object_detection.yolo_v8 import yolo_v8_backbone_presets -from keras_cv.models.backbones.video_swin import video_swin_backbone_presets backbone_presets_no_weights = { **resnet_v1_backbone_presets.backbone_presets_no_weights, diff --git a/keras_cv/models/backbones/video_swin/video_swin_aliases.py b/keras_cv/models/backbones/video_swin/video_swin_aliases.py index ca31cac84e..56db9ca743 100644 --- a/keras_cv/models/backbones/video_swin/video_swin_aliases.py +++ b/keras_cv/models/backbones/video_swin/video_swin_aliases.py @@ -13,11 +13,14 @@ # limitations under the License. import copy + from keras_cv.models.backbones.video_swin.video_swin_backbone import ( VideoSwinBackbone, ) +from keras_cv.models.backbones.video_swin.video_swin_backbone_presets import ( + backbone_presets, +) from keras_cv.utils.python_utils import classproperty -from keras_cv.models.backbones.video_swin.video_swin_backbone_presets import backbone_presets ALIAS_DOCSTRING = """VideoSwin{size}Backbone model. 
diff --git a/keras_cv/models/backbones/video_swin/video_swin_backbone.py b/keras_cv/models/backbones/video_swin/video_swin_backbone.py
index baf44161cc..a6c0868699 100644
--- a/keras_cv/models/backbones/video_swin/video_swin_backbone.py
+++ b/keras_cv/models/backbones/video_swin/video_swin_backbone.py
@@ -16,10 +16,10 @@
 import numpy as np
 from keras import layers
 
-from keras_cv.backend import ops
 
 from keras_cv.api_export import keras_cv_export
 from keras_cv.backend import keras
+from keras_cv.backend import ops
 from keras_cv.layers.video_swin_layers import VideoSwinBasicLayer
 from keras_cv.layers.video_swin_layers import VideoSwinPatchingAndEmbedding
 from keras_cv.layers.video_swin_layers import VideoSwinPatchMerging
@@ -33,7 +33,7 @@ class VideoSwinBackbone(Backbone):
 
     Args:
         input_shape (tuple[int], optional): The size of the input video in
-            `(depth, height, width, channel)` format. 
+            `(depth, height, width, channel)` format.
             Defaults to `(32, 224, 224, 3)`.
         input_tensor (KerasTensor, optional): Output of
@@ -51,18 +51,18 @@ class VideoSwinBackbone(Backbone):
         num_heads (tuple[int]): Number of attention head of each stage.
             Default to [3, 6, 12, 24]
         window_size (int): Window size. Default to [8, 7, 7].
-        mlp_ratio (float): Ratio of mlp hidden dim to embedding dim. 
+        mlp_ratio (float): Ratio of mlp hidden dim to embedding dim.
             Default to 4.
-        qkv_bias (bool): If True, add a learnable bias to query, key, value. 
+        qkv_bias (bool): If True, add a learnable bias to query, key, value.
             Default to True.
         qk_scale (float): Override default qk scale of head_dim ** -0.5 if set.
             Default to None.
         drop_rate (float): Dropout rate.
         attn_drop_rate (float): Attention dropout rate. Default: 0.
         drop_path_rate (float): Stochastic depth rate. Default: 0.2.
-        patch_norm (bool): If True, add normalization after patch embedding. 
+        patch_norm (bool): If True, add normalization after patch embedding.
             Default to False.
-    
+
     References:
         - [Video Swin Transformer](https://arxiv.org/abs/2106.13230)
         - [Official Code](https://github.com/SwinTransformer/Video-Swin-Transformer)
@@ -94,7 +94,11 @@ def __init__(
         )
 
         # Check that the input video is well specified.
-        if input_spec.shape[-4] is None or input_spec.shape[-3] is None or input_spec.shape[-2] is None:
+        if (
+            input_spec.shape[-4] is None
+            or input_spec.shape[-3] is None
+            or input_spec.shape[-2] is None
+        ):
             raise ValueError(
                 "Depth, height and width of the video must be specified"
                 " in `input_shape`."
@@ -112,11 +116,11 @@ def __init__(
             # Use common rescaling strategy across keras_cv
             x = keras.layers.Rescaling(1.0 / 255.0)(x)
 
-            # Video Swin scales inputs based on the standard ImageNet mean/stddev.
-            # Officially, Video Swin takes tensors in the [0-255] range.
-            # It uses mean=[123.675, 116.28, 103.53] and
-            # std=[58.395, 57.12, 57.375] for normalization.
-            # So, if include_rescaling is set to True, then, to match with the
+            # VideoSwin scales inputs based on the ImageNet mean/stddev.
+            # Officially, Video Swin takes tensors in the [0-255] range.
+            # It uses mean=[123.675, 116.28, 103.53] and
+            # std=[58.395, 57.12, 57.375] for normalization.
+            # So, if include_rescaling is set to True, then, to match with the
             # official scores, the following normalization should be added.
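The constants quoted in the comment block above are mutually consistent: dividing the official [0-255] mean/std by 255 yields exactly the [0-1] constants hard-coded after the `Rescaling` layer. A quick numpy check:

```python
import numpy as np

# Official Video Swin normalization constants in the [0-255] domain.
mean_255 = np.array([123.675, 116.28, 103.53])
std_255 = np.array([58.395, 57.12, 57.375])

# Dividing by 255 recovers the constants used after Rescaling(1/255).
print(mean_255 / 255.0)  # [0.485 0.456 0.406]
print(std_255 / 255.0)   # [0.229 0.224 0.225]
```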
x = (x - ops.array([0.485, 0.456, 0.406], dtype=x.dtype)) / ( ops.array([0.229, 0.224, 0.225], dtype=x.dtype) @@ -147,9 +151,9 @@ def __init__( attn_drop_rate=attn_drop_rate, drop_path_rate=dpr[sum(depths[:i]) : sum(depths[: i + 1])], norm_layer=norm_layer, - downsample=VideoSwinPatchMerging - if (i < num_layers - 1) - else None, + downsample=( + VideoSwinPatchMerging if (i < num_layers - 1) else None + ), name=f"videoswin_basic_layer_{i + 1}", ) x = layer(x) diff --git a/keras_cv/models/classification/video_classifier_presets.py b/keras_cv/models/classification/video_classifier_presets.py index 2a8447bd20..384373c1f9 100644 --- a/keras_cv/models/classification/video_classifier_presets.py +++ b/keras_cv/models/classification/video_classifier_presets.py @@ -16,9 +16,7 @@ classifier_presets = { "videoswin_tiny_kinetics_classifier": { "metadata": { - "description": ( - "videoswin_tiny_kinetics " # TODO: update - ), + "description": ("videoswin_tiny_kinetics "), # TODO: update "params": 25_613_800, "official_name": "VideoClassifier", "path": "video_classifier", @@ -26,20 +24,16 @@ }, "videoswin_small_kinetics_classifier": { "metadata": { - "description": ( - "videoswin_small_kinetics " # TODO: update - ), - "params": 25_613_800, # TODO: update + "description": ("videoswin_small_kinetics "), # TODO: update + "params": 25_613_800, # TODO: update "official_name": "VideoClassifier", "path": "video_classifier", }, }, "videoswin_base_kinetics_classifier": { "metadata": { - "description": ( - "videoswin_base_kinetics " # TODO: update - ), - "params": 25_613_800, # TODO: update + "description": ("videoswin_base_kinetics "), # TODO: update + "params": 25_613_800, # TODO: update "official_name": "VideoClassifier", "path": "video_classifier", }, @@ -47,11 +41,11 @@ "videoswin_base_something_something_v2_classifier": { "metadata": { "description": ( - "videoswin_base_something_something_v2 " # TODO: update + "videoswin_base_something_something_v2 " # TODO: update ), - "params": 25_613_800, # TODO: update + "params": 25_613_800, # TODO: update "official_name": "VideoClassifier", "path": "video_classifier", }, }, -} \ No newline at end of file +} From c66673c8e3f7e8537ab72271885fb85de183b9a5 Mon Sep 17 00:00:00 2001 From: innat Date: Sun, 3 Mar 2024 00:04:55 +0600 Subject: [PATCH 38/94] fix: linting issue --- keras_cv/layers/__init__.py | 6 +++--- keras_cv/layers/video_swin_layers_test.py | 12 ++++++------ keras_cv/models/classification/video_classifier.py | 9 +++++---- 3 files changed, 14 insertions(+), 13 deletions(-) diff --git a/keras_cv/layers/__init__.py b/keras_cv/layers/__init__.py index 957f5eda3c..ae4f6007f5 100644 --- a/keras_cv/layers/__init__.py +++ b/keras_cv/layers/__init__.py @@ -135,12 +135,12 @@ ) from keras_cv.layers.spatial_pyramid import SpatialPyramidPooling from keras_cv.layers.transformer_encoder import TransformerEncoder +from keras_cv.layers.video_swin_layers import VideoSwinBasicLayer +from keras_cv.layers.video_swin_layers import VideoSwinPatchingAndEmbedding +from keras_cv.layers.video_swin_layers import VideoSwinPatchMerging from keras_cv.layers.vit_det_layers import AddRelativePositionalEmbedding from keras_cv.layers.vit_det_layers import MultiHeadAttentionWithRelativePE from keras_cv.layers.vit_det_layers import ViTDetPatchingAndEmbedding from keras_cv.layers.vit_det_layers import WindowedTransformerEncoder from keras_cv.layers.vit_det_layers import WindowPartitioning from keras_cv.layers.vit_layers import PatchingAndEmbedding -from keras_cv.layers.video_swin_layers import 
VideoSwinPatchingAndEmbedding -from keras_cv.layers.video_swin_layers import VideoSwinBasicLayer -from keras_cv.layers.video_swin_layers import VideoSwinPatchMerging \ No newline at end of file diff --git a/keras_cv/layers/video_swin_layers_test.py b/keras_cv/layers/video_swin_layers_test.py index 951d46d159..cfebaabb3b 100644 --- a/keras_cv/layers/video_swin_layers_test.py +++ b/keras_cv/layers/video_swin_layers_test.py @@ -15,8 +15,8 @@ from keras_cv.backend import ops from keras_cv.layers.video_swin_layers import VideoSwinPatchingAndEmbedding -from keras_cv.layers.video_swin_layers import VideoSwinWindowAttention from keras_cv.layers.video_swin_layers import VideoSwinPatchMerging +from keras_cv.layers.video_swin_layers import VideoSwinWindowAttention from keras_cv.tests.test_case import TestCase @@ -80,10 +80,10 @@ def test_output_shape(self): input_tensor = ops.ones(*input_shape) output_shape = self.patch_merging(input_tensor).shape expected_shape = ( - input_shape[0], - input_shape[1], - input_shape[2] // 2, - input_shape[3] // 2, - input_shape[4] * 4 + input_shape[0], + input_shape[1], + input_shape[2] // 2, + input_shape[3] // 2, + input_shape[4] * 4, ) self.assertEqual(output_shape, expected_shape) diff --git a/keras_cv/models/classification/video_classifier.py b/keras_cv/models/classification/video_classifier.py index 2d5b7f61ea..6313c76977 100644 --- a/keras_cv/models/classification/video_classifier.py +++ b/keras_cv/models/classification/video_classifier.py @@ -17,8 +17,6 @@ from keras_cv.api_export import keras_cv_export from keras_cv.backend import keras -from keras_cv.models.task import Task -from keras_cv.utils.python_utils import classproperty from keras_cv.models.backbones.backbone_presets import backbone_presets from keras_cv.models.backbones.backbone_presets import ( backbone_presets_with_weights, @@ -26,6 +24,9 @@ from keras_cv.models.classification.video_classifier_presets import ( classifier_presets, ) +from keras_cv.models.task import Task +from keras_cv.utils.python_utils import classproperty + @keras_cv_export( [ @@ -98,7 +99,7 @@ def __init__( num_classes, activation=activation, name="predictions", - dtype='float32' + dtype="float32", )(x) # Instantiate using Functional API Model constructor @@ -143,4 +144,4 @@ def presets_with_weights(cls): def backbone_presets(cls): """Dictionary of preset names and configurations of compatible backbones.""" - return copy.deepcopy(backbone_presets) \ No newline at end of file + return copy.deepcopy(backbone_presets) From 84d4e03880622220c7862d1eb0b621e9858a75f8 Mon Sep 17 00:00:00 2001 From: innat Date: Sun, 3 Mar 2024 00:12:45 +0600 Subject: [PATCH 39/94] fix: linting issue --- keras_cv/layers/video_swin_layers.py | 419 ++++++++++++++------------- 1 file changed, 221 insertions(+), 198 deletions(-) diff --git a/keras_cv/layers/video_swin_layers.py b/keras_cv/layers/video_swin_layers.py index b3eb10d5ed..bf6bcaf960 100644 --- a/keras_cv/layers/video_swin_layers.py +++ b/keras_cv/layers/video_swin_layers.py @@ -170,12 +170,7 @@ class MLP(layers.Layer): """ # noqa: E501 def __init__( - self, - hidden_dim, - output_dim, - drop_rate=0.0, - activation='gelu', - **kwargs + self, hidden_dim, output_dim, drop_rate=0.0, activation="gelu", **kwargs ): super().__init__(**kwargs) self.output_dim = output_dim @@ -186,7 +181,7 @@ def __init__( self.fc1 = layers.Dense(self.hidden_dim) self.fc2 = layers.Dense(self.output_dim) self.dropout = layers.Dropout(self.drop_rate) - + def build(self, input_shape): self.fc1.build(input_shape) 
self.fc2.build((*input_shape[1:-1], self.hidden_dim)) @@ -199,19 +194,19 @@ def call(self, x, training=None): x = self.fc2(x) x = self.dropout(x, training=training) return x - + def get_config(self): config = super().get_config() config.update( { - "output_dim": self.output_dim, + "output_dim": self.output_dim, "hidden_dim": self.hidden_dim, "drop_rate": self.drop_rate, - 'activation': self._activation_identifier + "activation": self._activation_identifier, } ) return config - + @keras_cv_export( "keras_cv.layers.VideoSwinPatchingAndEmbedding", package="keras_cv.layers" @@ -228,13 +223,9 @@ class VideoSwinPatchingAndEmbedding(keras.Model): - [Video Swin Transformer](https://arxiv.org/abs/2106.13230) - [Video Swin Transformer GitHub](https://github.com/SwinTransformer/Video-Swin-Transformer) """ # noqa: E501 - + def __init__( - self, - patch_size=(2, 4, 4), - embed_dim=96, - norm_layer=None, - **kwargs + self, patch_size=(2, 4, 4), embed_dim=96, norm_layer=None, **kwargs ): super().__init__(**kwargs) self.patch_size = patch_size @@ -243,52 +234,50 @@ def __init__( def _compute_padding(self, dim, patch_size): pad_amount = patch_size - (dim % patch_size) - return [ - 0, pad_amount if pad_amount != patch_size else 0 - ] + return [0, pad_amount if pad_amount != patch_size else 0] def build(self, input_shape): self.pads = [ - [0, 0], + [0, 0], self._compute_padding(input_shape[1], self.patch_size[0]), self._compute_padding(input_shape[2], self.patch_size[1]), self._compute_padding(input_shape[3], self.patch_size[2]), - [0, 0] + [0, 0], ] - + self.proj = layers.Conv3D( - self.embed_dim, + self.embed_dim, kernel_size=self.patch_size, - strides=self.patch_size, - name='embed_proj' + strides=self.patch_size, + name="embed_proj", ) self.proj.build((None, None, None, None, input_shape[-1])) - + self.norm = None if self.norm_layer is not None: self.norm = self.norm_layer( - axis=-1, epsilon=1e-5, name='embed_norm' - ) - self.norm.build( - (None, None, None, None, self.embed_dim) + axis=-1, epsilon=1e-5, name="embed_norm" ) + self.norm.build((None, None, None, None, self.embed_dim)) self.built = True def call(self, x): x = ops.pad(x, self.pads) x = self.proj(x) - + if self.norm is not None: - x = self.norm(x) - + x = self.norm(x) + return x - + def compute_output_shape(self, input_shape): spatial_dims = [ (dim - self.patch_size[i]) // self.patch_size[i] + 1 for i, dim in enumerate(input_shape[1:-1]) ] - output_shape = (input_shape[0],) + tuple(spatial_dims) + (self.embed_dim,) + output_shape = ( + (input_shape[0],) + tuple(spatial_dims) + (self.embed_dim,) + ) return output_shape def get_config(self): @@ -300,27 +289,22 @@ def get_config(self): } ) return config - + class VideoSwinPatchMerging(layers.Layer): """Patch Merging Layer. Args: input_dim (int): Number of input channels. - norm_layer (keras.layers, optional): Normalization layer. + norm_layer (keras.layers, optional): Normalization layer. 
Default: LayerNormalization References: - [Video Swin Transformer](https://arxiv.org/abs/2106.13230) - [Video Swin Transformer GitHub](https://github.com/SwinTransformer/Video-Swin-Transformer) """ # noqa: E501 - - def __init__( - self, - input_dim, - norm_layer=None, - **kwargs - ): + + def __init__(self, input_dim, norm_layer=None, **kwargs): super().__init__(**kwargs) self.input_dim = input_dim self.norm_layer = norm_layer @@ -328,16 +312,21 @@ def __init__( def build(self, input_shape): batch_size, depth, height, width, channel = input_shape self.reduction = layers.Dense(2 * self.input_dim, use_bias=False) - self.reduction.build((batch_size, depth, height // 2, width // 2, 4 * channel)) + self.reduction.build( + (batch_size, depth, height // 2, width // 2, 4 * channel) + ) self.norm = self.norm_layer(axis=-1, epsilon=1e-5) - self.norm.build((batch_size, depth, height // 2, width // 2, 4 * channel)) - self.built=True - + self.norm.build( + (batch_size, depth, height // 2, width // 2, 4 * channel) + ) + self.built = True + def call(self, x): - """ The call function. + """The call function. Args: - x: Input feature, shape: (batch_size, depth, height, width, channel). + x: Input feature, + shape: (batch_size, depth, height, width, channel). """ input_shape = ops.shape(x) height, width = ( @@ -347,11 +336,11 @@ def call(self, x): # padding if needed paddings = [ - [0, 0], - [0, 0], - [0, ops.mod(height, 2)], - [0, ops.mod(width, 2)], - [0, 0] + [0, 0], + [0, 0], + [0, ops.mod(height, 2)], + [0, ops.mod(width, 2)], + [0, 0], ] x = ops.pad(x, paddings) x0 = x[:, :, 0::2, 0::2, :] # B D H/2 W/2 C @@ -362,7 +351,7 @@ def call(self, x): x = self.norm(x) x = self.reduction(x) return x - + def get_config(self): config = super().get_config() config.update( @@ -371,7 +360,7 @@ def get_config(self): } ) return config - + class VideoSwinWindowAttention(keras.Model): """Window based multi-head self attention (W-MSA) module with relative position bias. 
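
The relative position bias used by VideoSwinWindowAttention learns one bias slot per distinct 3D offset between two tokens of a window: a table of (2*D-1)*(2*H-1)*(2*W-1) entries per head, addressed through a precomputed index. The following is a minimal standalone NumPy sketch of that indexing trick; the window shape and the `indexing="ij"` formulation are illustrative choices, not taken from the patch.

import numpy as np

# Example window; the tests in this series also use (2, 4, 4).
d, h, w = 2, 4, 4

# Coordinates of every token in one window, shape (3, d*h*w).
coords = np.stack(
    np.meshgrid(np.arange(d), np.arange(h), np.arange(w), indexing="ij")
).reshape(3, -1)

# Pairwise offsets between tokens, shifted so each axis starts at 0.
rel = (coords[:, :, None] - coords[:, None, :]).transpose(1, 2, 0)
rel[..., 0] += d - 1
rel[..., 1] += h - 1
rel[..., 2] += w - 1

# Flatten each (dz, dy, dx) offset into a single bias-table index.
index = (
    rel[..., 0] * (2 * h - 1) * (2 * w - 1)
    + rel[..., 1] * (2 * w - 1)
    + rel[..., 2]
)

n_tokens = d * h * w
table_size = (2 * d - 1) * (2 * h - 1) * (2 * w - 1)
assert index.shape == (n_tokens, n_tokens)
assert index.min() == 0 and index.max() == table_size - 1

Token pairs with the same spatial offset share the same bias entry, which is why one small learnable table can serve every window in the clip.
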
@@ -389,17 +378,17 @@ class VideoSwinWindowAttention(keras.Model): - [Video Swin Transformer](https://arxiv.org/abs/2106.13230) - [Video Swin Transformer GitHub](https://github.com/SwinTransformer/Video-Swin-Transformer) """ # noqa: E501 - + def __init__( - self, - input_dim, - window_size, - num_heads, - qkv_bias=True, - qk_scale=None, - attn_drop_rate=0., - proj_drop_rate=0., - **kwargs + self, + input_dim, + window_size, + num_heads, + qkv_bias=True, + qk_scale=None, + attn_drop_rate=0.0, + proj_drop_rate=0.0, + **kwargs, ): super().__init__(**kwargs) # variables @@ -408,31 +397,43 @@ def __init__( self.num_heads = num_heads head_dim = input_dim // num_heads self.qk_scale = qk_scale - self.scale = qk_scale or head_dim ** -0.5 + self.scale = qk_scale or head_dim**-0.5 self.qkv_bias = qkv_bias self.attn_drop_rate = attn_drop_rate self.proj_drop_rate = proj_drop_rate - def get_relative_position_index(self, window_depth, window_height, window_width): + def get_relative_position_index( + self, window_depth, window_height, window_width + ): y_y, z_z, x_x = ops.meshgrid( - ops.arange(window_width), ops.arange(window_depth), ops.arange(window_height) + ops.arange(window_width), + ops.arange(window_depth), + ops.arange(window_height), ) coords = ops.stack([z_z, y_y, x_x], axis=0) coords_flatten = ops.reshape(coords, [3, -1]) - relative_coords = coords_flatten[:, :, None] - coords_flatten[:, None, :] + relative_coords = ( + coords_flatten[:, :, None] - coords_flatten[:, None, :] + ) relative_coords = ops.transpose(relative_coords, axes=[1, 2, 0]) - z_z = (relative_coords[:, :, 0] + window_depth - 1) * (2 * window_height - 1) * (2 * window_width - 1) - x_x = (relative_coords[:, :, 1] + window_height - 1) * (2 * window_width - 1) - y_y = (relative_coords[:, :, 2] + window_width - 1) + z_z = ( + (relative_coords[:, :, 0] + window_depth - 1) + * (2 * window_height - 1) + * (2 * window_width - 1) + ) + x_x = (relative_coords[:, :, 1] + window_height - 1) * ( + 2 * window_width - 1 + ) + y_y = relative_coords[:, :, 2] + window_width - 1 relative_coords = ops.stack([z_z, x_x, y_y], axis=-1) return ops.sum(relative_coords, axis=-1) def build(self, input_shape): self.relative_position_bias_table = self.add_weight( shape=( - (2 * self.window_size[0] - 1) * - (2 * self.window_size[1] - 1) * - (2 * self.window_size[2] - 1), + (2 * self.window_size[0] - 1) + * (2 * self.window_size[1] - 1) + * (2 * self.window_size[2] - 1), self.num_heads, ), initializer="zeros", @@ -442,14 +443,14 @@ def build(self, input_shape): self.relative_position_index = self.get_relative_position_index( self.window_size[0], self.window_size[1], self.window_size[2] ) - + # layers self.qkv = layers.Dense(self.input_dim * 3, use_bias=self.qkv_bias) self.attn_drop = layers.Dropout(self.attn_drop_rate) self.proj = layers.Dense(self.input_dim) self.proj_drop = layers.Dropout(self.proj_drop_rate) self.qkv.build(input_shape) - self.proj.build(input_shape) + self.proj.build(input_shape) self.built = True def call(self, x, mask=None, training=None): @@ -459,30 +460,43 @@ def call(self, x, mask=None, training=None): input_shape[1], input_shape[2], ) - + qkv = self.qkv(x) - qkv = ops.reshape(qkv, [batch_size, depth, 3, self.num_heads, channel // self.num_heads]) + qkv = ops.reshape( + qkv, + [batch_size, depth, 3, self.num_heads, channel // self.num_heads], + ) qkv = ops.transpose(qkv, [2, 0, 3, 1, 4]) q, k, v = ops.split(qkv, 3, axis=0) q = ops.squeeze(q, axis=0) * self.scale k = ops.squeeze(k, axis=0) v = ops.squeeze(v, axis=0) attn = 
ops.matmul(q, ops.transpose(k, [0, 1, 3, 2])) - + rel_pos_bias = ops.take( - self.relative_position_bias_table, self.relative_position_index[:depth, :depth] + self.relative_position_bias_table, + self.relative_position_index[:depth, :depth], ) rel_pos_bias = ops.reshape(rel_pos_bias, [depth, depth, -1]) rel_pos_bias = ops.transpose(rel_pos_bias, [2, 0, 1]) attn = attn + rel_pos_bias[None, ...] - + if mask is not None: mask_size = ops.shape(mask)[0] mask = ops.cast(mask, dtype=attn.dtype) - attn = ops.reshape( - attn, - [batch_size // mask_size, mask_size, self.num_heads, depth, depth] - ) + mask[:, None, :, :] + attn = ( + ops.reshape( + attn, + [ + batch_size // mask_size, + mask_size, + self.num_heads, + depth, + depth, + ], + ) + + mask[:, None, :, :] + ) attn = ops.reshape(attn, [-1, self.num_heads, depth, depth]) attn = keras.activations.softmax(attn, axis=-1) @@ -493,7 +507,7 @@ def call(self, x, mask=None, training=None): x = self.proj(x) x = self.proj_drop(x, training=training) return x - + def get_config(self): config = super().get_config() config.update( @@ -508,7 +522,7 @@ def get_config(self): } ) return config - + class VideoSwinBasicLayer(keras.Model): """A basic Swin Transformer layer for one stage. @@ -531,22 +545,22 @@ class VideoSwinBasicLayer(keras.Model): - [Video Swin Transformer](https://arxiv.org/abs/2106.13230) - [Video Swin Transformer GitHub](https://github.com/SwinTransformer/Video-Swin-Transformer) """ # noqa: E501 - + def __init__( self, input_dim, depth, num_heads, - window_size=(1,7,7), - mlp_ratio=4., + window_size=(1, 7, 7), + mlp_ratio=4.0, qkv_bias=False, qk_scale=None, - drop_rate=0., - attn_drop_rate=0., - drop_path_rate=0., + drop_rate=0.0, + attn_drop_rate=0.0, + drop_path_rate=0.0, norm_layer=None, downsample=None, - **kwargs + **kwargs, ): super().__init__(**kwargs) self.input_dim = input_dim @@ -562,15 +576,14 @@ def __init__( self.drop_path_rate = drop_path_rate self.norm_layer = norm_layer self.downsample = downsample - + def _compute_dim_padded(self, input_dim, window_dim_size): input_dim = ops.cast(input_dim, dtype="float32") window_dim_size = ops.cast(window_dim_size, dtype="float32") return ops.cast( - ops.ceil(input_dim / window_dim_size) * window_dim_size, - "int32" + ops.ceil(input_dim / window_dim_size) * window_dim_size, "int32" ) - + def build(self, input_shape): window_size, shift_size = get_window_size( input_shape[1:-1], self.window_size, self.shift_size @@ -581,20 +594,24 @@ def build(self, input_shape): self.attn_mask = compute_mask( depth_pad, height_pad, width_pad, window_size, shift_size ) - + # build blocks self.blocks = [ VideoSwinTransformerBlock( self.input_dim, num_heads=self.num_heads, window_size=self.window_size, - shift_size=(0,0,0) if (i % 2 == 0) else self.shift_size, + shift_size=(0, 0, 0) if (i % 2 == 0) else self.shift_size, mlp_ratio=self.mlp_ratio, qkv_bias=self.qkv_bias, qk_scale=self.qk_scale, drop_rate=self.drop_rate, attn_drop_rate=self.attn_drop_rate, - drop_path_rate=self.drop_path_rate[i] if isinstance(self.drop_path_rate, list) else self.drop_path_rate, + drop_path_rate=( + self.drop_path_rate[i] + if isinstance(self.drop_path_rate, list) + else self.drop_path_rate + ), norm_layer=self.norm_layer, ) for i in range(self.depth) @@ -605,13 +622,12 @@ def build(self, input_shape): input_dim=self.input_dim, norm_layer=self.norm_layer ) self.downsample.build(input_shape) - + for i in range(self.depth): self.blocks[i].build(input_shape) - + self.built = True - - + def compute_output_shape(self, input_shape): 
window_size, _ = get_window_size( input_shape[1:-1], self.window_size, self.shift_size @@ -619,19 +635,23 @@ def compute_output_shape(self, input_shape): depth_pad = self._compute_dim_padded(input_shape[1], window_size[0]) height_pad = self._compute_dim_padded(input_shape[2], window_size[1]) width_pad = self._compute_dim_padded(input_shape[3], window_size[2]) - + if self.downsample is not None: output_shape = ( - input_shape[0], depth_pad, height_pad // 2, width_pad // 2, 2*self.input_dim + input_shape[0], + depth_pad, + height_pad // 2, + width_pad // 2, + 2 * self.input_dim, ) return output_shape - + return input_shape def call(self, x, training=None): input_shape = ops.shape(x) - batch_size, depth, height, width, channel = ( - input_shape[0], + batch_size, depth, height, width, _ = ( + input_shape[0], input_shape[1], input_shape[2], input_shape[3], @@ -639,22 +659,15 @@ def call(self, x, training=None): ) for block in self.blocks: - x = block( - x, - self.attn_mask, - training=training - ) + x = block(x, self.attn_mask, training=training) + + x = ops.reshape(x, [batch_size, depth, height, width, -1]) - x = ops.reshape( - x, [batch_size, depth, height, width, -1] - ) - if self.downsample is not None: x = self.downsample(x) - + return x - - + def get_config(self): config = super().get_config() config.update( @@ -669,11 +682,11 @@ def get_config(self): "qk_scale": self.qk_scale, "drop": self.drop, "attn_drop": self.attn_drop, - "drop_path": self.drop_path + "drop_path": self.drop_path, } ) return config - + @keras_cv_export( "keras_cv.layers.VideoSwinTransformerBlock", package="keras_cv.layers" @@ -699,22 +712,22 @@ class VideoSwinTransformerBlock(keras.Model): - [Video Swin Transformer](https://arxiv.org/abs/2106.13230) - [Video Swin Transformer GitHub](https://github.com/SwinTransformer/Video-Swin-Transformer) """ # noqa: E501 - + def __init__( - self, - input_dim, - num_heads, - window_size=(2, 7, 7), + self, + input_dim, + num_heads, + window_size=(2, 7, 7), shift_size=(0, 0, 0), - mlp_ratio=4., - qkv_bias=True, - qk_scale=None, - drop_rate=0., - attn_drop_rate=0., - drop_path_rate=0., - activation='gelu', - norm_layer=layers.LayerNormalization, - **kwargs + mlp_ratio=4.0, + qkv_bias=True, + qk_scale=None, + drop_rate=0.0, + attn_drop_rate=0.0, + drop_path_rate=0.0, + activation="gelu", + norm_layer=layers.LayerNormalization, + **kwargs, ): super().__init__(**kwargs) # variables @@ -732,55 +745,62 @@ def __init__( self.norm_layer = norm_layer self._activation_identifier = activation - for i, (shift, window) in enumerate(zip(self.shift_size, self.window_size)): + for i, (shift, window) in enumerate( + zip(self.shift_size, self.window_size) + ): if not (0 <= shift < window): # TODO: Add more description. raise ValueError( - f"shift_size[{i}] must be in the range 0 to window_size[{i}]" + f"shift_size[{i}] must be in the " + "range 0 to window_size[{i}]" ) def build(self, input_shape): self.window_size, self.shift_size = get_window_size( input_shape[1:-1], self.window_size, self.shift_size ) - + self.apply_cyclic_shift = False if any(i > 0 for i in self.shift_size): self.apply_cyclic_shift = True - + # layers - self.drop_path = DropPath(self.drop_path_rate) if self.drop_path_rate > 0. 
else layers.Identity() - + self.drop_path = ( + DropPath(self.drop_path_rate) + if self.drop_path_rate > 0.0 + else layers.Identity() + ) + self.norm1 = self.norm_layer(axis=-1, epsilon=1e-05) self.norm1.build(input_shape) - + self.attn = VideoSwinWindowAttention( - self.input_dim, - window_size=self.window_size, - num_heads=self.num_heads, - qkv_bias=self.qkv_bias, - qk_scale=self.qk_scale, + self.input_dim, + window_size=self.window_size, + num_heads=self.num_heads, + qkv_bias=self.qkv_bias, + qk_scale=self.qk_scale, attn_drop_rate=self.attn_drop_rate, - proj_drop_rate=self.drop_rate + proj_drop_rate=self.drop_rate, ) self.attn.build((None, None, self.input_dim)) - + self.norm2 = self.norm_layer(axis=-1, epsilon=1e-05) self.norm2.build((*input_shape[1:-1], self.input_dim)) - + self.mlp = MLP( - output_dim=self.input_dim, - hidden_dim=self.mlp_hidden_dim, + output_dim=self.input_dim, + hidden_dim=self.mlp_hidden_dim, activation=self._activation_identifier, - drop_rate=self.drop_rate + drop_rate=self.drop_rate, ) self.mlp.build((*input_shape[1:-1], self.input_dim)) self.built = True - + def first_forward(self, x, mask_matrix, training): input_shape = ops.shape(x) - batch_size, depth, height, width, channel = ( - input_shape[0], + batch_size, depth, height, width, _ = ( + input_shape[0], input_shape[1], input_shape[2], input_shape[3], @@ -788,66 +808,73 @@ def first_forward(self, x, mask_matrix, training): ) window_size, shift_size = self.window_size, self.shift_size x = self.norm1(x) - + # pad feature maps to multiples of window size - pad_l = pad_t = pad_d0 = 0 + pad_l = pad_t = pad_d0 = 0 pad_d1 = ops.mod(-depth + window_size[0], window_size[0]) - pad_b = ops.mod(-height + window_size[1], window_size[1]) - pad_r = ops.mod(-width + window_size[2], window_size[2]) - paddings = [[0, 0], [pad_d0, pad_d1], [pad_t, pad_b], [pad_l, pad_r], [0, 0]] + pad_b = ops.mod(-height + window_size[1], window_size[1]) + pad_r = ops.mod(-width + window_size[2], window_size[2]) + paddings = [ + [0, 0], + [pad_d0, pad_d1], + [pad_t, pad_b], + [pad_l, pad_r], + [0, 0], + ] x = ops.pad(x, paddings) - + input_shape = ops.shape(x) - depth_pad, height_pad, width_pad = ( + depth_pad, height_pad, width_pad = ( input_shape[1], input_shape[2], input_shape[3], ) - + # cyclic shift if self.apply_cyclic_shift: shifted_x = ops.roll( - x, - shift=(-shift_size[0], -shift_size[1], -shift_size[2]), - axis=(1, 2, 3) + x, + shift=(-shift_size[0], -shift_size[1], -shift_size[2]), + axis=(1, 2, 3), ) attn_mask = mask_matrix else: shifted_x = x attn_mask = None - + # partition windows - x_windows = window_partition(shifted_x, window_size) - + x_windows = window_partition(shifted_x, window_size) + # get attentions params - attn_windows = self.attn( - x_windows, mask=attn_mask, training=training - ) + attn_windows = self.attn(x_windows, mask=attn_mask, training=training) # reverse the swin windows shifted_x = window_reverse( - attn_windows, window_size, batch_size, depth_pad, height_pad, width_pad - ) + attn_windows, + window_size, + batch_size, + depth_pad, + height_pad, + width_pad, + ) # reverse cyclic shift if self.apply_cyclic_shift: x = ops.roll( - shifted_x, - shift=(shift_size[0], shift_size[1], shift_size[2]), - axis=(1, 2, 3) + shifted_x, + shift=(shift_size[0], shift_size[1], shift_size[2]), + axis=(1, 2, 3), ) else: x = shifted_x - # pad if required + # pad if required do_pad = ops.logical_or( ops.greater(pad_d1, 0), - ops.logical_or(ops.greater(pad_r, 0), ops.greater(pad_b, 0)) + ops.logical_or(ops.greater(pad_r, 0), 
ops.greater(pad_b, 0)), ) x = ops.cond( - do_pad, - lambda: x[:, :depth, :height, :width, :], - lambda: x + do_pad, lambda: x[:, :depth, :height, :width, :], lambda: x ) return x @@ -857,17 +884,14 @@ def second_forward(self, x, training): x = self.mlp(x) x = self.drop_path(x, training=training) return x - def call(self, x, mask_matrix=None, training=None): shortcut = x - x = self.first_forward( - x, mask_matrix, training - ) + x = self.first_forward(x, mask_matrix, training) x = shortcut + self.drop_path(x) x = x + self.second_forward(x, training) return x - + def get_config(self): config = super().get_config() config.update( @@ -879,12 +903,11 @@ def get_config(self): "mlp_ratio": self.mlp_ratio, "qkv_bias": self.qkv_bias, "qk_scale": self.qk_scale, - "drop_rate": self.drop_rate, + "drop_rate": self.drop_rate, "attn_drop_rate": self.attn_drop_rate, "drop_path_rate": self.drop_path_rate, "mlp_hidden_dim": self.mlp_hidden_dim, - "activation": self._activation_identifier + "activation": self._activation_identifier, } ) return config - \ No newline at end of file From d126b7c9693e6a83ffee3c3eb7e65204cf3896c4 Mon Sep 17 00:00:00 2001 From: innat Date: Sun, 3 Mar 2024 15:24:45 +0600 Subject: [PATCH 40/94] fix: video swin layer test cases" --- keras_cv/layers/video_swin_layers.py | 16 +++++++++++----- keras_cv/layers/video_swin_layers_test.py | 7 ++++--- 2 files changed, 15 insertions(+), 8 deletions(-) diff --git a/keras_cv/layers/video_swin_layers.py b/keras_cv/layers/video_swin_layers.py index bf6bcaf960..17cb92ebd3 100644 --- a/keras_cv/layers/video_swin_layers.py +++ b/keras_cv/layers/video_swin_layers.py @@ -315,10 +315,13 @@ def build(self, input_shape): self.reduction.build( (batch_size, depth, height // 2, width // 2, 4 * channel) ) - self.norm = self.norm_layer(axis=-1, epsilon=1e-5) - self.norm.build( - (batch_size, depth, height // 2, width // 2, 4 * channel) - ) + + self.norm = None + if self.norm_layer is not None: + self.norm = self.norm_layer(axis=-1, epsilon=1e-5) + self.norm.build( + (batch_size, depth, height // 2, width // 2, 4 * channel) + ) self.built = True def call(self, x): @@ -348,7 +351,10 @@ def call(self, x): x2 = x[:, :, 0::2, 1::2, :] # B D H/2 W/2 C x3 = x[:, :, 1::2, 1::2, :] # B D H/2 W/2 C x = ops.concatenate([x0, x1, x2, x3], axis=-1) # B D H/2 W/2 4*C - x = self.norm(x) + + if self.norm is not None: + x = self.norm(x) + x = self.reduction(x) return x diff --git a/keras_cv/layers/video_swin_layers_test.py b/keras_cv/layers/video_swin_layers_test.py index cfebaabb3b..862c2e3a77 100644 --- a/keras_cv/layers/video_swin_layers_test.py +++ b/keras_cv/layers/video_swin_layers_test.py @@ -44,6 +44,7 @@ class TestVideoSwinWindowAttention(TestCase): def setUp(self): self.window_attention_model = VideoSwinWindowAttention( + input_dim=32, window_size=(2, 4, 4), num_heads=8, qkv_bias=True, @@ -53,7 +54,7 @@ def setUp(self): ) def test_window_attention_output_shape(self): - input_shape = (4, 10, 256) + input_shape = (2, 16, 32) input_array = ops.ones(input_shape) output_shape = self.window_attention_model(input_array).shape expected_output_shape = input_shape @@ -77,13 +78,13 @@ def setUp(self): def test_output_shape(self): input_shape = (2, 4, 32, 32, 3) - input_tensor = ops.ones(*input_shape) + input_tensor = ops.ones(input_shape) output_shape = self.patch_merging(input_tensor).shape expected_shape = ( input_shape[0], input_shape[1], input_shape[2] // 2, input_shape[3] // 2, - input_shape[4] * 4, + 2 * 32, ) self.assertEqual(output_shape, expected_shape) From 
61303be1a21b4647e5b3735f62decd1e75aadb59 Mon Sep 17 00:00:00 2001 From: innat Date: Sun, 3 Mar 2024 16:19:02 +0600 Subject: [PATCH 41/94] add: video swin backbone test --- .../video_swin/video_swin_backbone_test.py | 60 +++++++++++++++++++ 1 file changed, 60 insertions(+) create mode 100644 keras_cv/models/backbones/video_swin/video_swin_backbone_test.py diff --git a/keras_cv/models/backbones/video_swin/video_swin_backbone_test.py b/keras_cv/models/backbones/video_swin/video_swin_backbone_test.py new file mode 100644 index 0000000000..f3b99be0b5 --- /dev/null +++ b/keras_cv/models/backbones/video_swin/video_swin_backbone_test.py @@ -0,0 +1,60 @@ +# Copyright 2023 The KerasCV Authors +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import os + +import numpy as np +import pytest + +from keras_cv.backend import keras +from keras_cv.backend import ops +from keras_cv.models.backbones.video_swin.video_swin_aliases import VideoSwinSBackbone +from keras_cv.tests.test_case import TestCase + +class TestViTDetBackbone(TestCase): + @pytest.mark.large + def test_call(self): + model = VideoSwinSBackbone() + x = np.ones((1, 32, 224, 224, 3)) + x_out = ops.convert_to_numpy(model(x)) + num_parameters = sum( + np.prod(tuple(x.shape)) for x in model.trainable_variables + ) + self.assertEqual(x_out.shape, (1, 16, 7, 7, 768)) + self.assertEqual(num_parameters, 49_509_078) + + @pytest.mark.extra_large + def teat_save(self): + # saving test + model = VideoSwinSBackbone() + x = np.ones((1, 32, 224, 224, 3)) + x_out = ops.convert_to_numpy(model(x)) + path = os.path.join(self.get_temp_dir(), "model.keras") + model.save(path) + loaded_model = keras.saving.load_model(path) + x_out_loaded = ops.convert_to_numpy(loaded_model(x)) + self.assertAllClose(x_out, x_out_loaded) + + @pytest.mark.extra_large + def test_fit(self): + model = VideoSwinSBackbone() + x = np.ones((1, 32, 224, 224, 3)) + y = np.zeros((1, 16, 7, 7, 768)) + model.compile(optimizer="adam", loss="mse", metrics=["mse"]) + model.fit(x, y, epochs=1) + + def test_pyramid_level_inputs_error(self): + model = VideoSwinSBackbone() + with self.assertRaises(NotImplementedError, msg="doesn't compute"): + model.pyramid_level_inputs From af5878cb79520ca4e18b952cc213022a50c354a3 Mon Sep 17 00:00:00 2001 From: innat Date: Sun, 3 Mar 2024 17:37:30 +0600 Subject: [PATCH 42/94] rm redundant code --- keras_cv/layers/video_swin_layers.py | 2 +- .../video_swin/video_swin_backbone_test.py | 5 ++++- .../classification/video_classifier_presets.py | 14 +++++++------- 3 files changed, 12 insertions(+), 9 deletions(-) diff --git a/keras_cv/layers/video_swin_layers.py b/keras_cv/layers/video_swin_layers.py index 17cb92ebd3..09766c95b2 100644 --- a/keras_cv/layers/video_swin_layers.py +++ b/keras_cv/layers/video_swin_layers.py @@ -97,7 +97,7 @@ def get_window_size(x_size, window_size, shift_size=None): " https://github.com/microsoft/Swin-Transformer - Args: + Args: x_size: input size. window_size: local window size. shift_size: window shifting size. 
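
The body of `get_window_size` is not shown in this hunk. In the official Video Swin implementation it cites, the helper clamps each window axis to the input size and drops the cyclic shift on any fully covered axis; the sketch below is written under that assumption and is not code from this series.

def get_window_size_sketch(x_size, window_size, shift_size=None):
    # Clamp the window to the input per axis; an axis fully covered by
    # one window cannot be cyclically shifted, so its shift becomes 0.
    use_window = list(window_size)
    use_shift = list(shift_size) if shift_size is not None else None
    for i, dim in enumerate(x_size):
        if dim <= window_size[i]:
            use_window[i] = dim
            if use_shift is not None:
                use_shift[i] = 0
    if use_shift is None:
        return tuple(use_window)
    return tuple(use_window), tuple(use_shift)

# An 8-frame clip with window (8, 7, 7): the depth axis is fully
# covered by a single window, so its shift is dropped.
assert get_window_size_sketch((8, 56, 56), (8, 7, 7), (4, 3, 3)) == (
    (8, 7, 7),
    (0, 3, 3),
)
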
diff --git a/keras_cv/models/backbones/video_swin/video_swin_backbone_test.py b/keras_cv/models/backbones/video_swin/video_swin_backbone_test.py index f3b99be0b5..9ad7fd32b6 100644 --- a/keras_cv/models/backbones/video_swin/video_swin_backbone_test.py +++ b/keras_cv/models/backbones/video_swin/video_swin_backbone_test.py @@ -19,9 +19,12 @@ from keras_cv.backend import keras from keras_cv.backend import ops -from keras_cv.models.backbones.video_swin.video_swin_aliases import VideoSwinSBackbone +from keras_cv.models.backbones.video_swin.video_swin_aliases import ( + VideoSwinSBackbone, +) from keras_cv.tests.test_case import TestCase + class TestViTDetBackbone(TestCase): @pytest.mark.large def test_call(self): diff --git a/keras_cv/models/classification/video_classifier_presets.py b/keras_cv/models/classification/video_classifier_presets.py index 384373c1f9..e4c6ca825a 100644 --- a/keras_cv/models/classification/video_classifier_presets.py +++ b/keras_cv/models/classification/video_classifier_presets.py @@ -16,7 +16,7 @@ classifier_presets = { "videoswin_tiny_kinetics_classifier": { "metadata": { - "description": ("videoswin_tiny_kinetics "), # TODO: update + "description": ("videoswin_tiny_kinetics "), "params": 25_613_800, "official_name": "VideoClassifier", "path": "video_classifier", @@ -24,16 +24,16 @@ }, "videoswin_small_kinetics_classifier": { "metadata": { - "description": ("videoswin_small_kinetics "), # TODO: update - "params": 25_613_800, # TODO: update + "description": ("videoswin_small_kinetics "), + "params": 25_613_800, "official_name": "VideoClassifier", "path": "video_classifier", }, }, "videoswin_base_kinetics_classifier": { "metadata": { - "description": ("videoswin_base_kinetics "), # TODO: update - "params": 25_613_800, # TODO: update + "description": ("videoswin_base_kinetics "), + "params": 25_613_800, "official_name": "VideoClassifier", "path": "video_classifier", }, @@ -41,9 +41,9 @@ "videoswin_base_something_something_v2_classifier": { "metadata": { "description": ( - "videoswin_base_something_something_v2 " # TODO: update + "videoswin_base_something_something_v2 " ), - "params": 25_613_800, # TODO: update + "params": 25_613_800, "official_name": "VideoClassifier", "path": "video_classifier", }, From ffe457c63e77e371eaf4611e6e52130c610fb273 Mon Sep 17 00:00:00 2001 From: innat Date: Mon, 4 Mar 2024 15:52:38 +0600 Subject: [PATCH 43/94] disable preset test temporary --- .../backbones/video_swin/video_swin_backbone_presets_test.py | 5 +++++ .../models/backbones/video_swin/video_swin_backbone_test.py | 4 ++++ 2 files changed, 9 insertions(+) diff --git a/keras_cv/models/backbones/video_swin/video_swin_backbone_presets_test.py b/keras_cv/models/backbones/video_swin/video_swin_backbone_presets_test.py index 80996fcbfa..1a3a5519c4 100644 --- a/keras_cv/models/backbones/video_swin/video_swin_backbone_presets_test.py +++ b/keras_cv/models/backbones/video_swin/video_swin_backbone_presets_test.py @@ -36,19 +36,23 @@ def setUp(self): self.input_batch = np.ones(shape=(1, 32, 224, 224, 3)) def test_applications_model_output(self): + self.skipTest("TODO: Enable after Kaggle model is public") model = VideoSwinBackbone() model(self.input_batch) def test_applications_model_output_with_preset(self): + self.skipTest("TODO: Enable after Kaggle model is public") model = VideoSwinBackbone.from_preset("videoswin_tiny") model(self.input_batch) def test_applications_model_predict(self): + self.skipTest("TODO: Enable after Kaggle model is public") model = VideoSwinTBackbone() 
model.predict(self.input_batch) def test_preset_docstring(self): """Check we did our docstring formatting correctly.""" + self.skipTest("TODO: Enable after Kaggle model is public") for name in VideoSwinBackbone.presets: self.assertRegex(VideoSwinBackbone.from_preset.__doc__, name) @@ -67,6 +71,7 @@ class VideoSwinPresetFullTest(TestCase): """ def test_load_ViTDet(self): + self.skipTest("TODO: Enable after Kaggle model is public") input_data = np.ones(shape=(1, 32, 224, 224, 3)) for preset in VideoSwinBackbone.presets: model = VideoSwinBackbone.from_preset(preset) diff --git a/keras_cv/models/backbones/video_swin/video_swin_backbone_test.py b/keras_cv/models/backbones/video_swin/video_swin_backbone_test.py index 9ad7fd32b6..4ab2787687 100644 --- a/keras_cv/models/backbones/video_swin/video_swin_backbone_test.py +++ b/keras_cv/models/backbones/video_swin/video_swin_backbone_test.py @@ -28,6 +28,7 @@ class TestViTDetBackbone(TestCase): @pytest.mark.large def test_call(self): + self.skipTest("TODO: Enable after Kaggle model is public") model = VideoSwinSBackbone() x = np.ones((1, 32, 224, 224, 3)) x_out = ops.convert_to_numpy(model(x)) @@ -39,6 +40,7 @@ def test_call(self): @pytest.mark.extra_large def teat_save(self): + self.skipTest("TODO: Enable after Kaggle model is public") # saving test model = VideoSwinSBackbone() x = np.ones((1, 32, 224, 224, 3)) @@ -51,6 +53,7 @@ def teat_save(self): @pytest.mark.extra_large def test_fit(self): + self.skipTest("TODO: Enable after Kaggle model is public") model = VideoSwinSBackbone() x = np.ones((1, 32, 224, 224, 3)) y = np.zeros((1, 16, 7, 7, 768)) @@ -58,6 +61,7 @@ def test_fit(self): model.fit(x, y, epochs=1) def test_pyramid_level_inputs_error(self): + self.skipTest("TODO: Enable after Kaggle model is public") model = VideoSwinSBackbone() with self.assertRaises(NotImplementedError, msg="doesn't compute"): model.pyramid_level_inputs From f8d3e26a4a5ebb3740beb6601bd7e310b0384685 Mon Sep 17 00:00:00 2001 From: innat Date: Mon, 4 Mar 2024 16:56:45 +0600 Subject: [PATCH 44/94] set include rescale to true --- keras_cv/models/backbones/video_swin/video_swin_aliases.py | 6 +++--- keras_cv/models/backbones/video_swin/video_swin_backbone.py | 4 ++-- keras_cv/models/classification/video_classifier_presets.py | 4 +--- 3 files changed, 6 insertions(+), 8 deletions(-) diff --git a/keras_cv/models/backbones/video_swin/video_swin_aliases.py b/keras_cv/models/backbones/video_swin/video_swin_aliases.py index 56db9ca743..57ccf227dc 100644 --- a/keras_cv/models/backbones/video_swin/video_swin_aliases.py +++ b/keras_cv/models/backbones/video_swin/video_swin_aliases.py @@ -45,7 +45,7 @@ class VideoSwinTBackbone(VideoSwinBackbone): def __new__( cls, - include_rescaling=False, + include_rescaling=True, **kwargs, ): kwargs.update( @@ -74,7 +74,7 @@ def presets_with_weights(cls): class VideoSwinSBackbone(VideoSwinBackbone): def __new__( cls, - include_rescaling=False, + include_rescaling=True, **kwargs, ): kwargs.update( @@ -103,7 +103,7 @@ def presets_with_weights(cls): class VideoSwinBBackbone(VideoSwinBackbone): def __new__( cls, - include_rescaling=False, + include_rescaling=True, **kwargs, ): kwargs.update( diff --git a/keras_cv/models/backbones/video_swin/video_swin_backbone.py b/keras_cv/models/backbones/video_swin/video_swin_backbone.py index a6c0868699..ac4e9a07ab 100644 --- a/keras_cv/models/backbones/video_swin/video_swin_backbone.py +++ b/keras_cv/models/backbones/video_swin/video_swin_backbone.py @@ -32,11 +32,11 @@ class VideoSwinBackbone(Backbone): """A Video 
Swin Transformer backbone model.
 
     Args:
-        input_shape (tuple[int], optional): The size of the input image in
+        input_shape (tuple[int], optional): The size of the input video in
             `(depth, height, width, channel)` format.
             Defaults to `(32, 224, 224, 3)`.
         input_tensor (KerasTensor, optional): Output of
-            `keras.layers.Input()`) to use as image input for the model.
+            `keras.layers.Input()`) to use as video input for the model.
             Defaults to `None`.
         include_rescaling (bool, optional): Whether to rescale the inputs. If
             set to `True`, inputs will be passed through a
diff --git a/keras_cv/models/classification/video_classifier_presets.py b/keras_cv/models/classification/video_classifier_presets.py
index e4c6ca825a..9914e13433 100644
--- a/keras_cv/models/classification/video_classifier_presets.py
+++ b/keras_cv/models/classification/video_classifier_presets.py
@@ -40,9 +40,7 @@
     },
     "videoswin_base_something_something_v2_classifier": {
         "metadata": {
-            "description": (
-                "videoswin_base_something_something_v2 "
-            ),
+            "description": ("videoswin_base_something_something_v2 "),
             "params": 25_613_800,
             "official_name": "VideoClassifier",
             "path": "video_classifier",
From 1d0ad36a76a6cdc0a86b54a9e8c47946d563b783 Mon Sep 17 00:00:00 2001
From: innat
Date: Mon, 4 Mar 2024 17:03:10 +0600
Subject: [PATCH 45/94] add video swin components to __init__

---
 keras_cv/models/__init__.py | 13 +++++++++++++
 1 file changed, 13 insertions(+)

diff --git a/keras_cv/models/__init__.py b/keras_cv/models/__init__.py
index 77c3ad33d9..0ef9b58d2d 100644
--- a/keras_cv/models/__init__.py
+++ b/keras_cv/models/__init__.py
@@ -179,11 +179,24 @@
     ResNetV2Backbone,
 )
 from keras_cv.models.backbones.vgg16.vgg16_backbone import VGG16Backbone
+from keras_cv.models.backbones.video_swin.video_swin_aliases import (
+    VideoSwinBBackbone,
+)
+from keras_cv.models.backbones.video_swin.video_swin_aliases import (
+    VideoSwinSBackbone,
+)
+from keras_cv.models.backbones.video_swin.video_swin_aliases import (
+    VideoSwinTBackbone,
+)
+from keras_cv.models.backbones.video_swin.video_swin_backbone import (
+    VideoSwinBackbone,
+)
 from keras_cv.models.backbones.vit_det.vit_det_aliases import ViTDetBBackbone
 from keras_cv.models.backbones.vit_det.vit_det_aliases import ViTDetHBackbone
 from keras_cv.models.backbones.vit_det.vit_det_aliases import ViTDetLBackbone
 from keras_cv.models.backbones.vit_det.vit_det_backbone import ViTDetBackbone
 from keras_cv.models.classification.image_classifier import ImageClassifier
+from keras_cv.models.classification.video_classifier import VideoClassifier
 from keras_cv.models.feature_extractor.clip import CLIP
 from keras_cv.models.object_detection.retinanet.retinanet import RetinaNet
 from keras_cv.models.object_detection.yolo_v8.yolo_v8_backbone import (
From 838a50608f93f9a661a56e1a822bb16f78cedf56 Mon Sep 17 00:00:00 2001
From: innat
Date: Tue, 5 Mar 2024 12:08:14 +0600
Subject: [PATCH 46/94] update docstrings: video swin layers scripts

---
 keras_cv/layers/video_swin_layers.py | 60 ++++++++++++++++++----------
 1 file changed, 40 insertions(+), 20 deletions(-)

diff --git a/keras_cv/layers/video_swin_layers.py b/keras_cv/layers/video_swin_layers.py
index 09766c95b2..d651e31e00 100644
--- a/keras_cv/layers/video_swin_layers.py
+++ b/keras_cv/layers/video_swin_layers.py
@@ -22,13 +22,17 @@
 
 def window_partition(x, window_size):
-    """
+    """Partitions the input tensor into windows of specified size.
+ Args: - x: (batch_size, depth, height, width, channel) - window_size (tuple[int]): window size + x (Tensor): Input tensor of shape `(batch_size, depth, height, width, channel)`. + window_size (tuple[int]): Size of the window in each dimension (depth, height, width). Returns: - windows: (batch_size*num_windows, window_size*window_size, channel) + Tensor: Windows of shape `(batch_size*num_windows, window_size*window_size, channel)`, + where `num_windows = ( + depth//window_size[0]) * (height//window_size[1]) * (width//window_size[2] + )`. """ # noqa: E501 input_shape = ops.shape(x) @@ -63,15 +67,18 @@ def window_partition(x, window_size): def window_reverse(windows, window_size, batch_size, depth, height, width): - """ + """Reconstructs the original tensor from windows of specified size. + Args: - windows: (batch_size*num_windows, window_size, window_size, channel) - window_size (tuple[int]): Window size - height (int): Height of image - width (int): Width of image + windows (Tensor): Windows of shape `(batch_size*num_windows, window_size, window_size, channel)`. + window_size (tuple[int]): Size of the window in each dimension `(depth, height, width)`. + batch_size (int): Batch size. + depth (int): Depth of the original tensor. + height (int): Height of the original tensor. + width (int): Width of the original tensor. Returns: - x: (batch_size, depth, height, width, channel) + Tensor: Reconstructed tensor of shape `(batch_size, depth, height, width, channel)`. """ # noqa: E501 x = ops.reshape( windows, @@ -124,6 +131,26 @@ def get_window_size(x_size, window_size, shift_size=None): def compute_mask(depth, height, width, window_size, shift_size): + """Computes attention mask for sliding window self-attention mechanism. + + Args: + depth (int): Depth of the input video. + height (int): Height of the input video. + width (int): Width of the input video. + window_size (tuple[int]): Size of the sliding window in each dimension (depth, height, width). + shift_size (tuple[int]): Size of the shifting step in each dimension (depth, height, width). + + Returns: + Tensor: Attention mask of shape `(batch_size, num_windows, num_windows)`, + where `num_windows = ( + (depth - window_size[0]) // shift_size[0] + 1 + ) * ( + (height - window_size[1]) // shift_size[1] + 1 + ) * ( + (width - window_size[2]) // shift_size[2] + 1 + )`. + + """ img_mask = np.zeros((1, depth, height, width, 1)) cnt = 0 for d in ( @@ -292,7 +319,7 @@ def get_config(self): class VideoSwinPatchMerging(layers.Layer): - """Patch Merging Layer. + """Patch Merging Layer for Video Swin Model. Args: input_dim (int): Number of input channels. @@ -325,12 +352,6 @@ def build(self, input_shape): self.built = True def call(self, x): - """The call function. - - Args: - x: Input feature, - shape: (batch_size, depth, height, width, channel). - """ input_shape = ops.shape(x) height, width = ( input_shape[2], @@ -755,10 +776,9 @@ def __init__( zip(self.shift_size, self.window_size) ): if not (0 <= shift < window): - # TODO: Add more description. raise ValueError( - f"shift_size[{i}] must be in the " - "range 0 to window_size[{i}]" + f"shift_size[{i}] must be in the range 0 to less than window_size[{i}], " + f"but got shift_size[{i}]={shift} and window_size[{i}]={window}." 
                )

From b4f1534944ca80bfcea44d6fd39300ddb7e8c81 Mon Sep 17 00:00:00 2001
From: innat
Date: Tue, 5 Mar 2024 12:10:33 +0600
Subject: [PATCH 47/94] update copyright status: video swin layers test scripts

---
 keras_cv/layers/video_swin_layers.py      | 2 +-
 keras_cv/layers/video_swin_layers_test.py | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/keras_cv/layers/video_swin_layers.py b/keras_cv/layers/video_swin_layers.py
index d651e31e00..eb733f27b8 100644
--- a/keras_cv/layers/video_swin_layers.py
+++ b/keras_cv/layers/video_swin_layers.py
@@ -1,4 +1,4 @@
-# Copyright 2023 The KerasCV Authors
+# Copyright 2024 The KerasCV Authors
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
diff --git a/keras_cv/layers/video_swin_layers_test.py b/keras_cv/layers/video_swin_layers_test.py
index 862c2e3a77..4e72cb5e9a 100644
--- a/keras_cv/layers/video_swin_layers_test.py
+++ b/keras_cv/layers/video_swin_layers_test.py
@@ -1,4 +1,4 @@
-# Copyright 2023 The KerasCV Authors
+# Copyright 2024 The KerasCV Authors
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
From 75c5b665e145761f0611ac7593f66f8f35f35992 Mon Sep 17 00:00:00 2001
From: innat
Date: Tue, 5 Mar 2024 12:26:45 +0600
Subject: [PATCH 48/94] update copyright status: video swin backbone scripts

---
 .../models/backbones/video_swin/__init__.py   |  2 +-
 .../video_swin/video_swin_aliases.py          |  2 +-
 .../video_swin/video_swin_backbone.py         | 24 +++++++++++--------
 3 files changed, 16 insertions(+), 12 deletions(-)

diff --git a/keras_cv/models/backbones/video_swin/__init__.py b/keras_cv/models/backbones/video_swin/__init__.py
index 3992ffb59a..0e9cbb5ac9 100644
--- a/keras_cv/models/backbones/video_swin/__init__.py
+++ b/keras_cv/models/backbones/video_swin/__init__.py
@@ -1,4 +1,4 @@
-# Copyright 2023 The KerasCV Authors
+# Copyright 2024 The KerasCV Authors
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
diff --git a/keras_cv/models/backbones/video_swin/video_swin_aliases.py b/keras_cv/models/backbones/video_swin/video_swin_aliases.py
index 57ccf227dc..5044c2b469 100644
--- a/keras_cv/models/backbones/video_swin/video_swin_aliases.py
+++ b/keras_cv/models/backbones/video_swin/video_swin_aliases.py
@@ -1,4 +1,4 @@
-# Copyright 2023 The KerasCV Authors
+# Copyright 2024 The KerasCV Authors
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
diff --git a/keras_cv/models/backbones/video_swin/video_swin_backbone.py b/keras_cv/models/backbones/video_swin/video_swin_backbone.py
index ac4e9a07ab..1a6b5c5b69 100644
--- a/keras_cv/models/backbones/video_swin/video_swin_backbone.py
+++ b/keras_cv/models/backbones/video_swin/video_swin_backbone.py
@@ -1,4 +1,4 @@
-# Copyright 2023 The KerasCV Authors
+# Copyright 2024 The KerasCV Authors
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -39,28 +39,32 @@ class VideoSwinBackbone(Backbone):
             `keras.layers.Input()`) to use as video input for the model.
             Defaults to `None`.
         include_rescaling (bool, optional): Whether to rescale the inputs.
If - set to `True`, inputs will be passed through a - `Rescaling(1/255.0)` layer and normalize with - mean=[0.485, 0.456, 0.406] and std=[0.229, 0.224, 0.225], + set to `True`, inputs will be passed through a `Rescaling(1/255.0)` layer + and normalize with mean=[0.485, 0.456, 0.406] and std=[0.229, 0.224, 0.225]. Defaults to `False`. - patch_size (int | tuple(int)): Patch size. Default: (2,4,4). + patch_size (int | tuple(int)): The patch size for depth, height, and width + dimensions respectively. Default: (2,4,4). embed_dim (int): Number of linear projection output channels. Default to 96. depths (tuple[int]): Depths of each Swin Transformer stage. Default to [2, 2, 6, 2] num_heads (tuple[int]): Number of attention head of each stage. Default to [3, 6, 12, 24] - window_size (int): Window size. Default to [8, 7, 7]. + window_size (int): The window size for depth, height, and width + dimensions respectively. Default to [8, 7, 7]. mlp_ratio (float): Ratio of mlp hidden dim to embedding dim. Default to 4. qkv_bias (bool): If True, add a learnable bias to query, key, value. Default to True. qk_scale (float): Override default qk scale of head_dim ** -0.5 if set. Default to None. - drop_rate (float): Dropout rate. - attn_drop_rate (float): Attention dropout rate. Default: 0. - drop_path_rate (float): Stochastic depth rate. Default: 0.2. - patch_norm (bool): If True, add normalization after patch embedding. + drop_rate (float): Float between 0 and 1. Fraction of the input units to drop. + Default: 0. + attn_drop_rate (float): Float between 0 and 1. Attention dropout rate. + Default: 0. + drop_path_rate (float): Float between 0 and 1. Stochastic depth rate. + Default: 0.2. + patch_norm (bool): If True, add layer normalization after patch embedding. Default to False. 
References: From 0b9808ba70e6ee83be8ef827a62c9a5a78d95a1f Mon Sep 17 00:00:00 2001 From: innat Date: Tue, 5 Mar 2024 15:23:22 +0600 Subject: [PATCH 49/94] bug fixes: video swin backbone layers --- keras_cv/layers/video_swin_layers.py | 19 +++++++------- .../video_swin/video_swin_aliases.py | 2 +- .../video_swin/video_swin_backbone.py | 25 +++++++++++++++---- .../video_swin/video_swin_backbone_test.py | 5 ++-- 4 files changed, 32 insertions(+), 19 deletions(-) diff --git a/keras_cv/layers/video_swin_layers.py b/keras_cv/layers/video_swin_layers.py index eb733f27b8..bea5119e30 100644 --- a/keras_cv/layers/video_swin_layers.py +++ b/keras_cv/layers/video_swin_layers.py @@ -656,14 +656,13 @@ def build(self, input_shape): self.built = True def compute_output_shape(self, input_shape): - window_size, _ = get_window_size( - input_shape[1:-1], self.window_size, self.shift_size - ) - depth_pad = self._compute_dim_padded(input_shape[1], window_size[0]) - height_pad = self._compute_dim_padded(input_shape[2], window_size[1]) - width_pad = self._compute_dim_padded(input_shape[3], window_size[2]) - if self.downsample is not None: + window_size, _ = get_window_size( + input_shape[1:-1], self.window_size, self.shift_size + ) + depth_pad = self._compute_dim_padded(input_shape[1], window_size[0]) + height_pad = self._compute_dim_padded(input_shape[2], window_size[1]) + width_pad = self._compute_dim_padded(input_shape[3], window_size[2]) output_shape = ( input_shape[0], depth_pad, @@ -677,7 +676,7 @@ def compute_output_shape(self, input_shape): def call(self, x, training=None): input_shape = ops.shape(x) - batch_size, depth, height, width, _ = ( + batch_size, depth, height, width, channel = ( input_shape[0], input_shape[1], input_shape[2], @@ -688,7 +687,7 @@ def call(self, x, training=None): for block in self.blocks: x = block(x, self.attn_mask, training=training) - x = ops.reshape(x, [batch_size, depth, height, width, -1]) + x = ops.reshape(x, [batch_size, depth, height, width, channel]) if self.downsample is not None: x = self.downsample(x) @@ -812,7 +811,7 @@ def build(self, input_shape): self.attn.build((None, None, self.input_dim)) self.norm2 = self.norm_layer(axis=-1, epsilon=1e-05) - self.norm2.build((*input_shape[1:-1], self.input_dim)) + self.norm2.build((*input_shape[:-1], self.input_dim)) self.mlp = MLP( output_dim=self.input_dim, diff --git a/keras_cv/models/backbones/video_swin/video_swin_aliases.py b/keras_cv/models/backbones/video_swin/video_swin_aliases.py index 5044c2b469..7786804c90 100644 --- a/keras_cv/models/backbones/video_swin/video_swin_aliases.py +++ b/keras_cv/models/backbones/video_swin/video_swin_aliases.py @@ -1,4 +1,4 @@ -# Copyright 2024 The KerasCV Authors +# Copyright 202 The KerasCV Authors # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. diff --git a/keras_cv/models/backbones/video_swin/video_swin_backbone.py b/keras_cv/models/backbones/video_swin/video_swin_backbone.py index 1a6b5c5b69..7de6c4bb77 100644 --- a/keras_cv/models/backbones/video_swin/video_swin_backbone.py +++ b/keras_cv/models/backbones/video_swin/video_swin_backbone.py @@ -11,7 +11,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
- +import copy from functools import partial import numpy as np @@ -26,6 +26,9 @@ from keras_cv.models import utils from keras_cv.models.backbones.backbone import Backbone +from keras_cv.models.backbones.video_swin.video_swin_backbone_presets import backbone_presets # noqa: E501 +from keras_cv.models.backbones.video_swin.video_swin_backbone_presets import backbone_presets_with_weights # noqa: E501 +from keras_cv.utils.python_utils import classproperty @keras_cv_export("keras_cv.models.VideoSwinBackbone", package="keras_cv.models") class VideoSwinBackbone(Backbone): @@ -126,9 +129,10 @@ def __init__( # std=[58.395, 57.12, 57.375] for normalization. # So, if include_rescaling is set to True, then, to match with the # official scores, following normalization should be added. - x = (x - ops.array([0.485, 0.456, 0.406], dtype=x.dtype)) / ( - ops.array([0.229, 0.224, 0.225], dtype=x.dtype) - ) + x = layers.Normalization( + mean=[0.485, 0.456, 0.406], + variance=[0.229 ** 2, 0.224 ** 2, 0.225 ** 2] + )(x) norm_layer = partial(layers.LayerNormalization, epsilon=1e-05) @@ -162,7 +166,7 @@ def __init__( ) x = layer(x) - x = norm_layer(axis=-1, epsilon=1e-05, name="norm")(x) + x = norm_layer(axis=-1, epsilon=1e-05, name="videoswin_top_norm")(x) super().__init__(inputs=input_spec, outputs=x, **kwargs) self.embed_dim = embed_dim @@ -199,3 +203,14 @@ def get_config(self): } ) return config + + @classproperty + def presets(cls): + """Dictionary of preset names and configurations.""" + return copy.deepcopy(backbone_presets) + + @classproperty + def presets_with_weights(cls): + """Dictionary of preset names and configurations that include + weights.""" + return copy.deepcopy(backbone_presets_with_weights) diff --git a/keras_cv/models/backbones/video_swin/video_swin_backbone_test.py b/keras_cv/models/backbones/video_swin/video_swin_backbone_test.py index 4ab2787687..50c100cb7f 100644 --- a/keras_cv/models/backbones/video_swin/video_swin_backbone_test.py +++ b/keras_cv/models/backbones/video_swin/video_swin_backbone_test.py @@ -1,4 +1,4 @@ -# Copyright 2023 The KerasCV Authors +# Copyright 2024 The KerasCV Authors # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
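
The `layers.Normalization` call introduced in the patch above is intended to reproduce, after `Rescaling(1.0 / 255.0)`, the official [0, 255] pipeline quoted in the backbone comments (mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375]). Keras `Normalization` expects a variance, hence the squared std values. A standalone NumPy check of that equivalence, illustrative only:

import numpy as np

x = np.random.uniform(0.0, 255.0, size=(1, 2, 4, 4, 3)).astype("float32")

# Official Video Swin preprocessing on raw [0, 255] frames.
mean255 = np.array([123.675, 116.28, 103.53], dtype="float32")
std255 = np.array([58.395, 57.12, 57.375], dtype="float32")
official = (x - mean255) / std255

# KerasCV path: Rescaling(1/255) followed by Normalization(mean, std**2),
# which divides by sqrt(variance), i.e. by std.
mean1 = np.array([0.485, 0.456, 0.406], dtype="float32")
std1 = np.array([0.229, 0.224, 0.225], dtype="float32")
rescaled = (x / 255.0 - mean1) / std1

# Identical up to float32 rounding: 0.485 * 255 == 123.675, and so on.
np.testing.assert_allclose(official, rescaled, rtol=1e-4, atol=1e-4)
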
@@ -28,7 +28,6 @@ class TestViTDetBackbone(TestCase): @pytest.mark.large def test_call(self): - self.skipTest("TODO: Enable after Kaggle model is public") model = VideoSwinSBackbone() x = np.ones((1, 32, 224, 224, 3)) x_out = ops.convert_to_numpy(model(x)) @@ -36,7 +35,7 @@ def test_call(self): np.prod(tuple(x.shape)) for x in model.trainable_variables ) self.assertEqual(x_out.shape, (1, 16, 7, 7, 768)) - self.assertEqual(num_parameters, 49_509_078) + self.assertEqual(num_parameters, 27_850_470) @pytest.mark.extra_large def teat_save(self): From 0a4e2cb5baf38b2a822a05989fdb1c19b72207b8 Mon Sep 17 00:00:00 2001 From: innat Date: Tue, 5 Mar 2024 15:31:51 +0600 Subject: [PATCH 50/94] update get config of video swin backbone --- keras_cv/models/backbones/video_swin/video_swin_backbone.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/keras_cv/models/backbones/video_swin/video_swin_backbone.py b/keras_cv/models/backbones/video_swin/video_swin_backbone.py index 7de6c4bb77..33c11e877c 100644 --- a/keras_cv/models/backbones/video_swin/video_swin_backbone.py +++ b/keras_cv/models/backbones/video_swin/video_swin_backbone.py @@ -169,6 +169,8 @@ def __init__( x = norm_layer(axis=-1, epsilon=1e-05, name="videoswin_top_norm")(x) super().__init__(inputs=input_spec, outputs=x, **kwargs) + self.include_rescaling = include_rescaling + self.input_tensor = input_tensor self.embed_dim = embed_dim self.patch_size = patch_size self.window_size = window_size @@ -188,6 +190,9 @@ def get_config(self): config = super().get_config() config.update( { + "include_rescaling": self.include_rescaling, + "input_shape": self.input_shape[1:], + "input_tensor": self.input_tensor, "embed_dim": self.embed_dim, "patch_norm": self.patch_norm, "window_size": self.window_size, From fb732d0e9cd2bcb3a97ffc64c0cf196e1a3c79c2 Mon Sep 17 00:00:00 2001 From: innat Date: Tue, 5 Mar 2024 15:36:12 +0600 Subject: [PATCH 51/94] enable: video swin backbone test cases --- .../models/backbones/video_swin/video_swin_backbone_test.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/keras_cv/models/backbones/video_swin/video_swin_backbone_test.py b/keras_cv/models/backbones/video_swin/video_swin_backbone_test.py index 50c100cb7f..cdf1160bf9 100644 --- a/keras_cv/models/backbones/video_swin/video_swin_backbone_test.py +++ b/keras_cv/models/backbones/video_swin/video_swin_backbone_test.py @@ -39,7 +39,6 @@ def test_call(self): @pytest.mark.extra_large def teat_save(self): - self.skipTest("TODO: Enable after Kaggle model is public") # saving test model = VideoSwinSBackbone() x = np.ones((1, 32, 224, 224, 3)) @@ -52,7 +51,6 @@ def teat_save(self): @pytest.mark.extra_large def test_fit(self): - self.skipTest("TODO: Enable after Kaggle model is public") model = VideoSwinSBackbone() x = np.ones((1, 32, 224, 224, 3)) y = np.zeros((1, 16, 7, 7, 768)) @@ -60,7 +58,6 @@ def test_fit(self): model.fit(x, y, epochs=1) def test_pyramid_level_inputs_error(self): - self.skipTest("TODO: Enable after Kaggle model is public") model = VideoSwinSBackbone() with self.assertRaises(NotImplementedError, msg="doesn't compute"): model.pyramid_level_inputs From 44433354d643906e066f1b62283014a7ddb8d66e Mon Sep 17 00:00:00 2001 From: innat Date: Tue, 5 Mar 2024 15:48:12 +0600 Subject: [PATCH 52/94] update: video swin backbone test cases --- .../video_swin/video_swin_backbone_test.py | 17 ++++++++++------- 1 file changed, 10 insertions(+), 7 deletions(-) diff --git a/keras_cv/models/backbones/video_swin/video_swin_backbone_test.py 
b/keras_cv/models/backbones/video_swin/video_swin_backbone_test.py
index cdf1160bf9..201cd4de22 100644
--- a/keras_cv/models/backbones/video_swin/video_swin_backbone_test.py
+++ b/keras_cv/models/backbones/video_swin/video_swin_backbone_test.py
@@ -26,21 +26,24 @@

 class TestViTDetBackbone(TestCase):
+
     @pytest.mark.large
     def test_call(self):
-        model = VideoSwinSBackbone()
-        x = np.ones((1, 32, 224, 224, 3))
+        model = VideoSwinSBackbone(
+            include_rescaling=True, input_shape=(8,256,256,3)
+        )
+        x = np.ones((1, 8, 256, 256, 3))
         x_out = ops.convert_to_numpy(model(x))
         num_parameters = sum(
             np.prod(tuple(x.shape)) for x in model.trainable_variables
         )
-        self.assertEqual(x_out.shape, (1, 16, 7, 7, 768))
-        self.assertEqual(num_parameters, 27_850_470)
+        self.assertEqual(x_out.shape, (1, 4, 8, 8, 768))
+        self.assertEqual(num_parameters, 27_663_894)

     @pytest.mark.extra_large
     def test_save(self):
         # saving test
-        model = VideoSwinSBackbone()
+        model = VideoSwinSBackbone(include_rescaling=False)
         x = np.ones((1, 32, 224, 224, 3))
         x_out = ops.convert_to_numpy(model(x))
         path = os.path.join(self.get_temp_dir(), "model.keras")
@@ -51,13 +54,13 @@ def test_save(self):

     @pytest.mark.extra_large
     def test_fit(self):
-        model = VideoSwinSBackbone()
+        model = VideoSwinSBackbone(include_rescaling=False)
         x = np.ones((1, 32, 224, 224, 3))
         y = np.zeros((1, 16, 7, 7, 768))
         model.compile(optimizer="adam", loss="mse", metrics=["mse"])
         model.fit(x, y, epochs=1)

     def test_pyramid_level_inputs_error(self):
-        model = VideoSwinSBackbone()
+        model = VideoSwinSBackbone(include_rescaling=False)
         with self.assertRaises(NotImplementedError, msg="doesn't compute"):
             model.pyramid_level_inputs

From f3411cbefb3121f7bd0ea49a7156fe2c3f183d9c Mon Sep 17 00:00:00 2001
From: innat
Date: Tue, 5 Mar 2024 15:50:23 +0600
Subject: [PATCH 53/94] update: video swin backbone preset test cases

---
 .../backbones/video_swin/video_swin_backbone_presets_test.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/keras_cv/models/backbones/video_swin/video_swin_backbone_presets_test.py b/keras_cv/models/backbones/video_swin/video_swin_backbone_presets_test.py
index 1a3a5519c4..c8abba5c11 100644
--- a/keras_cv/models/backbones/video_swin/video_swin_backbone_presets_test.py
+++ b/keras_cv/models/backbones/video_swin/video_swin_backbone_presets_test.py
@@ -36,7 +36,6 @@ def setUp(self):
         self.input_batch = np.ones(shape=(1, 32, 224, 224, 3))

     def test_applications_model_output(self):
-        self.skipTest("TODO: Enable after Kaggle model is public")
         model = VideoSwinBackbone()
         model(self.input_batch)

From 00c67ba4adf6cbb8fbe924df901851ffa33c67b7 Mon Sep 17 00:00:00 2001
From: innat
Date: Tue, 5 Mar 2024 16:04:12 +0600
Subject: [PATCH 54/94] run formatters

---
 keras_cv/layers/video_swin_layers.py | 12 ++++---
 .../video_swin/video_swin_backbone.py | 35 +++++++++++++------
 .../video_swin/video_swin_backbone_test.py | 2 +-
 3 files changed, 33 insertions(+), 16 deletions(-)

diff --git a/keras_cv/layers/video_swin_layers.py b/keras_cv/layers/video_swin_layers.py
index bea5119e30..63d3b1f464 100644
--- a/keras_cv/layers/video_swin_layers.py
+++ b/keras_cv/layers/video_swin_layers.py
@@ -150,7 +150,8 @@ def compute_mask(depth, height, width, window_size, shift_size):
             (width - window_size[2]) // shift_size[2] + 1
         )`.
-    """
+    """  # noqa: E501
+
     img_mask = np.zeros((1, depth, height, width, 1))
     cnt = 0
     for d in (
@@ -661,7 +662,9 @@ def compute_output_shape(self, input_shape):
             input_shape[1:-1], self.window_size, self.shift_size
         )
         depth_pad = self._compute_dim_padded(input_shape[1], window_size[0])
-        height_pad = self._compute_dim_padded(input_shape[2], window_size[1])
+        height_pad = self._compute_dim_padded(
+            input_shape[2], window_size[1]
+        )
         width_pad = self._compute_dim_padded(input_shape[3], window_size[2])
         output_shape = (
             input_shape[0],
@@ -776,8 +779,9 @@ def __init__(
         ):
             if not (0 <= shift < window):
                 raise ValueError(
-                    f"shift_size[{i}] must be in the range 0 to less than window_size[{i}], "
-                    f"but got shift_size[{i}]={shift} and window_size[{i}]={window}."
+                    f"shift_size[{i}] must be in the range 0 to less than "
+                    f"window_size[{i}], but got shift_size[{i}]={shift} "
+                    f"and window_size[{i}]={window}."
                 )

     def build(self, input_shape):
diff --git a/keras_cv/models/backbones/video_swin/video_swin_backbone.py b/keras_cv/models/backbones/video_swin/video_swin_backbone.py
index 33c11e877c..5a19a98372 100644
--- a/keras_cv/models/backbones/video_swin/video_swin_backbone.py
+++ b/keras_cv/models/backbones/video_swin/video_swin_backbone.py
@@ -19,17 +19,20 @@

 from keras_cv.api_export import keras_cv_export
 from keras_cv.backend import keras
-from keras_cv.backend import ops
 from keras_cv.layers.video_swin_layers import VideoSwinBasicLayer
 from keras_cv.layers.video_swin_layers import VideoSwinPatchingAndEmbedding
 from keras_cv.layers.video_swin_layers import VideoSwinPatchMerging
 from keras_cv.models import utils
 from keras_cv.models.backbones.backbone import Backbone
-
-from keras_cv.models.backbones.video_swin.video_swin_backbone_presets import backbone_presets  # noqa: E501
-from keras_cv.models.backbones.video_swin.video_swin_backbone_presets import backbone_presets_with_weights  # noqa: E501
+from keras_cv.models.backbones.video_swin.video_swin_backbone_presets import (  # noqa: E501
+    backbone_presets,
+)
+from keras_cv.models.backbones.video_swin.video_swin_backbone_presets import (  # noqa: E501
+    backbone_presets_with_weights,
+)
 from keras_cv.utils.python_utils import classproperty

+
 @keras_cv_export("keras_cv.models.VideoSwinBackbone", package="keras_cv.models")
 class VideoSwinBackbone(Backbone):
     """A Video Swin Transformer backbone model.
@@ -42,10 +45,10 @@ class VideoSwinBackbone(Backbone):
             `keras.layers.Input()`) to use as video input for the model.
             Defaults to `None`.
         include_rescaling (bool, optional): Whether to rescale the inputs. If
-            set to `True`, inputs will be passed through a `Rescaling(1/255.0)` layer
+            set to `True`, inputs will be passed through a `Rescaling(1/255.0)` layer
             and are normalized with mean=[0.485, 0.456, 0.406] and
             std=[0.229, 0.224, 0.225]. Defaults to `False`.
-        patch_size (int | tuple(int)): The patch size for depth, height, and width
+        patch_size (int | tuple(int)): The patch size for depth, height, and width
             dimensions respectively. Default: (2,4,4).
         embed_dim (int): Number of linear projection output channels.
             Default to 96.
@@ -53,7 +56,7 @@ class VideoSwinBackbone(Backbone):
             Default to [2, 2, 6, 2]
         num_heads (tuple[int]): Number of attention heads for each stage.
             Default to [3, 6, 12, 24]
-        window_size (int): The window size for depth, height, and width
+        window_size (int): The window size for depth, height, and width
             dimensions respectively. Default to [8, 7, 7].
         mlp_ratio (float): Ratio of mlp hidden dim to embedding dim.
             Default to 4.
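For context on the `window_size` docstring above: in Swin-style models the shifted-window pass conventionally uses a shift of half the window along each axis, and inputs are padded up to window multiples before partitioning. A small illustrative sketch (the values mirror the defaults documented above; this is not code from the patch):

```python
# Half-window shifts and padding-to-multiples for the default configuration.
window_size = (8, 7, 7)  # (depth, height, width) windows
shift_size = tuple(w // 2 for w in window_size)  # -> (4, 3, 3)

def pad_to_multiple(dim, win):
    # Smallest size >= dim that the window extent divides evenly.
    return ((dim + win - 1) // win) * win

dims = (32, 224, 224)  # default input_shape, channels dropped
padded = tuple(pad_to_multiple(d, w) for d, w in zip(dims, window_size))

num_windows = 1
for d, w in zip(padded, window_size):
    num_windows *= d // w

print(padded)        # (32, 224, 224): already multiples of (8, 7, 7)
print(num_windows)   # 4096 windows per video
```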
@@ -63,13 +66,23 @@ class VideoSwinBackbone(Backbone):
             Default to None.
         drop_rate (float): Float between 0 and 1. Fraction of the input units to drop.
             Default: 0.
-        attn_drop_rate (float): Float between 0 and 1. Attention dropout rate.
+        attn_drop_rate (float): Float between 0 and 1. Attention dropout rate.
             Default: 0.
-        drop_path_rate (float): Float between 0 and 1. Stochastic depth rate.
+        drop_path_rate (float): Float between 0 and 1. Stochastic depth rate.
             Default: 0.2.
         patch_norm (bool): If True, add layer normalization after patch embedding.
             Default to False.

+    Example:
+    ```python
+    # Build video swin backbone without top layer
+    model = VideoSwinSBackbone(
+        include_rescaling=True, input_shape=(8, 256, 256, 3),
+    )
+    videos = tf.ones((1, 8, 256, 256, 3))
+    outputs = model.predict(videos)
+    ```
+
     References:
         - [Video Swin Transformer](https://arxiv.org/abs/2106.13230)
         - [Official Code](https://github.com/SwinTransformer/Video-Swin-Transformer)
@@ -131,7 +144,7 @@ def __init__(
         # official scores, following normalization should be added.
         x = layers.Normalization(
             mean=[0.485, 0.456, 0.406],
-            variance=[0.229 ** 2, 0.224 ** 2, 0.225 ** 2]
+            variance=[0.229**2, 0.224**2, 0.225**2],
         )(x)

         norm_layer = partial(layers.LayerNormalization, epsilon=1e-05)
@@ -208,7 +221,7 @@ def get_config(self):
             }
         )
         return config
-
+
     @classproperty
     def presets(cls):
         """Dictionary of preset names and configurations."""
diff --git a/keras_cv/models/backbones/video_swin/video_swin_backbone_test.py b/keras_cv/models/backbones/video_swin/video_swin_backbone_test.py
index 201cd4de22..b711c9f554 100644
--- a/keras_cv/models/backbones/video_swin/video_swin_backbone_test.py
+++ b/keras_cv/models/backbones/video_swin/video_swin_backbone_test.py
@@ -30,7 +30,7 @@ class TestViTDetBackbone(TestCase):
     @pytest.mark.large
     def test_call(self):
         model = VideoSwinSBackbone(
-            include_rescaling=True, input_shape=(8,256,256,3)
+            include_rescaling=True, input_shape=(8, 256, 256, 3)
         )
         x = np.ones((1, 8, 256, 256, 3))
         x_out = ops.convert_to_numpy(model(x))

From 9d3ab2ed1908bee977e9047d6f89298efebacc41 Mon Sep 17 00:00:00 2001
From: innat
Date: Tue, 5 Mar 2024 16:16:45 +0600
Subject: [PATCH 55/94] fix typos: video swin backbone test cases

---
 .../video_swin/video_swin_backbone_test.py | 14 +++++++-------
 1 file changed, 7 insertions(+), 7 deletions(-)

diff --git a/keras_cv/models/backbones/video_swin/video_swin_backbone_test.py b/keras_cv/models/backbones/video_swin/video_swin_backbone_test.py
index b711c9f554..7af9c0b7b5 100644
--- a/keras_cv/models/backbones/video_swin/video_swin_backbone_test.py
+++ b/keras_cv/models/backbones/video_swin/video_swin_backbone_test.py
@@ -19,17 +19,17 @@

 from keras_cv.backend import keras
 from keras_cv.backend import ops
-from keras_cv.models.backbones.video_swin.video_swin_aliases import (
-    VideoSwinSBackbone,
+from keras_cv.models.backbones.video_swin.video_swin_backbone import (
+    VideoSwinBackbone,
 )
 from keras_cv.tests.test_case import TestCase


-class TestViTDetBackbone(TestCase):
+class TestVideoSwinSBackbone(TestCase):

     @pytest.mark.large
     def test_call(self):
-        model = VideoSwinSBackbone(
+        model = VideoSwinBackbone(  # TODO: replace with aliases
             include_rescaling=True, input_shape=(8, 256, 256, 3)
         )
         x = np.ones((1, 8, 256, 256, 3))
@@ -43,7 +43,7 @@ def test_call(self):

     @pytest.mark.extra_large
     def test_save(self):
         # saving test
-        model = VideoSwinSBackbone(include_rescaling=False)
+        model = VideoSwinBackbone(include_rescaling=False)
         x = np.ones((1, 32, 224, 224, 3))
         x_out =
ops.convert_to_numpy(model(x))
         path = os.path.join(self.get_temp_dir(), "model.keras")
@@ -54,13 +54,13 @@ def test_save(self):

     @pytest.mark.extra_large
     def test_fit(self):
-        model = VideoSwinSBackbone(include_rescaling=False)
+        model = VideoSwinBackbone(include_rescaling=False)
         x = np.ones((1, 32, 224, 224, 3))
         y = np.zeros((1, 16, 7, 7, 768))
         model.compile(optimizer="adam", loss="mse", metrics=["mse"])
         model.fit(x, y, epochs=1)

     def test_pyramid_level_inputs_error(self):
-        model = VideoSwinSBackbone(include_rescaling=False)
+        model = VideoSwinBackbone(include_rescaling=False)
         with self.assertRaises(NotImplementedError, msg="doesn't compute"):
             model.pyramid_level_inputs

From 5bdc8b45d46beb030a89784c4f092536af055f7b Mon Sep 17 00:00:00 2001
From: innat
Date: Tue, 5 Mar 2024 16:44:21 +0600
Subject: [PATCH 56/94] add: non implemented property for test reason

---
 .../models/backbones/video_swin/video_swin_backbone.py | 7 +++++++
 .../backbones/video_swin/video_swin_backbone_test.py | 5 -----
 2 files changed, 7 insertions(+), 5 deletions(-)

diff --git a/keras_cv/models/backbones/video_swin/video_swin_backbone.py b/keras_cv/models/backbones/video_swin/video_swin_backbone.py
index 5a19a98372..0488758372 100644
--- a/keras_cv/models/backbones/video_swin/video_swin_backbone.py
+++ b/keras_cv/models/backbones/video_swin/video_swin_backbone.py
@@ -232,3 +232,10 @@ def presets_with_weights(cls):
         """Dictionary of preset names and configurations that include
         weights."""
         return copy.deepcopy(backbone_presets_with_weights)
+
+    @property
+    def pyramid_level_inputs(self):
+        raise NotImplementedError(
+            "The `ViTDetBackbone` model doesn't compute"
+            " pyramid level features."
+        )
diff --git a/keras_cv/models/backbones/video_swin/video_swin_backbone_test.py b/keras_cv/models/backbones/video_swin/video_swin_backbone_test.py
index 7af9c0b7b5..9adc543626 100644
--- a/keras_cv/models/backbones/video_swin/video_swin_backbone_test.py
+++ b/keras_cv/models/backbones/video_swin/video_swin_backbone_test.py
@@ -59,8 +59,3 @@ def test_fit(self):
         y = np.zeros((1, 16, 7, 7, 768))
         model.compile(optimizer="adam", loss="mse", metrics=["mse"])
         model.fit(x, y, epochs=1)
-
-    def test_pyramid_level_inputs_error(self):
-        model = VideoSwinBackbone(include_rescaling=False)
-        with self.assertRaises(NotImplementedError, msg="doesn't compute"):
-            model.pyramid_level_inputs

From cb5da28dbe77c1c7d213af10b58ddec09540ad18 Mon Sep 17 00:00:00 2001
From: innat
Date: Tue, 5 Mar 2024 16:46:47 +0600
Subject: [PATCH 57/94] fix: typos

---
 keras_cv/models/backbones/video_swin/video_swin_backbone.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/keras_cv/models/backbones/video_swin/video_swin_backbone.py b/keras_cv/models/backbones/video_swin/video_swin_backbone.py
index 0488758372..07008bf0c8 100644
--- a/keras_cv/models/backbones/video_swin/video_swin_backbone.py
+++ b/keras_cv/models/backbones/video_swin/video_swin_backbone.py
@@ -236,6 +236,6 @@ def presets_with_weights(cls):
     @property
     def pyramid_level_inputs(self):
         raise NotImplementedError(
-            "The `ViTDetBackbone` model doesn't compute"
+            "The `VideoSwinBackbone` model doesn't compute"
             " pyramid level features."
) From 82a84979737c3a4c38a420bbb7b8e7cbed326413 Mon Sep 17 00:00:00 2001 From: innat Date: Wed, 6 Mar 2024 13:24:09 +0600 Subject: [PATCH 58/94] add: video classifier test --- .../classification/video_classifier_test.py | 98 +++++++++++++++++++ 1 file changed, 98 insertions(+) create mode 100644 keras_cv/models/classification/video_classifier_test.py diff --git a/keras_cv/models/classification/video_classifier_test.py b/keras_cv/models/classification/video_classifier_test.py new file mode 100644 index 0000000000..c506c993c9 --- /dev/null +++ b/keras_cv/models/classification/video_classifier_test.py @@ -0,0 +1,98 @@ +# Copyright 2024 The KerasCV Authors +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Tests for VideoClassifier.""" + + +import os + +import numpy as np +import pytest +import tensorflow as tf +from absl.testing import parameterized + +from keras_cv.backend import keras +from keras_cv.backend import ops +from keras_cv.models.backbones.video_swin.video_swin_backbone import ( + VideoSwinBackbone, # TODO: update with aliases +) +from keras_cv.models.classification.video_classifier import VideoClassifier +from keras_cv.tests.test_case import TestCase + + +class VideoClassifierTest(TestCase): + def setUp(self): + self.input_batch = np.ones(shape=(2, 8, 224, 224, 3)) + self.dataset = tf.data.Dataset.from_tensor_slices( + (self.input_batch, tf.one_hot(tf.ones((10,), dtype="int32"), 2)) + ).batch(4) + + def test_valid_call(self): + model = VideoClassifier( + backbone=VideoSwinBackbone(include_rescaling=False), + num_classes=10, + ) + model(self.input_batch) + + @parameterized.named_parameters( + ("jit_compile_false", False), ("jit_compile_true", True) + ) + @pytest.mark.large # Fit is slow, so mark these large. + def test_classifier_fit(self, jit_compile): + model = VideoClassifier( + backbone=VideoSwinBackbone(include_rescaling=False), + num_classes=10, + ) + model.compile( + loss="categorical_crossentropy", + optimizer="adam", + metrics=["accuracy"], + jit_compile=jit_compile, + ) + model.fit(self.dataset) + + @parameterized.named_parameters( + ("avg_pooling", "avg"), ("max_pooling", "max") + ) + def test_pooling_arg_call(self, pooling): + model = VideoClassifier( + backbone=VideoSwinBackbone(include_rescaling=False), + num_classes=10, + pooling=pooling, + ) + model(self.input_batch) + + @pytest.mark.large # Saving is slow, so mark these large. + def test_saved_model(self): + model = VideoClassifier( + backbone=VideoSwinBackbone(include_rescaling=False), + num_classes=2, + ) + model_output = model(self.input_batch) + save_path = os.path.join(self.get_temp_dir(), "video_classifier.keras") + model.save(save_path) + restored_model = keras.models.load_model(save_path) + + # Check we got the real object back. + self.assertIsInstance(restored_model, VideoClassifier) + + # Check that output matches. 
+ restored_output = restored_model(self.input_batch) + self.assertAllClose( + ops.convert_to_numpy(model_output), + ops.convert_to_numpy(restored_output), + ) + + +if __name__ == "__main__": + tf.test.main() From e2f5056412ce9ea0dd42cde95a2f83a725c0db80 Mon Sep 17 00:00:00 2001 From: innat Date: Wed, 6 Mar 2024 14:19:18 +0600 Subject: [PATCH 59/94] update: video classifier test --- .../classification/video_classifier_test.py | 26 ++++++++++++------- 1 file changed, 16 insertions(+), 10 deletions(-) diff --git a/keras_cv/models/classification/video_classifier_test.py b/keras_cv/models/classification/video_classifier_test.py index c506c993c9..c647ba5480 100644 --- a/keras_cv/models/classification/video_classifier_test.py +++ b/keras_cv/models/classification/video_classifier_test.py @@ -32,17 +32,19 @@ class VideoClassifierTest(TestCase): def setUp(self): - self.input_batch = np.ones(shape=(2, 8, 224, 224, 3)) + self.input_batch = np.ones(shape=(10, 8, 224, 224, 3)) self.dataset = tf.data.Dataset.from_tensor_slices( - (self.input_batch, tf.one_hot(tf.ones((10,), dtype="int32"), 2)) + (self.input_batch, tf.one_hot(tf.ones((10,), dtype="int32"), 10)) ).batch(4) def test_valid_call(self): model = VideoClassifier( - backbone=VideoSwinBackbone(include_rescaling=False), + backbone=VideoSwinBackbone( + input_shape=(8, 224, 224, 3), include_rescaling=False + ), num_classes=10, ) - model(self.input_batch) + model.predict(self.input_batch) @parameterized.named_parameters( ("jit_compile_false", False), ("jit_compile_true", True) @@ -50,7 +52,9 @@ def test_valid_call(self): @pytest.mark.large # Fit is slow, so mark these large. def test_classifier_fit(self, jit_compile): model = VideoClassifier( - backbone=VideoSwinBackbone(include_rescaling=False), + backbone=VideoSwinBackbone( + input_shape=(8, 224, 224, 3), include_rescaling=True + ), num_classes=10, ) model.compile( @@ -70,15 +74,17 @@ def test_pooling_arg_call(self, pooling): num_classes=10, pooling=pooling, ) - model(self.input_batch) + model.predict(self.input_batch) @pytest.mark.large # Saving is slow, so mark these large. def test_saved_model(self): model = VideoClassifier( - backbone=VideoSwinBackbone(include_rescaling=False), - num_classes=2, + backbone=VideoSwinBackbone( + input_shape=(8, 224, 224, 3), include_rescaling=False + ), + num_classes=10, ) - model_output = model(self.input_batch) + model_output = model.predict(self.input_batch) save_path = os.path.join(self.get_temp_dir(), "video_classifier.keras") model.save(save_path) restored_model = keras.models.load_model(save_path) @@ -87,7 +93,7 @@ def test_saved_model(self): self.assertIsInstance(restored_model, VideoClassifier) # Check that output matches. 
- restored_output = restored_model(self.input_batch) + restored_output = restored_model.predict(self.input_batch) self.assertAllClose( ops.convert_to_numpy(model_output), ops.convert_to_numpy(restored_output), From 146f32fabe1e3308062d711d4df291d6aa2955a5 Mon Sep 17 00:00:00 2001 From: innat Date: Wed, 6 Mar 2024 14:41:59 +0600 Subject: [PATCH 60/94] update: video classifier test input shape --- keras_cv/models/classification/video_classifier_test.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/keras_cv/models/classification/video_classifier_test.py b/keras_cv/models/classification/video_classifier_test.py index c647ba5480..6e8b494ac4 100644 --- a/keras_cv/models/classification/video_classifier_test.py +++ b/keras_cv/models/classification/video_classifier_test.py @@ -70,7 +70,9 @@ def test_classifier_fit(self, jit_compile): ) def test_pooling_arg_call(self, pooling): model = VideoClassifier( - backbone=VideoSwinBackbone(include_rescaling=False), + backbone=VideoSwinBackbone( + input_shape=(8, 224, 224, 3), include_rescaling=True + ), num_classes=10, pooling=pooling, ) From d25746bd5e4fde42915832119a1f2877a620d164 Mon Sep 17 00:00:00 2001 From: innat Date: Wed, 6 Mar 2024 15:06:04 +0600 Subject: [PATCH 61/94] bug fix: mlp layer build method --- keras_cv/layers/video_swin_layers.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/keras_cv/layers/video_swin_layers.py b/keras_cv/layers/video_swin_layers.py index 63d3b1f464..5eda8dd66f 100644 --- a/keras_cv/layers/video_swin_layers.py +++ b/keras_cv/layers/video_swin_layers.py @@ -212,7 +212,7 @@ def __init__( def build(self, input_shape): self.fc1.build(input_shape) - self.fc2.build((*input_shape[1:-1], self.hidden_dim)) + self.fc2.build((*input_shape[:-1], self.hidden_dim)) self.built = True def call(self, x, training=None): @@ -823,7 +823,7 @@ def build(self, input_shape): activation=self._activation_identifier, drop_rate=self.drop_rate, ) - self.mlp.build((*input_shape[1:-1], self.input_dim)) + self.mlp.build((*input_shape[:-1], self.input_dim)) self.built = True def first_forward(self, x, mask_matrix, training): From 9779ad44130403bd3be5538d72a55f79036fb8f9 Mon Sep 17 00:00:00 2001 From: innat Date: Wed, 6 Mar 2024 15:13:43 +0600 Subject: [PATCH 62/94] updated: swin back layer build method --- keras_cv/layers/video_swin_layers.py | 34 +++++++++++++++------------- 1 file changed, 18 insertions(+), 16 deletions(-) diff --git a/keras_cv/layers/video_swin_layers.py b/keras_cv/layers/video_swin_layers.py index 5eda8dd66f..429a22529b 100644 --- a/keras_cv/layers/video_swin_layers.py +++ b/keras_cv/layers/video_swin_layers.py @@ -613,14 +613,24 @@ def _compute_dim_padded(self, input_dim, window_dim_size): ) def build(self, input_shape): - window_size, shift_size = get_window_size( + self.window_size, self.shift_size = get_window_size( input_shape[1:-1], self.window_size, self.shift_size ) - depth_pad = self._compute_dim_padded(input_shape[1], window_size[0]) - height_pad = self._compute_dim_padded(input_shape[2], window_size[1]) - width_pad = self._compute_dim_padded(input_shape[3], window_size[2]) + self.depth_pad = self._compute_dim_padded( + input_shape[1], self.window_size[0] + ) + self.height_pad = self._compute_dim_padded( + input_shape[2], self.window_size[1] + ) + self.width_pad = self._compute_dim_padded( + input_shape[3], self.window_size[2] + ) self.attn_mask = compute_mask( - depth_pad, height_pad, width_pad, window_size, shift_size + self.depth_pad, + self.height_pad, + self.width_pad, + 
self.window_size, + self.shift_size, ) # build blocks @@ -658,19 +668,11 @@ def build(self, input_shape): def compute_output_shape(self, input_shape): if self.downsample is not None: - window_size, _ = get_window_size( - input_shape[1:-1], self.window_size, self.shift_size - ) - depth_pad = self._compute_dim_padded(input_shape[1], window_size[0]) - height_pad = self._compute_dim_padded( - input_shape[2], window_size[1] - ) - width_pad = self._compute_dim_padded(input_shape[3], window_size[2]) output_shape = ( input_shape[0], - depth_pad, - height_pad // 2, - width_pad // 2, + self.depth_pad, + self.height_pad // 2, + self.width_pad // 2, 2 * self.input_dim, ) return output_shape From 7fa3f83ea03f38e52619450f82e5660b509eb333 Mon Sep 17 00:00:00 2001 From: innat Date: Wed, 6 Mar 2024 17:24:02 +0600 Subject: [PATCH 63/94] bug fix: use tf.TensorShape in compute_output_shape method --- keras_cv/layers/video_swin_layers.py | 17 +++++++++++------ 1 file changed, 11 insertions(+), 6 deletions(-) diff --git a/keras_cv/layers/video_swin_layers.py b/keras_cv/layers/video_swin_layers.py index 429a22529b..9ce3375ff5 100644 --- a/keras_cv/layers/video_swin_layers.py +++ b/keras_cv/layers/video_swin_layers.py @@ -13,6 +13,7 @@ # limitations under the License. import numpy as np +import tensorflow as tf from keras import layers from keras_cv.api_export import keras_cv_export @@ -668,12 +669,16 @@ def build(self, input_shape): def compute_output_shape(self, input_shape): if self.downsample is not None: - output_shape = ( - input_shape[0], - self.depth_pad, - self.height_pad // 2, - self.width_pad // 2, - 2 * self.input_dim, + # TODO: remove tensorflow dependencies. + # GitHub issue: fix https://github.com/keras-team/keras/issues/19259 # noqa: E501 + output_shape = tf.TensorShape( + [ + input_shape[0], + self.depth_pad, + self.height_pad // 2, + self.width_pad // 2, + 2 * self.input_dim, + ] ) return output_shape From c8aea501528924789e8bbc0916a0d179c17032c1 Mon Sep 17 00:00:00 2001 From: innat Date: Wed, 6 Mar 2024 19:19:56 +0600 Subject: [PATCH 64/94] update: video_classifier_test model.predict to model.call --- keras_cv/models/classification/video_classifier_test.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/keras_cv/models/classification/video_classifier_test.py b/keras_cv/models/classification/video_classifier_test.py index 6e8b494ac4..2fcd0d5ae0 100644 --- a/keras_cv/models/classification/video_classifier_test.py +++ b/keras_cv/models/classification/video_classifier_test.py @@ -44,7 +44,7 @@ def test_valid_call(self): ), num_classes=10, ) - model.predict(self.input_batch) + model(self.input_batch) @parameterized.named_parameters( ("jit_compile_false", False), ("jit_compile_true", True) @@ -76,7 +76,7 @@ def test_pooling_arg_call(self, pooling): num_classes=10, pooling=pooling, ) - model.predict(self.input_batch) + model(self.input_batch) @pytest.mark.large # Saving is slow, so mark these large. 
    def test_saved_model(self):

From 8287395e4b89ca7479afc2a9ffbede1a69c94a8b Mon Sep 17 00:00:00 2001
From: innat
Date: Wed, 6 Mar 2024 19:22:04 +0600
Subject: [PATCH 65/94] update test cases and format the code

---
 keras_cv/models/classification/video_classifier_test.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/keras_cv/models/classification/video_classifier_test.py b/keras_cv/models/classification/video_classifier_test.py
index 2fcd0d5ae0..ac188e292c 100644
--- a/keras_cv/models/classification/video_classifier_test.py
+++ b/keras_cv/models/classification/video_classifier_test.py
@@ -86,7 +86,7 @@ def test_saved_model(self):
             ),
             num_classes=10,
         )
-        model_output = model.predict(self.input_batch)
+        model_output = model(self.input_batch)
         save_path = os.path.join(self.get_temp_dir(), "video_classifier.keras")
         model.save(save_path)
         restored_model = keras.models.load_model(save_path)
@@ -95,7 +95,7 @@ def test_saved_model(self):
         self.assertIsInstance(restored_model, VideoClassifier)

         # Check that output matches.
-        restored_output = restored_model.predict(self.input_batch)
+        restored_output = restored_model(self.input_batch)
         self.assertAllClose(
             ops.convert_to_numpy(model_output),
             ops.convert_to_numpy(restored_output),

From e9a39978e719081d820822b06d684df23276e96c Mon Sep 17 00:00:00 2001
From: innat
Date: Sat, 9 Mar 2024 18:48:01 +0600
Subject: [PATCH 66/94] update docstrings and preset config

---
 keras_cv/layers/video_swin_layers.py          | 154 ++++++++++++------
 .../video_swin/video_swin_aliases.py          |  10 ++
 .../video_swin/video_swin_backbone.py         |   2 +-
 .../video_swin/video_swin_backbone_presets.py |  39 ++++-
 .../models/classification/video_classifier.py |  12 +-
 .../video_classifier_presets.py               |  80 +++++++--
 .../classification/video_classifier_test.py   |   2 +-
 7 files changed, 222 insertions(+), 77 deletions(-)

diff --git a/keras_cv/layers/video_swin_layers.py b/keras_cv/layers/video_swin_layers.py
index 9ce3375ff5..98869a76f2 100644
--- a/keras_cv/layers/video_swin_layers.py
+++ b/keras_cv/layers/video_swin_layers.py
@@ -23,17 +23,22 @@


 def window_partition(x, window_size):
-    """Partitions the input tensor into windows of specified size.
+    """Partitions a video tensor into non-overlapping windows of a specified size.

     Args:
-        x (Tensor): Input tensor of shape `(batch_size, depth, height, width, channel)`.
-        window_size (tuple[int]): Size of the window in each dimension (depth, height, width).
+        x: A tensor with shape (B, D, H, W, C), where:
+            - B: Batch size
+            - D: Number of frames (depth) in the video
+            - H: Height of the video frames
+            - W: Width of the video frames
+            - C: Number of channels in the video (e.g., RGB for color)
+        window_size: A tuple of ints of size 3 representing the window size
+            along each dimension (depth, height, width).

     Returns:
-        Tensor: Windows of shape `(batch_size*num_windows, window_size*window_size, channel)`,
-        where `num_windows = (
-            depth//window_size[0]) * (height//window_size[1]) * (width//window_size[2]
-        )`.
+        A tensor with shape (num_windows * B, window_size[0] * window_size[1] * window_size[2], C),
+        where each window is flattened over its depth, height, and width
+        extents, matching the reshape this function performs.
     """  # noqa: E501

     input_shape = ops.shape(x)
@@ -68,18 +73,26 @@ def window_reverse(windows, window_size, batch_size, depth, height, width):
-    """Reconstructs the original tensor from windows of specified size.
+    """Reconstructs the original video tensor from its partitioned windows.
+
+    This function assumes the windows were created using the `window_partition` function
+    with the same `window_size`.

     Args:
-        windows: (batch_size*num_windows, window_size, window_size, channel)
-        window_size (tuple[int]): Window size
-        height (int): Height of image
-        width (int): Width of image
+        windows: A tensor with shape (num_windows * batch_size,
+            window_size[0] * window_size[1] * window_size[2], channels), where:
+            - num_windows: Number of windows created during partitioning
+            - channels: Number of channels in the video (same as in `window_partition`)
+        window_size: A tuple of ints of size 3 representing the window size used
+            during partitioning (same as in `window_partition`).
+        batch_size: Batch size of the original video tensor (same as in `window_partition`).
+        depth: Number of frames (depth) in the original video tensor (same as in `window_partition`).
+        height: Height of the video frames in the original tensor (same as in `window_partition`).
+        width: Width of the video frames in the original tensor (same as in `window_partition`).

     Returns:
-        x: (batch_size, depth, height, width, channel)
+        A tensor with shape (batch_size, depth, height, width, channels), representing the
+        original video reconstructed from the provided windows.
     """  # noqa: E501
     x = ops.reshape(
         windows,
         [
             batch_size,
@@ -100,18 +113,30 @@ def get_window_size(x_size, window_size, shift_size=None):
-    """Computing window size based on: "Liu et al.,
-    Swin Transformer: Hierarchical Vision Transformer using Shifted Windows
-    "
-    https://github.com/microsoft/Swin-Transformer
+    """Computes the appropriate window size and potentially shift size for Swin Transformer.
+
+    This function implements the logic from the Swin Transformer paper by Ze Liu et al.
+    (https://arxiv.org/abs/2103.14030) to determine suitable window sizes
+    based on the input size and the provided base window size.

     Args:
-        x_size: input size.
-        window_size: local window size.
-        shift_size: window shifting size.
+        x_size: A tuple of ints of size 3 representing the input size (depth, height, width)
+            of the data (e.g., video).
+        window_size: A tuple of ints of size 3 representing the base window size
+            (depth, height, width) to use for partitioning.
+        shift_size: A tuple of ints of size 3 (optional) representing the window
+            shifting size (depth, height, width) for shifted window processing
+            used in Swin Transformer. If not provided, only window size is computed.

     Returns:
-        x: window_size, shift_size
+        A tuple or a pair of tuples:
+        - If `shift_size` is None, returns a single tuple representing the adjusted
+          window size that may be smaller than the provided `window_size` to ensure
+          it doesn't exceed the input size along any dimension.
+        - If `shift_size` is provided, returns a pair of tuples. The first tuple
+          represents the adjusted window size, and the second tuple represents the
+          adjusted shift size. The adjustments ensure both window size and shift size
+          do not exceed the corresponding dimensions in the input data.
""" # noqa: E501 use_window_size = list(window_size) @@ -132,25 +157,33 @@ def get_window_size(x_size, window_size, shift_size=None): def compute_mask(depth, height, width, window_size, shift_size): - """Computes attention mask for sliding window self-attention mechanism. + """Computes an attention mask for a sliding window self-attention mechanism + used in Video Swin Transformer. + + This function creates a mask to indicate which windows can attend to each other + during the self-attention operation. It considers non-overlapping and potentially + shifted windows based on the provided window size and shift size. Args: - depth (int): Depth of the input video. - height (int): Height of the input video. - width (int): Width of the input video. - window_size (tuple[int]): Size of the sliding window in each dimension (depth, height, width). - shift_size (tuple[int]): Size of the shifting step in each dimension (depth, height, width). + depth (int): Depth (number of frames) of the input video. + height (int): Height of the video frames. + width (int): Width of the video frames. + window_size (tuple[int]): Size of the sliding window in each dimension + (depth, height, width). + shift_size (tuple[int]): Size of the shifting step in each dimension + (depth, height, width). Returns: - Tensor: Attention mask of shape `(batch_size, num_windows, num_windows)`, - where `num_windows = ( - (depth - window_size[0]) // shift_size[0] + 1 - ) * ( - (height - window_size[1]) // shift_size[1] + 1 - ) * ( - (width - window_size[2]) // shift_size[2] + 1 - )`. - + A tensor of shape (batch_size, num_windows, num_windows), where: + - batch_size: Assumed to be 1 in this function. + - num_windows: Total number of windows covering the entire input based on + the formula: + (depth - window_size[0]) // shift_size[0] + 1) * + (height - window_size[1]) // shift_size[1] + 1) * + (width - window_size[2]) // shift_size[2] + 1) + Each element (attn_mask[i, j]) represents the attention weight between + window i and window j. A value of -100.0 indicates high negative attention + (preventing information flow), 0.0 indicates no mask effect. """ # noqa: E501 img_mask = np.zeros((1, depth, height, width, 1)) @@ -241,10 +274,15 @@ def get_config(self): "keras_cv.layers.VideoSwinPatchingAndEmbedding", package="keras_cv.layers" ) class VideoSwinPatchingAndEmbedding(keras.Model): - """Video to Patch Embedding layer for Video Swin Model. + """Video to Patch Embedding layer for Video Swin Transformer models. + + This layer performs the initial step in a Video Swin Transformer architecture by + partitioning the input video into 3D patches and embedding them into a vector + dimensional space. Args: - patch_size (int): Patch token size. Default: (2,4,4). + patch_size (int): Size of the patch along each dimension + (depth, height, width). Default: (2,4,4). embed_dim (int): Number of linear projection output channels. Default: 96. norm_layer (keras.layers, optional): Normalization layer. Default: None @@ -321,10 +359,14 @@ def get_config(self): class VideoSwinPatchMerging(layers.Layer): - """Patch Merging Layer for Video Swin Model. + """Patch Merging Layer in Video Swin Transformer models. + + This layer performs a downsampling step by merging four neighboring patches + from the previous layer into a single patch in the output. It achieves this + by concatenation and linear projection. Args: - input_dim (int): Number of input channels. + input_dim (int): Number of input channels in the feature maps. 
        norm_layer (keras.layers, optional): Normalization layer.
            Default: LayerNormalization
@@ -392,10 +434,12 @@ def get_config(self):


 class VideoSwinWindowAttention(keras.Model):
-    """Window based multi-head self attention (W-MSA) module with relative position bias.
+    """It tackles long-range video dependencies by splitting features into windows
+    and using relative position bias within each window for focused attention.

     It supports both shifted and non-shifted windows.

     Args:
+        input_dim (int): The number of input channels in the feature maps.
         window_size (tuple[int]): The temporal length, height and width of the window.
         num_heads (int): Number of attention heads.
         qkv_bias (bool, optional): If True, add a learnable bias to query, key, value. Default: True
@@ -554,7 +598,7 @@ def get_config(self):


 class VideoSwinBasicLayer(keras.Model):
-    """A basic Swin Transformer layer for one stage.
+    """A basic Video Swin Transformer layer for one stage.

     Args:
         input_dim (int): Number of feature channels
@@ -670,7 +714,7 @@ def build(self, input_shape):
     def compute_output_shape(self, input_shape):
         if self.downsample is not None:
             # TODO: remove tensorflow dependencies.
-            # GitHub issue: fix https://github.com/keras-team/keras/issues/19259  # noqa: E501
+            # GitHub issue: https://github.com/keras-team/keras/issues/19259  # noqa: E501
             output_shape = tf.TensorShape(
                 [
                     input_shape[0],
@@ -716,9 +760,9 @@ def get_config(self):
                 "depth": self.depth,
                 "qkv_bias": self.qkv_bias,
                 "qk_scale": self.qk_scale,
-                "drop": self.drop,
-                "attn_drop": self.attn_drop,
-                "drop_path": self.drop_path,
+                "drop_rate": self.drop_rate,
+                "attn_drop_rate": self.attn_drop_rate,
+                "drop_path_rate": self.drop_path_rate,
             }
         )
         return config
@@ -728,21 +772,25 @@ def get_config(self):
     "keras_cv.layers.VideoSwinTransformerBlock", package="keras_cv.layers"
 )
 class VideoSwinTransformerBlock(keras.Model):
-    """Swin Transformer Block.
+    """Video Swin Transformer Block.

     Args:
         input_dim (int): Number of feature channels.
         num_heads (int): Number of attention heads.
-        window_size (tuple[int]): Window size.
-        shift_size (tuple[int]): Shift size for SW-MSA.
+        window_size (tuple[int]): Local window size. Default: (2, 7, 7)
+        shift_size (tuple[int]): Shift size for SW-MSA. Default: (0, 0, 0)
         mlp_ratio (float): Ratio of mlp hidden dim to embedding dim.
-        qkv_bias (bool, optional): If True, add a learnable bias to query, key, value. Default: True
+            Default: 4.0
+        qkv_bias (bool, optional): If True, add a learnable bias to query, key, value.
+            Default: True
         qk_scale (float | None, optional): Override default qk scale of head_dim ** -0.5 if set.
+            Default: None
         drop (float, optional): Dropout rate. Default: 0.0
         attn_drop (float, optional): Attention dropout rate. Default: 0.0
         drop_path (float, optional): Stochastic depth rate. Default: 0.0
         act_layer (keras.layers.Activation, optional): Activation layer. Default: gelu
-        norm_layer (keras.layers, optional): Normalization layer. Default: LayerNormalization
+        norm_layer (keras.layers, optional): Normalization layer.
+ Default: LayerNormalization References: - [Video Swin Transformer](https://arxiv.org/abs/2106.13230) diff --git a/keras_cv/models/backbones/video_swin/video_swin_aliases.py b/keras_cv/models/backbones/video_swin/video_swin_aliases.py index 7786804c90..482d84c0f7 100644 --- a/keras_cv/models/backbones/video_swin/video_swin_aliases.py +++ b/keras_cv/models/backbones/video_swin/video_swin_aliases.py @@ -120,6 +120,16 @@ def presets(cls): "videoswin_base_kinetics400": copy.deepcopy( backbone_presets["videoswin_base_kinetics400"] ), + # TODO: update: should these be here or separate class for each! + # "videoswin_base_kinetics400_imagenet22k": copy.deepcopy( + # backbone_presets["videoswin_base_kinetics400_imagenet22k"] + # ), + # "videoswin_base_kinetics600_imagenet22k": copy.deepcopy( + # backbone_presets["videoswin_base_kinetics600_imagenet22k"] + # ), + # "videoswin_base_something_something_v2": copy.deepcopy( + # backbone_presets["videoswin_base_something_something_v2"] + # ), } @classproperty diff --git a/keras_cv/models/backbones/video_swin/video_swin_backbone.py b/keras_cv/models/backbones/video_swin/video_swin_backbone.py index 07008bf0c8..084b5a074f 100644 --- a/keras_cv/models/backbones/video_swin/video_swin_backbone.py +++ b/keras_cv/models/backbones/video_swin/video_swin_backbone.py @@ -79,7 +79,7 @@ class VideoSwinBackbone(Backbone): model = VideoSwinSBackbone( include_rescaling=True, input_shape=(8, 256, 256, 3), ) - videos = tf.ones((1, 8, 256, 256, 3)) + videos = keras.ops.ones((1, 8, 256, 256, 3)) outputs = model.predict(videos) ``` diff --git a/keras_cv/models/backbones/video_swin/video_swin_backbone_presets.py b/keras_cv/models/backbones/video_swin/video_swin_backbone_presets.py index bd06d137c3..5b8472f890 100644 --- a/keras_cv/models/backbones/video_swin/video_swin_backbone_presets.py +++ b/keras_cv/models/backbones/video_swin/video_swin_backbone_presets.py @@ -46,7 +46,7 @@ "description": ( "A tiny Video Swin backbone architecture. " "It is pretrained on ImageNet 1K dataset, and " - "trained on Kinetics 400 dataset." + "trained on Kinetics 400 dataset. " ), "params": 27_850_470, "official_name": "VideoSwinT", @@ -58,7 +58,10 @@ "description": ( "A small Video Swin backbone architecture. " "It is pretrained on ImageNet 1K dataset, and " - "trained on Kinetics 400 dataset." + "trained on Kinetics 400 dataset. " + "Published weight is capable of scoring " + "80.6% top1 and 94.5% top5 accuracy on the " + "Kinetics 400 dataset" ), "params": 49_509_078, "official_name": "VideoSwinS", @@ -70,19 +73,40 @@ "description": ( "A base Video Swin backbone architecture. " "It is pretrained on ImageNet 1K dataset, and " - "trained on Kinetics 400 dataset." + "trained on Kinetics 400 dataset. " + "Published weight is capable of scoring " + "80.6% top1 and 94.6% top5 accuracy on the " + "Kinetics 400 dataset" ), "params": 87_638_984, "official_name": "VideoSwinB", "path": "video_swin", }, }, - "videoswin_base_kinetics600": { + "videoswin_base_kinetics400_imagenet22k": { "metadata": { "description": ( "A base Video Swin backbone architecture. " "It is pretrained on ImageNet 22K dataset, and " - "trained on Kinetics 600 dataset." + "trained on Kinetics 400 dataset. 
" + "Published weight is capable of scoring " + "82.7% top1 and 95.5% top5 accuracy on the " + "Kinetics 400 dataset" + ), + "params": 87_638_984, + "official_name": "VideoSwinB", + "path": "video_swin", + }, + }, + "videoswin_base_kinetics600_imagenet22k": { + "metadata": { + "description": ( + "A base Video Swin backbone architecture. " + "It is pretrained on ImageNet 22K dataset, and " + "trained on Kinetics 600 dataset. " + "Published weight is capable of scoring " + "84.0% top1 and 96.5% top5 accuracy on the " + "Kinetics 600 dataset" ), "params": 87_638_984, "official_name": "VideoSwinB", @@ -94,7 +118,10 @@ "description": ( "A base Video Swin backbone architecture. " "It is pretrained on Kinetics 400 dataset, and " - "trained on Something Something V2 dataset." + "trained on Something Something V2 dataset. " + "Published weight is capable of scoring " + "69.6% top1 and 92.7% top5 accuracy on the " + "Kinetics 400 dataset" ), "params": 87_638_984, "official_name": "VideoSwinB", diff --git a/keras_cv/models/classification/video_classifier.py b/keras_cv/models/classification/video_classifier.py index 6313c76977..ff0ac49dce 100644 --- a/keras_cv/models/classification/video_classifier.py +++ b/keras_cv/models/classification/video_classifier.py @@ -1,4 +1,4 @@ -# Copyright 2023 The KerasCV Authors +# Copyright 2024 The KerasCV Authors # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -51,15 +51,15 @@ class VideoClassifier(Task): ```python input_data = keras.ops.ones(shape=(1, 32, 224, 224, 3)) - # Pretrained classifier (e.g., for imagenet categories) + # Pretrained classifier (e.g., for kinetics categories) model = keras_cv.models.VideoClassifier.from_preset( - "videoswin_tiny_imagenet_classifier", + "videoswin_tiny_kinetics400_classifier", ) output = model(input_data) # Pretrained backbone backbone = keras_cv.models.VideoSwinBackbone.from_preset( - "videoswin_tiny_imagenet", + "videoswin_tiny_kinetics400", ) model = keras_cv.models.VideoClassifier( backbone=backbone, @@ -69,7 +69,9 @@ class VideoClassifier(Task): # Randomly initialized backbone with a custom config model = keras_cv.models.VideoClassifier( - backbone=keras_cv.models.VideoSwinBackbone(), + backbone=keras_cv.models.VideoSwinBackbone( + include_rescaling=True + ), num_classes=400, ) output = model(input_data) diff --git a/keras_cv/models/classification/video_classifier_presets.py b/keras_cv/models/classification/video_classifier_presets.py index 9914e13433..e27163e01a 100644 --- a/keras_cv/models/classification/video_classifier_presets.py +++ b/keras_cv/models/classification/video_classifier_presets.py @@ -14,34 +14,92 @@ """VideoClassifier Task presets.""" classifier_presets = { - "videoswin_tiny_kinetics_classifier": { + "videoswin_tiny_kinetics400_classifier": { "metadata": { - "description": ("videoswin_tiny_kinetics "), - "params": 25_613_800, + "description": ( + "A tiny Video Swin architecture. " + "It is pretrained on ImageNet 1K dataset, and " + "trained on Kinetics 400 dataset. " + "Published weight is capable of scoring " + "78.8% top1 and 93.6% top5 accuracy on the " + "Kinetics 400 dataset" + ), + "params": 28_158_070, "official_name": "VideoClassifier", "path": "video_classifier", }, }, - "videoswin_small_kinetics_classifier": { + "videoswin_small_kinetics400_classifier": { "metadata": { - "description": ("videoswin_small_kinetics "), - "params": 25_613_800, + "description": ( + "A small Video Swin architecture. 
" + "It is pretrained on ImageNet 1K dataset, and " + "trained on Kinetics 400 dataset. " + "Published weight is capable of scoring " + "80.6% top1 and 94.5% top5 accuracy on the " + "Kinetics 400 dataset" + ), + "params": 49_816_678, "official_name": "VideoClassifier", "path": "video_classifier", }, }, - "videoswin_base_kinetics_classifier": { + "videoswin_base_kinetics400_classifier": { "metadata": { - "description": ("videoswin_base_kinetics "), - "params": 25_613_800, + "description": ( + "A base Video Swin architecture. " + "It is pretrained on ImageNet 1K dataset, and " + "trained on Kinetics 400 dataset. " + "Published weight is capable of scoring " + "80.6% top1 and 94.6% top5 accuracy on the " + "Kinetics 400 dataset" + ), + "params": 89_065_688, + "official_name": "VideoClassifier", + "path": "video_classifier", + }, + }, + "videoswin_base_kinetics400_imagenet22k": { + "metadata": { + "description": ( + "A base Video Swin architecture. " + "It is pretrained on ImageNet 22K dataset, and " + "trained on Kinetics 400 dataset. " + "Published weight is capable of scoring " + "82.7% top1 and 95.5% top5 accuracy on the " + "Kinetics 400 dataset" + ), + "params": 89_065_688, + "official_name": "VideoClassifier", + "path": "video_classifier", + }, + }, + "videoswin_base_kinetics600_imagenet22k": { + "metadata": { + "description": ( + "A base Video Swin architecture. " + "It is pretrained on ImageNet 22K dataset, and " + "trained on Kinetics 600 dataset. " + "Published weight is capable of scoring " + "84.0% top1 and 96.5% top5 accuracy on the " + "Kinetics 600 dataset" + ), + "params": 89_270_688, "official_name": "VideoClassifier", "path": "video_classifier", }, }, "videoswin_base_something_something_v2_classifier": { "metadata": { - "description": ("videoswin_base_something_something_v2 "), - "params": 25_613_800, + "description": ( + "A base Video Swin architecture. " + "It is pretrained on Kinetics 400 dataset, and " + "trained on Something Something V2 dataset. 
" + "Published weight is capable of scoring " + "69.6% top1 and 92.7% top5 accuracy on the " + "Kinetics 400 dataset" + ), + "params": 88_834_038, "official_name": "VideoClassifier", "path": "video_classifier", }, diff --git a/keras_cv/models/classification/video_classifier_test.py b/keras_cv/models/classification/video_classifier_test.py index ac188e292c..7e3af58fce 100644 --- a/keras_cv/models/classification/video_classifier_test.py +++ b/keras_cv/models/classification/video_classifier_test.py @@ -24,7 +24,7 @@ from keras_cv.backend import keras from keras_cv.backend import ops from keras_cv.models.backbones.video_swin.video_swin_backbone import ( - VideoSwinBackbone, # TODO: update with aliases + VideoSwinBackbone, # TODO: update with aliases (kaggle handle) ) from keras_cv.models.classification.video_classifier import VideoClassifier from keras_cv.tests.test_case import TestCase From aab1a6c9ff3cae1022372ee35415eabfcd8d82e6 Mon Sep 17 00:00:00 2001 From: innat Date: Mon, 11 Mar 2024 00:14:53 +0600 Subject: [PATCH 67/94] fix jax DynamicJaxprTrace issue for --- keras_cv/layers/video_swin_layers.py | 71 ++++++++++++++++------------ 1 file changed, 40 insertions(+), 31 deletions(-) diff --git a/keras_cv/layers/video_swin_layers.py b/keras_cv/layers/video_swin_layers.py index 98869a76f2..77eb7a77bc 100644 --- a/keras_cv/layers/video_swin_layers.py +++ b/keras_cv/layers/video_swin_layers.py @@ -393,24 +393,20 @@ def build(self, input_shape): self.norm.build( (batch_size, depth, height // 2, width // 2, 4 * channel) ) - self.built = True - - def call(self, x): - input_shape = ops.shape(x) - height, width = ( - input_shape[2], - input_shape[3], - ) - # padding if needed - paddings = [ + # compute padding if needed + self.pads = [ [0, 0], [0, 0], [0, ops.mod(height, 2)], [0, ops.mod(width, 2)], [0, 0], ] - x = ops.pad(x, paddings) + self.built = True + + def call(self, x): + # padding if needed + x = ops.pad(x, self.pads) x0 = x[:, :, 0::2, 0::2, :] # B D H/2 W/2 C x1 = x[:, :, 1::2, 0::2, :] # B D H/2 W/2 C x2 = x[:, :, 0::2, 1::2, :] # B D H/2 W/2 C @@ -879,6 +875,21 @@ def build(self, input_shape): drop_rate=self.drop_rate, ) self.mlp.build((*input_shape[:-1], self.input_dim)) + + # compute padding if needed. + # pad input feature maps to multiples of window size. + _, depth, height, width, _ = input_shape + pad_l = pad_t = pad_d0 = 0 + self.pad_d1 = ops.mod(-depth + self.window_size[0], self.window_size[0]) + self.pad_b = ops.mod(-height + self.window_size[1], self.window_size[1]) + self.pad_r = ops.mod(-width + self.window_size[2], self.window_size[2]) + self.pads = [ + [0, 0], + [pad_d0, self.pad_d1], + [pad_t, self.pad_b], + [pad_l, self.pad_r], + [0, 0], + ] self.built = True def first_forward(self, x, mask_matrix, training): @@ -890,22 +901,10 @@ def first_forward(self, x, mask_matrix, training): input_shape[3], input_shape[4], ) - window_size, shift_size = self.window_size, self.shift_size x = self.norm1(x) - # pad feature maps to multiples of window size - pad_l = pad_t = pad_d0 = 0 - pad_d1 = ops.mod(-depth + window_size[0], window_size[0]) - pad_b = ops.mod(-height + window_size[1], window_size[1]) - pad_r = ops.mod(-width + window_size[2], window_size[2]) - paddings = [ - [0, 0], - [pad_d0, pad_d1], - [pad_t, pad_b], - [pad_l, pad_r], - [0, 0], - ] - x = ops.pad(x, paddings) + # apply padding if needed. 
+ x = ops.pad(x, self.pads) input_shape = ops.shape(x) depth_pad, height_pad, width_pad = ( @@ -918,7 +917,11 @@ def first_forward(self, x, mask_matrix, training): if self.apply_cyclic_shift: shifted_x = ops.roll( x, - shift=(-shift_size[0], -shift_size[1], -shift_size[2]), + shift=( + -self.shift_size[0], + -self.shift_size[1], + -self.shift_size[2], + ), axis=(1, 2, 3), ) attn_mask = mask_matrix @@ -927,7 +930,7 @@ def first_forward(self, x, mask_matrix, training): attn_mask = None # partition windows - x_windows = window_partition(shifted_x, window_size) + x_windows = window_partition(shifted_x, self.window_size) # get attentions params attn_windows = self.attn(x_windows, mask=attn_mask, training=training) @@ -935,7 +938,7 @@ def first_forward(self, x, mask_matrix, training): # reverse the swin windows shifted_x = window_reverse( attn_windows, - window_size, + self.window_size, batch_size, depth_pad, height_pad, @@ -946,7 +949,11 @@ def first_forward(self, x, mask_matrix, training): if self.apply_cyclic_shift: x = ops.roll( shifted_x, - shift=(shift_size[0], shift_size[1], shift_size[2]), + shift=( + self.shift_size[0], + self.shift_size[1], + self.shift_size[2], + ), axis=(1, 2, 3), ) else: @@ -954,8 +961,10 @@ def first_forward(self, x, mask_matrix, training): # pad if required do_pad = ops.logical_or( - ops.greater(pad_d1, 0), - ops.logical_or(ops.greater(pad_r, 0), ops.greater(pad_b, 0)), + ops.greater(self.pad_d1, 0), + ops.logical_or( + ops.greater(self.pad_r, 0), ops.greater(self.pad_b, 0) + ), ) x = ops.cond( do_pad, lambda: x[:, :depth, :height, :width, :], lambda: x From ac781085a5bfd73814ffbebeee8d80ed6f00c66a Mon Sep 17 00:00:00 2001 From: innat Date: Tue, 12 Mar 2024 02:32:04 +0600 Subject: [PATCH 68/94] update config of backbone aliases --- .../video_swin/video_swin_aliases.py | 28 ++++++++++++------- 1 file changed, 18 insertions(+), 10 deletions(-) diff --git a/keras_cv/models/backbones/video_swin/video_swin_aliases.py b/keras_cv/models/backbones/video_swin/video_swin_aliases.py index 482d84c0f7..d30f50315f 100644 --- a/keras_cv/models/backbones/video_swin/video_swin_aliases.py +++ b/keras_cv/models/backbones/video_swin/video_swin_aliases.py @@ -45,11 +45,17 @@ class VideoSwinTBackbone(VideoSwinBackbone): def __new__( cls, + embed_dim=96, + depths=[2, 2, 6, 2], + num_heads=[3, 6, 12, 24], include_rescaling=True, **kwargs, ): kwargs.update( { + "embed_dim": embed_dim, + "depths": depths, + "num_heads": num_heads, "include_rescaling": include_rescaling, } ) @@ -74,11 +80,17 @@ def presets_with_weights(cls): class VideoSwinSBackbone(VideoSwinBackbone): def __new__( cls, + embed_dim=96, + depths=[2, 2, 18, 2], + num_heads=[3, 6, 12, 24], include_rescaling=True, **kwargs, ): kwargs.update( { + "embed_dim": embed_dim, + "depths": depths, + "num_heads": num_heads, "include_rescaling": include_rescaling, } ) @@ -103,11 +115,17 @@ def presets_with_weights(cls): class VideoSwinBBackbone(VideoSwinBackbone): def __new__( cls, + embed_dim=128, + depths=[2, 2, 18, 2], + num_heads=[4, 8, 16, 32], include_rescaling=True, **kwargs, ): kwargs.update( { + "embed_dim": embed_dim, + "depths": depths, + "num_heads": num_heads, "include_rescaling": include_rescaling, } ) @@ -120,16 +138,6 @@ def presets(cls): "videoswin_base_kinetics400": copy.deepcopy( backbone_presets["videoswin_base_kinetics400"] ), - # TODO: update: should these be here or separate class for each! 
- # "videoswin_base_kinetics400_imagenet22k": copy.deepcopy( - # backbone_presets["videoswin_base_kinetics400_imagenet22k"] - # ), - # "videoswin_base_kinetics600_imagenet22k": copy.deepcopy( - # backbone_presets["videoswin_base_kinetics600_imagenet22k"] - # ), - # "videoswin_base_something_something_v2": copy.deepcopy( - # backbone_presets["videoswin_base_something_something_v2"] - # ), } @classproperty From 1dbded9d3f4cab48cde6e5bbfa02431afe0a2f8c Mon Sep 17 00:00:00 2001 From: innat Date: Tue, 19 Mar 2024 00:28:11 +0600 Subject: [PATCH 69/94] add can run in mixed precision test --- .../backbones/video_swin/video_swin_backbone_test.py | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/keras_cv/models/backbones/video_swin/video_swin_backbone_test.py b/keras_cv/models/backbones/video_swin/video_swin_backbone_test.py index 9adc543626..5d2ce24c1b 100644 --- a/keras_cv/models/backbones/video_swin/video_swin_backbone_test.py +++ b/keras_cv/models/backbones/video_swin/video_swin_backbone_test.py @@ -59,3 +59,14 @@ def test_fit(self): y = np.zeros((1, 16, 7, 7, 768)) model.compile(optimizer="adam", loss="mse", metrics=["mse"]) model.fit(x, y, epochs=1) + + @pytest.mark.extra_large + def test_can_run_in_mixed_precision(self): + keras.mixed_precision.set_global_policy("mixed_float16") + model = VideoSwinBackbone( + include_rescaling=False, input_shape=(8, 224, 224, 3) + ) + x = np.ones((1, 8, 224, 224, 3)) + y = np.zeros((1, 4, 7, 7, 768)) + model.compile(optimizer="adam", loss="mse", metrics=["mse"]) + model.fit(x, y, epochs=1) From 42003a2c7a784d504f76ef64ac8e1b01f26524a0 Mon Sep 17 00:00:00 2001 From: innat Date: Tue, 19 Mar 2024 00:39:08 +0600 Subject: [PATCH 70/94] add can run on gray video --- .../backbones/video_swin/video_swin_backbone_test.py | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/keras_cv/models/backbones/video_swin/video_swin_backbone_test.py b/keras_cv/models/backbones/video_swin/video_swin_backbone_test.py index 5d2ce24c1b..01e603e185 100644 --- a/keras_cv/models/backbones/video_swin/video_swin_backbone_test.py +++ b/keras_cv/models/backbones/video_swin/video_swin_backbone_test.py @@ -70,3 +70,15 @@ def test_can_run_in_mixed_precision(self): y = np.zeros((1, 4, 7, 7, 768)) model.compile(optimizer="adam", loss="mse", metrics=["mse"]) model.fit(x, y, epochs=1) + + @pytest.mark.extra_large + def test_can_run_on_gray_video(self): + model = VideoSwinBackbone( + include_rescaling=False, + input_shape=(96, 96, 96, 1), + window_size=[6, 6, 6] + ) + x = np.ones((1, 8, 224, 224, 3)) + y = np.zeros((1, 48, 3, 3, 768)) + model.compile(optimizer="adam", loss="mse", metrics=["mse"]) + model.fit(x, y, epochs=1) From e7313894921e92f09108831306e2550ee565124b Mon Sep 17 00:00:00 2001 From: innat Date: Tue, 19 Mar 2024 00:42:32 +0600 Subject: [PATCH 71/94] minor fix --- .../models/backbones/video_swin/video_swin_aliases.py | 2 +- .../backbones/video_swin/video_swin_backbone_presets.py | 2 +- .../video_swin/video_swin_backbone_presets_test.py | 2 +- .../backbones/video_swin/video_swin_backbone_test.py | 8 ++++---- .../models/classification/video_classifier_presets.py | 2 +- 5 files changed, 8 insertions(+), 8 deletions(-) diff --git a/keras_cv/models/backbones/video_swin/video_swin_aliases.py b/keras_cv/models/backbones/video_swin/video_swin_aliases.py index d30f50315f..84233b0127 100644 --- a/keras_cv/models/backbones/video_swin/video_swin_aliases.py +++ b/keras_cv/models/backbones/video_swin/video_swin_aliases.py @@ -1,4 +1,4 @@ -# Copyright 202 The KerasCV Authors 
+# Copyright 2024 The KerasCV Authors # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. diff --git a/keras_cv/models/backbones/video_swin/video_swin_backbone_presets.py b/keras_cv/models/backbones/video_swin/video_swin_backbone_presets.py index 5b8472f890..0b507274cc 100644 --- a/keras_cv/models/backbones/video_swin/video_swin_backbone_presets.py +++ b/keras_cv/models/backbones/video_swin/video_swin_backbone_presets.py @@ -1,4 +1,4 @@ -# Copyright 2023 The KerasCV Authors +# Copyright 2024 The KerasCV Authors # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. diff --git a/keras_cv/models/backbones/video_swin/video_swin_backbone_presets_test.py b/keras_cv/models/backbones/video_swin/video_swin_backbone_presets_test.py index c8abba5c11..edd30fbba0 100644 --- a/keras_cv/models/backbones/video_swin/video_swin_backbone_presets_test.py +++ b/keras_cv/models/backbones/video_swin/video_swin_backbone_presets_test.py @@ -1,4 +1,4 @@ -# Copyright 2023 The KerasCV Authors +# Copyright 2024 The KerasCV Authors # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. diff --git a/keras_cv/models/backbones/video_swin/video_swin_backbone_test.py b/keras_cv/models/backbones/video_swin/video_swin_backbone_test.py index 01e603e185..8146917837 100644 --- a/keras_cv/models/backbones/video_swin/video_swin_backbone_test.py +++ b/keras_cv/models/backbones/video_swin/video_swin_backbone_test.py @@ -74,11 +74,11 @@ def test_can_run_in_mixed_precision(self): @pytest.mark.extra_large def test_can_run_on_gray_video(self): model = VideoSwinBackbone( - include_rescaling=False, - input_shape=(96, 96, 96, 1), - window_size=[6, 6, 6] + include_rescaling=False, + input_shape=(96, 96, 96, 1), + window_size=[6, 6, 6], ) - x = np.ones((1, 8, 224, 224, 3)) + x = np.ones((1, 96, 96, 96, 1)) y = np.zeros((1, 48, 3, 3, 768)) model.compile(optimizer="adam", loss="mse", metrics=["mse"]) model.fit(x, y, epochs=1) diff --git a/keras_cv/models/classification/video_classifier_presets.py b/keras_cv/models/classification/video_classifier_presets.py index e27163e01a..6e34e43904 100644 --- a/keras_cv/models/classification/video_classifier_presets.py +++ b/keras_cv/models/classification/video_classifier_presets.py @@ -1,4 +1,4 @@ -# Copyright 2023 The KerasCV Authors +# Copyright 2024 The KerasCV Authors # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
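
PATCH 70 and PATCH 71 together pin down grayscale support: the backbone
accepts single-channel video as long as the window size tiles the input.
A minimal standalone sketch of that usage, assuming the top-level
keras_cv.models.VideoSwinBackbone export (the import path is an
assumption; the constructor arguments and shapes come from the test above):

    import numpy as np

    from keras_cv.models import VideoSwinBackbone  # assumed export path

    # Grayscale video: (batch, frames, height, width, channels=1).
    # window_size is shrunk to [6, 6, 6] so the (96, 96, 96) input tiles
    # evenly into local attention windows.
    model = VideoSwinBackbone(
        include_rescaling=False,
        input_shape=(96, 96, 96, 1),
        window_size=[6, 6, 6],
    )
    features = model(np.ones((1, 96, 96, 96, 1), dtype="float32"))
    print(features.shape)  # (1, 48, 3, 3, 768), the test's target shape
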
From 77197c2c6041dd524c0993a4b414dc0c8c4a208f Mon Sep 17 00:00:00 2001
From: innat
Date: Wed, 20 Mar 2024 12:08:35 +0600
Subject: [PATCH 72/94] specify axis in keras.ops.take to match with tf.gather

---
 keras_cv/layers/video_swin_layers.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/keras_cv/layers/video_swin_layers.py b/keras_cv/layers/video_swin_layers.py
index 77eb7a77bc..bf2adae8cf 100644
--- a/keras_cv/layers/video_swin_layers.py
+++ b/keras_cv/layers/video_swin_layers.py
@@ -545,6 +545,7 @@ def call(self, x, mask=None, training=None):
         rel_pos_bias = ops.take(
             self.relative_position_bias_table,
             self.relative_position_index[:depth, :depth],
+            axis=0,
         )
         rel_pos_bias = ops.reshape(rel_pos_bias, [depth, depth, -1])
         rel_pos_bias = ops.transpose(rel_pos_bias, [2, 0, 1])

From aa2006792ffa925b4cc1f5ef45339d6034838a94 Mon Sep 17 00:00:00 2001
From: innat
Date: Mon, 25 Mar 2024 01:44:11 +0600
Subject: [PATCH 73/94] specify include_rescaling default in backbone class

---
 keras_cv/models/backbones/video_swin/video_swin_backbone.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/keras_cv/models/backbones/video_swin/video_swin_backbone.py b/keras_cv/models/backbones/video_swin/video_swin_backbone.py
index 084b5a074f..7aaee48263 100644
--- a/keras_cv/models/backbones/video_swin/video_swin_backbone.py
+++ b/keras_cv/models/backbones/video_swin/video_swin_backbone.py
@@ -91,7 +91,7 @@ class VideoSwinBackbone(Backbone):
     def __init__(
         self,
         *,
-        include_rescaling,
+        include_rescaling=False,
         input_shape=(32, 224, 224, 3),
         input_tensor=None,
         embed_dim=96,

From 11f33d791374314899c16589d3540285518d3105 Mon Sep 17 00:00:00 2001
From: innat
Date: Mon, 25 Mar 2024 01:46:19 +0600
Subject: [PATCH 74/94] remove shift size from get config of video basic layer

---
 keras_cv/layers/video_swin_layers.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/keras_cv/layers/video_swin_layers.py b/keras_cv/layers/video_swin_layers.py
index bf2adae8cf..acba25b435 100644
--- a/keras_cv/layers/video_swin_layers.py
+++ b/keras_cv/layers/video_swin_layers.py
@@ -753,7 +753,6 @@ def get_config(self):
                 "window_size": self.window_size,
                 "num_heads": self.num_heads,
                 "mlp_ratio": self.mlp_ratio,
-                "shift_size": self.shift_size,
                 "depth": self.depth,
                 "qkv_bias": self.qkv_bias,
                 "qk_scale": self.qk_scale,

From a2961b9401ed77521ef7007406941f3189d074aa Mon Sep 17 00:00:00 2001
From: innat
Date: Mon, 25 Mar 2024 01:49:22 +0600
Subject: [PATCH 75/94] add support for arbitrary input shape

---
 keras_cv/layers/video_swin_layers.py          | 18 ++++++------------
 .../video_swin/video_swin_backbone.py         |  6 ------
 2 files changed, 6 insertions(+), 18 deletions(-)

diff --git a/keras_cv/layers/video_swin_layers.py b/keras_cv/layers/video_swin_layers.py
index acba25b435..7da58885ce 100644
--- a/keras_cv/layers/video_swin_layers.py
+++ b/keras_cv/layers/video_swin_layers.py
@@ -418,6 +418,10 @@ def call(self, x):
         x = self.reduction(x)
 
         return x
+
+    def compute_output_shape(self, input_shape):
+        batch_size, depth, height, width, _ = input_shape
+        return (batch_size, depth, height // 2, width // 2, 2 * self.input_dim)
 
     def get_config(self):
         config = super().get_config()
@@ -710,18 +714,8 @@ def build(self, input_shape):
 
     def compute_output_shape(self, input_shape):
         if self.downsample is not None:
-            # TODO: remove tensorflow dependencies.
- # GitHub issue: https://github.com/keras-team/keras/issues/19259 # noqa: E501 - output_shape = tf.TensorShape( - [ - input_shape[0], - self.depth_pad, - self.height_pad // 2, - self.width_pad // 2, - 2 * self.input_dim, - ] - ) - return output_shape + input_shape = self.downsample.compute_output_shape(input_shape) + return input_shape return input_shape diff --git a/keras_cv/models/backbones/video_swin/video_swin_backbone.py b/keras_cv/models/backbones/video_swin/video_swin_backbone.py index 7aaee48263..beff9cebee 100644 --- a/keras_cv/models/backbones/video_swin/video_swin_backbone.py +++ b/keras_cv/models/backbones/video_swin/video_swin_backbone.py @@ -123,12 +123,6 @@ def __init__( "Depth, height and width of the video must be specified" " in `input_shape`." ) - if input_spec.shape[-3] != input_spec.shape[-2]: - raise ValueError( - "Input video must be square i.e. the height must" - " be equal to the width in the `input_shape`" - " tuple/tensor." - ) x = input_spec From 49b074a94cd259a20138ed8a4d672f70b7fabe08 Mon Sep 17 00:00:00 2001 From: innat Date: Mon, 25 Mar 2024 01:58:42 +0600 Subject: [PATCH 76/94] minor updates to swin layers --- keras_cv/layers/video_swin_layers.py | 52 +++++++------------ .../video_swin/video_swin_backbone.py | 2 +- 2 files changed, 21 insertions(+), 33 deletions(-) diff --git a/keras_cv/layers/video_swin_layers.py b/keras_cv/layers/video_swin_layers.py index 7da58885ce..56037b0cd6 100644 --- a/keras_cv/layers/video_swin_layers.py +++ b/keras_cv/layers/video_swin_layers.py @@ -299,19 +299,25 @@ def __init__( self.embed_dim = embed_dim self.norm_layer = norm_layer - def _compute_padding(self, dim, patch_size): + def __compute_padding(self, dim, patch_size): pad_amount = patch_size - (dim % patch_size) return [0, pad_amount if pad_amount != patch_size else 0] def build(self, input_shape): self.pads = [ [0, 0], - self._compute_padding(input_shape[1], self.patch_size[0]), - self._compute_padding(input_shape[2], self.patch_size[1]), - self._compute_padding(input_shape[3], self.patch_size[2]), + self.__compute_padding(input_shape[1], self.patch_size[0]), + self.__compute_padding(input_shape[2], self.patch_size[1]), + self.__compute_padding(input_shape[3], self.patch_size[2]), [0, 0], ] + if self.norm_layer is not None: + self.norm = self.norm_layer( + axis=-1, epsilon=1e-5, name="embed_norm" + ) + self.norm.build((None, None, None, None, self.embed_dim)) + self.proj = layers.Conv3D( self.embed_dim, kernel_size=self.patch_size, @@ -319,13 +325,6 @@ def build(self, input_shape): name="embed_proj", ) self.proj.build((None, None, None, None, input_shape[-1])) - - self.norm = None - if self.norm_layer is not None: - self.norm = self.norm_layer( - axis=-1, epsilon=1e-5, name="embed_norm" - ) - self.norm.build((None, None, None, None, self.embed_dim)) self.built = True def call(self, x): @@ -337,16 +336,6 @@ def call(self, x): return x - def compute_output_shape(self, input_shape): - spatial_dims = [ - (dim - self.patch_size[i]) // self.patch_size[i] + 1 - for i, dim in enumerate(input_shape[1:-1]) - ] - output_shape = ( - (input_shape[0],) + tuple(spatial_dims) + (self.embed_dim,) - ) - return output_shape - def get_config(self): config = super().get_config() config.update( @@ -387,7 +376,6 @@ def build(self, input_shape): (batch_size, depth, height // 2, width // 2, 4 * channel) ) - self.norm = None if self.norm_layer is not None: self.norm = self.norm_layer(axis=-1, epsilon=1e-5) self.norm.build( @@ -633,7 +621,7 @@ def __init__( attn_drop_rate=0.0, 
drop_path_rate=0.0, norm_layer=None, - downsample=None, + downsampling_layer=None, **kwargs, ): super().__init__(**kwargs) @@ -649,9 +637,9 @@ def __init__( self.attn_drop_rate = attn_drop_rate self.drop_path_rate = drop_path_rate self.norm_layer = norm_layer - self.downsample = downsample + self.downsampling_layer = downsampling_layer - def _compute_dim_padded(self, input_dim, window_dim_size): + def __compute_dim_padded(self, input_dim, window_dim_size): input_dim = ops.cast(input_dim, dtype="float32") window_dim_size = ops.cast(window_dim_size, dtype="float32") return ops.cast( @@ -662,13 +650,13 @@ def build(self, input_shape): self.window_size, self.shift_size = get_window_size( input_shape[1:-1], self.window_size, self.shift_size ) - self.depth_pad = self._compute_dim_padded( + self.depth_pad = self.__compute_dim_padded( input_shape[1], self.window_size[0] ) - self.height_pad = self._compute_dim_padded( + self.height_pad = self.__compute_dim_padded( input_shape[2], self.window_size[1] ) - self.width_pad = self._compute_dim_padded( + self.width_pad = self.__compute_dim_padded( input_shape[3], self.window_size[2] ) self.attn_mask = compute_mask( @@ -701,8 +689,8 @@ def build(self, input_shape): for i in range(self.depth) ] - if self.downsample is not None: - self.downsample = self.downsample( + if self.downsampling_layer is not None: + self.downsample = self.downsampling_layer( input_dim=self.input_dim, norm_layer=self.norm_layer ) self.downsample.build(input_shape) @@ -713,7 +701,7 @@ def build(self, input_shape): self.built = True def compute_output_shape(self, input_shape): - if self.downsample is not None: + if self.downsampling_layer is not None: input_shape = self.downsample.compute_output_shape(input_shape) return input_shape @@ -734,7 +722,7 @@ def call(self, x, training=None): x = ops.reshape(x, [batch_size, depth, height, width, channel]) - if self.downsample is not None: + if self.downsampling_layer is not None: x = self.downsample(x) return x diff --git a/keras_cv/models/backbones/video_swin/video_swin_backbone.py b/keras_cv/models/backbones/video_swin/video_swin_backbone.py index beff9cebee..0949d76071 100644 --- a/keras_cv/models/backbones/video_swin/video_swin_backbone.py +++ b/keras_cv/models/backbones/video_swin/video_swin_backbone.py @@ -166,7 +166,7 @@ def __init__( attn_drop_rate=attn_drop_rate, drop_path_rate=dpr[sum(depths[:i]) : sum(depths[: i + 1])], norm_layer=norm_layer, - downsample=( + downsampling_layer=( VideoSwinPatchMerging if (i < num_layers - 1) else None ), name=f"videoswin_basic_layer_{i + 1}", From 204e4b1cfb338eacc55dc922f8ec803b4dc17f4c Mon Sep 17 00:00:00 2001 From: innat Date: Mon, 25 Mar 2024 02:03:38 +0600 Subject: [PATCH 77/94] test method update for swin layers --- keras_cv/layers/video_swin_layers_test.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/keras_cv/layers/video_swin_layers_test.py b/keras_cv/layers/video_swin_layers_test.py index 4e72cb5e9a..0e9e71d00f 100644 --- a/keras_cv/layers/video_swin_layers_test.py +++ b/keras_cv/layers/video_swin_layers_test.py @@ -25,9 +25,9 @@ def test_patch_embedding_compute_output_shape(self): patch_embedding_model = VideoSwinPatchingAndEmbedding( patch_size=(2, 4, 4), embed_dim=96, norm_layer=None ) - input_shape = (None, 16, 32, 32, 3) - output_shape = patch_embedding_model.compute_output_shape(input_shape) - expected_output_shape = (None, 8, 8, 8, 96) + input_array = ops.ones(shape=(1, 16, 32, 32, 3)) + output_shape = patch_embedding_model(input_array).shape + 
expected_output_shape = (1, 8, 8, 8, 96)
         self.assertEqual(output_shape, expected_output_shape)
 
     def test_patch_embedding_get_config(self):

From 251495b8469ad1e5c210a918b7ca3ce3356caa5a Mon Sep 17 00:00:00 2001
From: innat
Date: Mon, 25 Mar 2024 02:09:15 +0600
Subject: [PATCH 78/94] update test method for swin backbone

---
 .../backbones/video_swin/video_swin_backbone_test.py | 10 ++++++++++
 1 file changed, 10 insertions(+)

diff --git a/keras_cv/models/backbones/video_swin/video_swin_backbone_test.py b/keras_cv/models/backbones/video_swin/video_swin_backbone_test.py
index 8146917837..1032f2d1fe 100644
--- a/keras_cv/models/backbones/video_swin/video_swin_backbone_test.py
+++ b/keras_cv/models/backbones/video_swin/video_swin_backbone_test.py
@@ -82,3 +82,13 @@ def test_can_run_on_gray_video(self):
         y = np.zeros((1, 48, 3, 3, 768))
         model.compile(optimizer="adam", loss="mse", metrics=["mse"])
         model.fit(x, y, epochs=1)
+
+    @pytest.mark.extra_large
+    def test_can_run_non_square_shape(self):
+        input_batch = np.ones(shape=(2, 8, 224, 256, 3))
+        model = VideoSwinBackbone(
+            input_shape=(8, 224, 256, 3),
+            include_rescaling=False,
+            num_classes=10,
+        )
+        model(input_batch)

From 599d48129a81a6d90ca43290a87642a1747b4b07 Mon Sep 17 00:00:00 2001
From: innat
Date: Mon, 25 Mar 2024 02:12:12 +0600
Subject: [PATCH 79/94] remove unused code

---
 keras_cv/layers/video_swin_layers.py | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/keras_cv/layers/video_swin_layers.py b/keras_cv/layers/video_swin_layers.py
index 56037b0cd6..9821a725b7 100644
--- a/keras_cv/layers/video_swin_layers.py
+++ b/keras_cv/layers/video_swin_layers.py
@@ -13,7 +13,6 @@
 # limitations under the License.
 
 import numpy as np
-import tensorflow as tf
 from keras import layers
 
 from keras_cv.api_export import keras_cv_export
@@ -406,7 +405,7 @@ def call(self, x):
         x = self.reduction(x)
 
         return x
-    
+
     def compute_output_shape(self, input_shape):
         batch_size, depth, height, width, _ = input_shape
         return (batch_size, depth, height // 2, width // 2, 2 * self.input_dim)

From a849b387a3210cd74ae1df9a08e79d29017a6230 Mon Sep 17 00:00:00 2001
From: innat
Date: Mon, 25 Mar 2024 02:29:48 +0600
Subject: [PATCH 80/94] bug fix in call method of patch embed layer

---
 keras_cv/layers/video_swin_layers.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/keras_cv/layers/video_swin_layers.py b/keras_cv/layers/video_swin_layers.py
index 9821a725b7..c5edf7128e 100644
--- a/keras_cv/layers/video_swin_layers.py
+++ b/keras_cv/layers/video_swin_layers.py
@@ -330,7 +330,7 @@ def call(self, x):
         x = ops.pad(x, self.pads)
         x = self.proj(x)
 
-        if self.norm is not None:
+        if self.norm_layer is not None:
             x = self.norm(x)
 
         return x

From f611b0e2d5fbb386f45fc4aa7f4db24d24a3b39f Mon Sep 17 00:00:00 2001
From: innat
Date: Mon, 25 Mar 2024 02:50:21 +0600
Subject: [PATCH 81/94] fix typo in patch merging layer

---
 keras_cv/layers/video_swin_layers.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/keras_cv/layers/video_swin_layers.py b/keras_cv/layers/video_swin_layers.py
index c5edf7128e..12b94ec198 100644
--- a/keras_cv/layers/video_swin_layers.py
+++ b/keras_cv/layers/video_swin_layers.py
@@ -400,7 +400,7 @@ def call(self, x):
         x3 = x[:, :, 1::2, 1::2, :]  # B D H/2 W/2 C
         x = ops.concatenate([x0, x1, x2, x3], axis=-1)  # B D H/2 W/2 4*C
 
-        if self.norm is not None:
+        if self.norm_layer is not None:
             x = self.norm(x)
 
         x = self.reduction(x)

From b7d26e4ddf912531e0a4b12085bb0cb8b6dd5885 Mon Sep 17 00:00:00 2001
From: innat
Date: Mon, 25 Mar 2024 
04:43:19 +0600 Subject: [PATCH 82/94] minor fix --- keras_cv/models/backbones/video_swin/video_swin_backbone_test.py | 1 - 1 file changed, 1 deletion(-) diff --git a/keras_cv/models/backbones/video_swin/video_swin_backbone_test.py b/keras_cv/models/backbones/video_swin/video_swin_backbone_test.py index 1032f2d1fe..0e049ea395 100644 --- a/keras_cv/models/backbones/video_swin/video_swin_backbone_test.py +++ b/keras_cv/models/backbones/video_swin/video_swin_backbone_test.py @@ -89,6 +89,5 @@ def test_can_run_non_square_shape(self): model = VideoSwinBackbone( input_shape=(8, 224, 256, 3), include_rescaling=False, - num_classes=10, ) model(input_batch) From e3e02dc3fdc474bc25845ef940c42183cd05b252 Mon Sep 17 00:00:00 2001 From: innat Date: Mon, 25 Mar 2024 22:09:38 +0600 Subject: [PATCH 83/94] fix keras.ops.cond issue with jax --- keras_cv/layers/video_swin_layers.py | 14 +++++--------- 1 file changed, 5 insertions(+), 9 deletions(-) diff --git a/keras_cv/layers/video_swin_layers.py b/keras_cv/layers/video_swin_layers.py index 12b94ec198..23fb3e63e0 100644 --- a/keras_cv/layers/video_swin_layers.py +++ b/keras_cv/layers/video_swin_layers.py @@ -871,6 +871,9 @@ def build(self, input_shape): [pad_l, self.pad_r], [0, 0], ] + self.do_pad = any( + value > 0 for value in (self.pad_d1, self.pad_r, self.pad_b) + ) self.built = True def first_forward(self, x, mask_matrix, training): @@ -941,15 +944,8 @@ def first_forward(self, x, mask_matrix, training): x = shifted_x # pad if required - do_pad = ops.logical_or( - ops.greater(self.pad_d1, 0), - ops.logical_or( - ops.greater(self.pad_r, 0), ops.greater(self.pad_b, 0) - ), - ) - x = ops.cond( - do_pad, lambda: x[:, :depth, :height, :width, :], lambda: x - ) + if self.do_pad: + return x[:, :depth, :height, :width, :] return x From a626b1f680a4f8e5a7c8e5a45417c81c7082bd96 Mon Sep 17 00:00:00 2001 From: innat Date: Mon, 25 Mar 2024 23:07:50 +0600 Subject: [PATCH 84/94] no test for jit compile in torch --- keras_cv/models/classification/video_classifier_test.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/keras_cv/models/classification/video_classifier_test.py b/keras_cv/models/classification/video_classifier_test.py index 7e3af58fce..94843aac74 100644 --- a/keras_cv/models/classification/video_classifier_test.py +++ b/keras_cv/models/classification/video_classifier_test.py @@ -51,6 +51,9 @@ def test_valid_call(self): ) @pytest.mark.large # Fit is slow, so mark these large. 
def test_classifier_fit(self, jit_compile): + if jit_compile and keras.backend.backend() == "torch": + self.skipTest("TODO: Torch Backend `jit_compile` fails on GPU.") + self.supports_jit = False model = VideoClassifier( backbone=VideoSwinBackbone( input_shape=(8, 224, 224, 3), include_rescaling=True From c484445745a5aaf96f87e328a55ac72a5154b7b2 Mon Sep 17 00:00:00 2001 From: innat Date: Mon, 25 Mar 2024 23:25:47 +0600 Subject: [PATCH 85/94] reduce tensor size for forward test --- keras_cv/models/classification/video_classifier_test.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/keras_cv/models/classification/video_classifier_test.py b/keras_cv/models/classification/video_classifier_test.py index 94843aac74..b3d658dbef 100644 --- a/keras_cv/models/classification/video_classifier_test.py +++ b/keras_cv/models/classification/video_classifier_test.py @@ -72,6 +72,7 @@ def test_classifier_fit(self, jit_compile): ("avg_pooling", "avg"), ("max_pooling", "max") ) def test_pooling_arg_call(self, pooling): + input_batch = np.ones(shape=(2, 8, 224, 224, 3)) model = VideoClassifier( backbone=VideoSwinBackbone( input_shape=(8, 224, 224, 3), include_rescaling=True @@ -79,17 +80,18 @@ def test_pooling_arg_call(self, pooling): num_classes=10, pooling=pooling, ) - model(self.input_batch) + model(input_batch) @pytest.mark.large # Saving is slow, so mark these large. def test_saved_model(self): + input_batch = np.ones(shape=(2, 8, 224, 224, 3)) model = VideoClassifier( backbone=VideoSwinBackbone( input_shape=(8, 224, 224, 3), include_rescaling=False ), num_classes=10, ) - model_output = model(self.input_batch) + model_output = model(input_batch) save_path = os.path.join(self.get_temp_dir(), "video_classifier.keras") model.save(save_path) restored_model = keras.models.load_model(save_path) @@ -98,7 +100,7 @@ def test_saved_model(self): self.assertIsInstance(restored_model, VideoClassifier) # Check that output matches. 
- restored_output = restored_model(self.input_batch) + restored_output = restored_model(input_batch) self.assertAllClose( ops.convert_to_numpy(model_output), ops.convert_to_numpy(restored_output), From 45945c96933141766ecb62136bfd056d5c7966b6 Mon Sep 17 00:00:00 2001 From: innat Date: Thu, 28 Mar 2024 17:02:26 +0600 Subject: [PATCH 86/94] minor fix --- keras_cv/layers/video_swin_layers.py | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) diff --git a/keras_cv/layers/video_swin_layers.py b/keras_cv/layers/video_swin_layers.py index 23fb3e63e0..e6d930f91a 100644 --- a/keras_cv/layers/video_swin_layers.py +++ b/keras_cv/layers/video_swin_layers.py @@ -820,10 +820,7 @@ def build(self, input_shape): self.window_size, self.shift_size = get_window_size( input_shape[1:-1], self.window_size, self.shift_size ) - - self.apply_cyclic_shift = False - if any(i > 0 for i in self.shift_size): - self.apply_cyclic_shift = True + self.apply_cyclic_shift = any(i > 0 for i in self.shift_size) # layers self.drop_path = ( @@ -871,7 +868,7 @@ def build(self, input_shape): [pad_l, self.pad_r], [0, 0], ] - self.do_pad = any( + self.apply_pad = any( value > 0 for value in (self.pad_d1, self.pad_r, self.pad_b) ) self.built = True @@ -944,7 +941,7 @@ def first_forward(self, x, mask_matrix, training): x = shifted_x # pad if required - if self.do_pad: + if self.apply_pad: return x[:, :depth, :height, :width, :] return x From f866d12cdcae6ff38990d2e1c5b40debe8c1e05b Mon Sep 17 00:00:00 2001 From: innat Date: Sun, 31 Mar 2024 12:06:51 +0600 Subject: [PATCH 87/94] remove kcv export decorator --- keras_cv/layers/video_swin_layers.py | 6 ------ 1 file changed, 6 deletions(-) diff --git a/keras_cv/layers/video_swin_layers.py b/keras_cv/layers/video_swin_layers.py index e6d930f91a..0e89ab6e27 100644 --- a/keras_cv/layers/video_swin_layers.py +++ b/keras_cv/layers/video_swin_layers.py @@ -269,9 +269,6 @@ def get_config(self): return config -@keras_cv_export( - "keras_cv.layers.VideoSwinPatchingAndEmbedding", package="keras_cv.layers" -) class VideoSwinPatchingAndEmbedding(keras.Model): """Video to Patch Embedding layer for Video Swin Transformer models. @@ -745,9 +742,6 @@ def get_config(self): return config -@keras_cv_export( - "keras_cv.layers.VideoSwinTransformerBlock", package="keras_cv.layers" -) class VideoSwinTransformerBlock(keras.Model): """Video Swin Transformer Block. From bfb62a47cc1147097fa5fde53abcb1c428796988 Mon Sep 17 00:00:00 2001 From: innat Date: Sun, 31 Mar 2024 12:10:50 +0600 Subject: [PATCH 88/94] update keras.Layer import --- keras_cv/layers/video_swin_layers.py | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/keras_cv/layers/video_swin_layers.py b/keras_cv/layers/video_swin_layers.py index 0e89ab6e27..8f58f230b2 100644 --- a/keras_cv/layers/video_swin_layers.py +++ b/keras_cv/layers/video_swin_layers.py @@ -13,9 +13,7 @@ # limitations under the License. import numpy as np -from keras import layers -from keras_cv.api_export import keras_cv_export from keras_cv.backend import keras from keras_cv.backend import ops from keras_cv.layers import DropPath @@ -214,7 +212,7 @@ def compute_mask(depth, height, width, window_size, shift_size): return attn_mask -class MLP(layers.Layer): +class MLP(keras.Layer): """A Multilayer perceptron(MLP) layer. Args: @@ -343,7 +341,7 @@ def get_config(self): return config -class VideoSwinPatchMerging(layers.Layer): +class VideoSwinPatchMerging(keras.Layer): """Patch Merging Layer in Video Swin Transformer models. 
     This layer performs a downsampling step by merging four neighboring patches

From 57f0012786e2e92314a3a45ff56bc02b3e495d58 Mon Sep 17 00:00:00 2001
From: innat
Date: Sun, 31 Mar 2024 13:34:00 +0600
Subject: [PATCH 89/94] remove unused layer import

---
 keras_cv/layers/__init__.py | 3 ---
 1 file changed, 3 deletions(-)

diff --git a/keras_cv/layers/__init__.py b/keras_cv/layers/__init__.py
index ae4f6007f5..0bfa2aa8ec 100644
--- a/keras_cv/layers/__init__.py
+++ b/keras_cv/layers/__init__.py
@@ -135,9 +135,6 @@
 )
 from keras_cv.layers.spatial_pyramid import SpatialPyramidPooling
 from keras_cv.layers.transformer_encoder import TransformerEncoder
-from keras_cv.layers.video_swin_layers import VideoSwinBasicLayer
-from keras_cv.layers.video_swin_layers import VideoSwinPatchingAndEmbedding
-from keras_cv.layers.video_swin_layers import VideoSwinPatchMerging
 from keras_cv.layers.vit_det_layers import AddRelativePositionalEmbedding
 from keras_cv.layers.vit_det_layers import MultiHeadAttentionWithRelativePE
 from keras_cv.layers.vit_det_layers import ViTDetPatchingAndEmbedding

From 7602052986cf38508d093603fd9c702ce269a5e4 Mon Sep 17 00:00:00 2001
From: innat
Date: Sun, 31 Mar 2024 13:42:19 +0600
Subject: [PATCH 90/94] use keras.layers instead of layers

---
 keras_cv/layers/video_swin_layers.py | 26 ++++++++++++++------------
 1 file changed, 14 insertions(+), 12 deletions(-)

diff --git a/keras_cv/layers/video_swin_layers.py b/keras_cv/layers/video_swin_layers.py
index 8f58f230b2..a265052c23 100644
--- a/keras_cv/layers/video_swin_layers.py
+++ b/keras_cv/layers/video_swin_layers.py
@@ -236,10 +236,10 @@ def __init__(
         self.hidden_dim = hidden_dim
         self._activation_identifier = activation
         self.drop_rate = drop_rate
-        self.activation = layers.Activation(self._activation_identifier)
-        self.fc1 = layers.Dense(self.hidden_dim)
-        self.fc2 = layers.Dense(self.output_dim)
-        self.dropout = layers.Dropout(self.drop_rate)
+        self.activation = keras.layers.Activation(self._activation_identifier)
+        self.fc1 = keras.layers.Dense(self.hidden_dim)
+        self.fc2 = keras.layers.Dense(self.output_dim)
+        self.dropout = keras.layers.Dropout(self.drop_rate)
 
     def build(self, input_shape):
         self.fc1.build(input_shape)
@@ -312,7 +312,7 @@ def build(self, input_shape):
             )
             self.norm.build((None, None, None, None, self.embed_dim))
 
-        self.proj = layers.Conv3D(
+        self.proj = keras.layers.Conv3D(
             self.embed_dim,
             kernel_size=self.patch_size,
             strides=self.patch_size,
@@ -365,7 +365,7 @@ def __init__(self, input_dim, norm_layer=None, **kwargs):
 
     def build(self, input_shape):
         batch_size, depth, height, width, channel = input_shape
-        self.reduction = layers.Dense(2 * self.input_dim, use_bias=False)
+        self.reduction = keras.layers.Dense(2 * self.input_dim, use_bias=False)
         self.reduction.build(
             (batch_size, depth, height // 2, width // 2, 4 * channel)
         )
@@ -500,10 +500,12 @@ def build(self, input_shape):
         )
 
         # layers
-        self.qkv = layers.Dense(self.input_dim * 3, use_bias=self.qkv_bias)
-        self.attn_drop = layers.Dropout(self.attn_drop_rate)
-        self.proj = layers.Dense(self.input_dim)
-        self.proj_drop = layers.Dropout(self.proj_drop_rate)
+        self.qkv = keras.layers.Dense(
+            self.input_dim * 3, use_bias=self.qkv_bias
+        )
+        self.attn_drop = keras.layers.Dropout(self.attn_drop_rate)
+        self.proj = keras.layers.Dense(self.input_dim)
+        self.proj_drop = keras.layers.Dropout(self.proj_drop_rate)
         self.qkv.build(input_shape)
         self.proj.build(input_shape)
         self.built = True
@@ -779,7 +781,7 @@ def __init__(
         attn_drop_rate=0.0,
         drop_path_rate=0.0,
         activation="gelu",
-        norm_layer=layers.LayerNormalization,
+        norm_layer=keras.layers.LayerNormalization,
         **kwargs,
     ):
         super().__init__(**kwargs)
@@ -818,7 +820,7 @@ def build(self, input_shape):
         self.drop_path = (
             DropPath(self.drop_path_rate)
             if self.drop_path_rate > 0.0
-            else layers.Identity()
+            else keras.layers.Identity()
         )
 
         self.norm1 = self.norm_layer(axis=-1, epsilon=1e-05)

From 837286dc3098de8d6950974b34f41cfdf5a0e4c2 Mon Sep 17 00:00:00 2001
From: innat
Date: Sun, 31 Mar 2024 14:16:23 +0600
Subject: [PATCH 91/94] update keras.Layer to keras.layers.Layer for keras2

---
 keras_cv/layers/video_swin_layers.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/keras_cv/layers/video_swin_layers.py b/keras_cv/layers/video_swin_layers.py
index a265052c23..7aad7391ec 100644
--- a/keras_cv/layers/video_swin_layers.py
+++ b/keras_cv/layers/video_swin_layers.py
@@ -212,7 +212,7 @@ def compute_mask(depth, height, width, window_size, shift_size):
     return attn_mask
 
 
-class MLP(keras.Layer):
+class MLP(keras.layers.Layer):
     """A Multilayer perceptron(MLP) layer.
 
     Args:
@@ -341,7 +341,7 @@ def get_config(self):
         return config
 
 
-class VideoSwinPatchMerging(keras.Layer):
+class VideoSwinPatchMerging(keras.layers.Layer):
     """Patch Merging Layer in Video Swin Transformer models.
 
     This layer performs a downsampling step by merging four neighboring patches

From 6d44ecac284848a1bc17be6f3d8013d0278ffbbc Mon Sep 17 00:00:00 2001
From: innat
Date: Sun, 31 Mar 2024 17:36:20 +0600
Subject: [PATCH 92/94] add window_size param to aliases

---
 keras_cv/models/backbones/video_swin/video_swin_aliases.py | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/keras_cv/models/backbones/video_swin/video_swin_aliases.py b/keras_cv/models/backbones/video_swin/video_swin_aliases.py
index 84233b0127..161ba0bbb4 100644
--- a/keras_cv/models/backbones/video_swin/video_swin_aliases.py
+++ b/keras_cv/models/backbones/video_swin/video_swin_aliases.py
@@ -48,6 +48,7 @@ def __new__(
         embed_dim=96,
         depths=[2, 2, 6, 2],
         num_heads=[3, 6, 12, 24],
+        window_size=[8, 7, 7],
         include_rescaling=True,
         **kwargs,
     ):
@@ -56,6 +57,7 @@ def __new__(
             "embed_dim": embed_dim,
             "depths": depths,
             "num_heads": num_heads,
+            "window_size": window_size,
             "include_rescaling": include_rescaling,
         }
     )
@@ -83,6 +85,7 @@ def __new__(
         embed_dim=96,
         depths=[2, 2, 18, 2],
         num_heads=[3, 6, 12, 24],
+        window_size=[8, 7, 7],
         include_rescaling=True,
         **kwargs,
     ):
@@ -91,6 +94,7 @@ def __new__(
             "embed_dim": embed_dim,
             "depths": depths,
             "num_heads": num_heads,
+            "window_size": window_size,
             "include_rescaling": include_rescaling,
         }
     )
@@ -118,6 +122,7 @@ def __new__(
         embed_dim=128,
         depths=[2, 2, 18, 2],
         num_heads=[4, 8, 16, 32],
+        window_size=[8, 7, 7],
         include_rescaling=True,
         **kwargs,
     ):
@@ -126,6 +131,7 @@ def __new__(
             "embed_dim": embed_dim,
             "depths": depths,
             "num_heads": num_heads,
+            "window_size": window_size,
             "include_rescaling": include_rescaling,
         }
     )

From f5dce04d92cfeb20543a0bb69a549f0303473a42 Mon Sep 17 00:00:00 2001
From: innat
Date: Wed, 3 Apr 2024 05:30:01 +0600
Subject: [PATCH 93/94] move video swin layers to model specific directory

---
 keras_cv/models/backbones/video_swin/video_swin_backbone.py | 6 +++---
 .../backbones/video_swin}/video_swin_layers.py              | 0
 .../backbones/video_swin}/video_swin_layers_test.py         | 6 +++---
 3 files changed, 6 insertions(+), 6 deletions(-)
 rename keras_cv/{layers => models/backbones/video_swin}/video_swin_layers.py (100%)
 rename keras_cv/{layers => models/backbones/video_swin}/video_swin_layers_test.py (91%)

diff --git 
a/keras_cv/models/backbones/video_swin/video_swin_backbone.py b/keras_cv/models/backbones/video_swin/video_swin_backbone.py index 0949d76071..c456e4fb74 100644 --- a/keras_cv/models/backbones/video_swin/video_swin_backbone.py +++ b/keras_cv/models/backbones/video_swin/video_swin_backbone.py @@ -19,9 +19,9 @@ from keras_cv.api_export import keras_cv_export from keras_cv.backend import keras -from keras_cv.layers.video_swin_layers import VideoSwinBasicLayer -from keras_cv.layers.video_swin_layers import VideoSwinPatchingAndEmbedding -from keras_cv.layers.video_swin_layers import VideoSwinPatchMerging +from keras_cv.models.backbones.video_swin.video_swin_layers import VideoSwinBasicLayer +from keras_cv.models.backbones.video_swin.video_swin_layers import VideoSwinPatchingAndEmbedding +from keras_cv.models.backbones.video_swin.video_swin_layers import VideoSwinPatchMerging from keras_cv.models import utils from keras_cv.models.backbones.backbone import Backbone from keras_cv.models.backbones.video_swin.video_swin_backbone_presets import ( # noqa: E501 diff --git a/keras_cv/layers/video_swin_layers.py b/keras_cv/models/backbones/video_swin/video_swin_layers.py similarity index 100% rename from keras_cv/layers/video_swin_layers.py rename to keras_cv/models/backbones/video_swin/video_swin_layers.py diff --git a/keras_cv/layers/video_swin_layers_test.py b/keras_cv/models/backbones/video_swin/video_swin_layers_test.py similarity index 91% rename from keras_cv/layers/video_swin_layers_test.py rename to keras_cv/models/backbones/video_swin/video_swin_layers_test.py index 0e9e71d00f..b8dc784ed1 100644 --- a/keras_cv/layers/video_swin_layers_test.py +++ b/keras_cv/models/backbones/video_swin/video_swin_layers_test.py @@ -14,9 +14,9 @@ from keras_cv.backend import ops -from keras_cv.layers.video_swin_layers import VideoSwinPatchingAndEmbedding -from keras_cv.layers.video_swin_layers import VideoSwinPatchMerging -from keras_cv.layers.video_swin_layers import VideoSwinWindowAttention +from keras_cv.models.backbones.video_swin.video_swin_layers import VideoSwinPatchingAndEmbedding +from keras_cv.models.backbones.video_swin.video_swin_layers import VideoSwinPatchMerging +from keras_cv.models.backbones.video_swin.video_swin_layers import VideoSwinWindowAttention from keras_cv.tests.test_case import TestCase From 0ba9fdf79f75c4b87758e12309549a438a32e1d9 Mon Sep 17 00:00:00 2001 From: innat Date: Wed, 3 Apr 2024 05:38:46 +0600 Subject: [PATCH 94/94] minor fix --- .../backbones/video_swin/video_swin_backbone.py | 12 +++++++++--- .../backbones/video_swin/video_swin_layers_test.py | 12 +++++++++--- 2 files changed, 18 insertions(+), 6 deletions(-) diff --git a/keras_cv/models/backbones/video_swin/video_swin_backbone.py b/keras_cv/models/backbones/video_swin/video_swin_backbone.py index c456e4fb74..9bb62eb385 100644 --- a/keras_cv/models/backbones/video_swin/video_swin_backbone.py +++ b/keras_cv/models/backbones/video_swin/video_swin_backbone.py @@ -19,9 +19,6 @@ from keras_cv.api_export import keras_cv_export from keras_cv.backend import keras -from keras_cv.models.backbones.video_swin.video_swin_layers import VideoSwinBasicLayer -from keras_cv.models.backbones.video_swin.video_swin_layers import VideoSwinPatchingAndEmbedding -from keras_cv.models.backbones.video_swin.video_swin_layers import VideoSwinPatchMerging from keras_cv.models import utils from keras_cv.models.backbones.backbone import Backbone from keras_cv.models.backbones.video_swin.video_swin_backbone_presets import ( # noqa: E501 @@ -30,6 +27,15 @@ 
from keras_cv.models.backbones.video_swin.video_swin_backbone_presets import ( # noqa: E501 backbone_presets_with_weights, ) +from keras_cv.models.backbones.video_swin.video_swin_layers import ( + VideoSwinBasicLayer, +) +from keras_cv.models.backbones.video_swin.video_swin_layers import ( + VideoSwinPatchingAndEmbedding, +) +from keras_cv.models.backbones.video_swin.video_swin_layers import ( + VideoSwinPatchMerging, +) from keras_cv.utils.python_utils import classproperty diff --git a/keras_cv/models/backbones/video_swin/video_swin_layers_test.py b/keras_cv/models/backbones/video_swin/video_swin_layers_test.py index b8dc784ed1..c0b540d1c0 100644 --- a/keras_cv/models/backbones/video_swin/video_swin_layers_test.py +++ b/keras_cv/models/backbones/video_swin/video_swin_layers_test.py @@ -14,9 +14,15 @@ from keras_cv.backend import ops -from keras_cv.models.backbones.video_swin.video_swin_layers import VideoSwinPatchingAndEmbedding -from keras_cv.models.backbones.video_swin.video_swin_layers import VideoSwinPatchMerging -from keras_cv.models.backbones.video_swin.video_swin_layers import VideoSwinWindowAttention +from keras_cv.models.backbones.video_swin.video_swin_layers import ( + VideoSwinPatchingAndEmbedding, +) +from keras_cv.models.backbones.video_swin.video_swin_layers import ( + VideoSwinPatchMerging, +) +from keras_cv.models.backbones.video_swin.video_swin_layers import ( + VideoSwinWindowAttention, +) from keras_cv.tests.test_case import TestCase
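
With PATCH 93 and PATCH 94 applied, the Video Swin layers are imported
from the backbone's own package rather than keras_cv.layers. A minimal
sketch of the relocated patch-embedding layer in isolation, mirroring
the shape expectation of test_patch_embedding_compute_output_shape
(the NumPy input is illustrative):

    import numpy as np

    from keras_cv.models.backbones.video_swin.video_swin_layers import (
        VideoSwinPatchingAndEmbedding,
    )

    # A (2, 4, 4) patch size tiles the clip along (depth, height, width),
    # so a (1, 16, 32, 32, 3) video becomes a (1, 8, 8, 8, 96) grid of
    # 96-dimensional tokens.
    embed = VideoSwinPatchingAndEmbedding(
        patch_size=(2, 4, 4), embed_dim=96, norm_layer=None
    )
    tokens = embed(np.ones((1, 16, 32, 32, 3), dtype="float32"))
    print(tokens.shape)  # (1, 8, 8, 8, 96)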