diff --git a/pyproject.toml b/pyproject.toml
index 91f8c3e..7956a36 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -4,12 +4,12 @@ build-backend = "poetry.core.masonry.api"
 
 [tool.poetry]
 name = "swarms-torch"
-version = "0.2.1"
+version = "0.2.2"
 description = "swarms-torch - Pytorch"
 license = "MIT"
 authors = ["Kye Gomez "]
 homepage = "https://github.com/kyegomez/swarms-pytorch"
-documentation = "" # Add this if you have documentation.
+documentation = "https://github.com/kyegomez/swarms-pytorch"
 readme = "README.md" # Assuming you have a README.md
 repository = "https://github.com/kyegomez/swarms-pytorch"
 keywords = ["artificial intelligence", "deep learning", "optimizers", "Prompt Engineering"]
@@ -29,10 +29,13 @@ packages = [
 
 [tool.poetry.dependencies]
 python = "^3.6"
-torch = "2.1.2"
-einops = "0.7.0"
-zetascale = "1.4.4"
-pytest = "7.4.2"
+torch = "*"
+einops = "*"
+zetascale = "*"
+pytest = "*"
+torchvision = "*"
+loguru = "*"
+einx = "*"
diff --git a/requirements.txt b/requirements.txt
index 32271e9..ef388af 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,8 +1,7 @@
-torch==2.1.2
-einops==0.7.0
-pandas==2.2.1
-zetascale==1.4.4
-pytest==7.4.2
-mkdocs
-mkdocs-material
-mkdocs-glightbox
+torch
+einops
+zetascale
+pytest
+torchvision
+loguru
+einx
diff --git a/swarms_torch/ant_colony_swarm.py b/swarms_torch/ant_colony_swarm.py
index 65c7eb6..487e650 100644
--- a/swarms_torch/ant_colony_swarm.py
+++ b/swarms_torch/ant_colony_swarm.py
@@ -1,7 +1,7 @@
 import torch
+from torch import nn
 
-
-class AntColonyOptimization:
+class AntColonyOptimization(nn.Module):
     """
     Ant Colony Optimization
     Overview: https://en.wikipedia.org/wiki/Ant_colony_optimization_algorithms
diff --git a/swarms_torch/drone_swarm.py b/swarms_torch/drone_swarm.py
deleted file mode 100644
index 85768d4..0000000
--- a/swarms_torch/drone_swarm.py
+++ /dev/null
@@ -1,376 +0,0 @@
-import torch
-from torch import nn, Tensor
-from dataclasses import dataclass
-from zeta.nn import FeedForward
-from typing import Any
-import torch.nn.functional as F
-
-
-OBST_COLOR_3 = (0.0, 0.5, 0.0)
-OBST_COLOR_4 = (0.0, 0.5, 0.0, 1.0)
-
-
-QUADS_OBS_REPR = {
-    "xyz_vxyz_R_omega": 18,
-    "xyz_vxyz_R_omega_floor": 19,
-    "xyz_vxyz_R_omega_wall": 24,
-}
-
-QUADS_NEIGHBOR_OBS_TYPE = {
-    "none": 0,
-    "pos_vel": 6,
-}
-
-QUADS_OBSTACLE_OBS_TYPE = {
-    "none": 0,
-    "octomap": 9,
-}
-
-
-@dataclass
-class OneHeadAttention(nn.Module):
-    """
-    OneHeadAttention module performs self-attention operation on input tensors.
-
-    Args:
-        dim (int): The dimension of the input tensors.
-
-    Attributes:
-        w_qs (nn.Linear): Linear layer for queries transformation.
-        w_ks (nn.Linear): Linear layer for keys transformation.
-        w_vs (nn.Linear): Linear layer for values transformation.
-        fc (nn.Linear): Linear layer for final transformation.
-        ln (nn.LayerNorm): Layer normalization for output.
-
-    Methods:
-        forward(q, k, v): Performs forward pass of the self-attention operation.
-
-    """
-
-    dim: int
-
-    def __post_init_(self):
-        self.w_qs = nn.Linear(self.dim, self.dim, bias=False)
-        self.w_ks = nn.Linear(self.dim, self.dim, bias=False)
-        self.w_vs = nn.Linear(self.dim, self.dim, bias=False)
-
-        self.fc = nn.Linear(self.dim, self.dim, bias=False)
-        self.ln = nn.LayerNorm(self.dim, eps=1e-6)
-
-    def forward(self, q, k, v):
-        """
-        Performs forward pass of the self-attention operation.
-
-        Args:
-            q (torch.Tensor): The query tensor.
-            k (torch.Tensor): The key tensor.
-            v (torch.Tensor): The value tensor.
-
-        Returns:
-            q (torch.Tensor): The output tensor after self-attention operation.
-            attn (torch.Tensor): The attention weights.
-
-        """
-        residual = q
-
-        # Pre attn ops
-        q = self.w_qs(q)
-        k = self.w_ks(k)
-        v = self.w_vs(v)
-
-        # Compute attention weights using queries and keys
-        attn = torch.matmul(q / (self.dim**-0.5), k.tranpose(-1, -2))
-        attn = F.softmax(attn, dim=-1)
-        q = torch.matmul(attn, v)
-        q = self.fc(q)
-        q += residual
-        q = self.ln(q)
-        return q, attn
-
-
-def estimate_neuron_score(act):
-    reduce_axes = list(range(act.dim() - 1))
-    score = torch.mean(torch.abs(act), dim=reduce_axes)
-    return score
-
-
-@dataclass
-class SwarmNeighborhoodEncoder(nn.Module):
-    """
-    A class representing the encoder for swarm neighborhood observations.
-
-    Args:
-        self_obs_dim (int): The dimension of the self-observation.
-        neighbor_obs_dim (int): The dimension of the neighbor observations.
-        neighbor_hidden_size (int): The hidden size of the neighbor encoder.
-        num_use_neighbor_obs (int): The number of neighbor observations to use.
-    """
-
-    self_obs_dim: int
-    neighbor_obs_dim: int
-    neighbor_hidden_size: int
-    num_use_neighbor_obs: int
-
-
-@dataclass
-class SwarmNeighborhoodEncoderDeepsets(SwarmNeighborhoodEncoder):
-    neighbor_obs_dim: int
-    neighbor_hidden_size: int
-    self_obs_dim: int
-    num_use_neighbor_obs: int
-    mult: int = 4
-    args: dict = None
-
-    def __post_init__(self):
-        self.ffn = FeedForward(
-            self.neighbor_obs_dim,
-            self.neighbor_hidden_size,
-            self.mult,
-            self.args,
-        )
-
-    def forward(
-        self,
-        self_obs: Tensor,
-        obs: Tensor,
-        all_neighbor_obs_size: int,
-        batch: int,
-    ) -> Tensor:
-        """
-        Forward pass of the SwarmNeighborhoodEncoder.
-
-        Args:
-            self_obs (Tensor): Self observation tensor.
-            obs (Tensor): Observation tensor.
-            all_neighbor_obs_size (int): Size of all neighbor observations.
-            batch (int): Batch size.
-
-        Returns:
-            Tensor: Mean embedding tensor.
-        """
-        obs_neighbors = obs[
-            :, self.self_obs_dim : self.self_obs_dim + all_neighbor_obs_size
-        ]
-        obs_neighbors = obs_neighbors.reshape(-1, self.neighbor_obs_dim)
-        neighbor_embeds = self.embedding_mlp(obs_neighbors)
-        neighbor_embeds = neighbor_embeds.reshape(
-            batch, -1, self.neighbor_hidden_size
-        )
-        mean_embed = torch.mean(neighbor_embeds, dim=1)
-        return mean_embed
-
-
-@dataclass
-class SwarmNeighborhoodEncoderAttention(SwarmNeighborhoodEncoder):
-    """
-    A class that represents a swarm neighborhood encoder with attention mechanism.
-
-    Args:
-        neighbor_obs_dim (int): The dimension of the neighbor observations.
-        neighbor_hidden_size (int): The hidden size of the neighbor encoder.
-        self_obs_dim (int): The dimension of the self observations.
-        num_use_neighbor_obs (int): The number of neighbor observations to use.
-        mult (int, optional): The multiplier for the hidden size in the MLPs. Defaults to 4.
-        args (dict, optional): Additional arguments for the MLPs. Defaults to None.
-    """
-
-    neighbor_obs_dim: int
-    neighbor_hidden_size: int
-    self_obs_dim: int
-    num_use_neighbor_obs: int
-    mult: int = 4
-    args: dict = None
-
-    def __post_init__(self):
-        self.embedding_mlp = FeedForward(
-            self.self_obs_dim + self.neighbor_obs_dim,
-            self.neighbor_hidden_size,
-            self.mult,
-            self.args,
-        )
-
-        self.neighbor_value_mlp = FeedForward(
-            self.neighbor_hidden_size,
-            self.neighbor_hidden_size,
-            self.mult,
-            self.args,
-        )
-
-        # Outputs scalar score alpha_i for each neighbor
-        self.attention_mlp = FeedForward(
-            self.neighbor_hidden_size * 2,
-            self.neighbor_hidden_size,
-            self.mult,
-            self.args,
-        )
-
-    def forward(
-        self,
-        self_obs: Tensor,
-        obs: Tensor,
-        all_neighbor_obs_size: int,
-        batch_size: int,
-    ) -> Tensor:
-        obs_neighbors = obs[
-            :, self.self_obs_dim : self.self_obs_dim + all_neighbor_obs_size
-        ]
-        obs_neighbors = obs_neighbors.reshape(-1, self.neighbor_obs_dim)
-
-        # Concat self observation with neighbor observation
-        self_obs_repeat = self_obs.repeat(self.num_use_neighbor_obs, 1)
-        mlp_input = torch.cat((self_obs_repeat, obs_neighbors), dim=1)
-        neighbor_embeddings = self.embedding_mlp(mlp_input)
-        neighbor_values = self.neighbor_value_mlp(neighbor_embeddings)
-        neighbor_embeddings_mean_input = neighbor_embeddings.reshape(
-            batch_size, -1, self.neighbor_hidden_size
-        )
-        neighbor_embeddings_mean = torch.mean(
-            neighbor_embeddings_mean_input, dim=1
-        )
-        neighbor_embeddings_mean_repeat = neighbor_embeddings_mean.repeat(
-            self.num_use_neighbor_obs, 1
-        )
-        attention_mlp_input = torch.cat(
-            (neighbor_embeddings, neighbor_embeddings_mean_repeat), dim=1
-        )
-        attention_weights = self.attention_mlp(attention_mlp_input).view(
-            batch_size, -1
-        )
-        attention_weights_softmax = torch.nn.functional.softmax(
-            attention_weights, dim=1
-        )
-        attention_weights_softmax = attention_weights_softmax.view(-1, 1)
-
-        final_neighbor_embedding = attention_weights_softmax * neighbor_values
-        final_neighbor_embedding = final_neighbor_embedding.view(
-            batch_size, -1, self.neighbor_hidden_size
-        )
-        final_neighbor_embedding = torch.sum(final_neighbor_embedding, dim=1)
-
-        return final_neighbor_embedding
-
-
-@dataclass
-class SwarmNeighborEncoderMLP(SwarmNeighborhoodEncoder):
-    """
-    A class representing a multi-layer perceptron (MLP) encoder for swarm neighbor observations.
-
-    Args:
-        neighbor_obs_dim (int): The dimension of each neighbor observation.
-        neighbor_hidden_size (int): The size of the hidden layer in the MLP.
-        self_obs_dim (int): The dimension of the self observation.
-        num_use_neighbor_obs (int): The number of neighbor observations to use.
-        mult (int, optional): The multiplier for the hidden layer size. Defaults to 4.
-        args (dict, optional): Additional arguments for the MLP. Defaults to None.
-    """
-
-    neighbor_obs_dim: int
-    neighbor_hidden_size: int
-    self_obs_dim: int
-    num_use_neighbor_obs: int
-    mult: int = 4
-    args: dict = None
-
-    def __post_init__(self):
-        """
-        Initialize the MLP encoder.
-
-        This method creates an MLP with the specified dimensions and parameters.
-        """
-        self.neighbor_mlp = FeedForward(
-            self.neighbor_obs_dim * self.num_use_neighbor_obs,
-            self.neighbor_hidden_size,
-            self.mult,
-            self.args,
-        )
-
-    def forward(
-        self,
-        self_obs: Tensor,
-        obs: Tensor,
-        all_neighbor_obs_size: int,
-        batch_size: int,
-    ) -> Tensor:
-        """
-        Perform a forward pass through the MLP encoder.
-
-        Args:
-            self_obs (Tensor): The self observation tensor.
-            obs (Tensor): The observation tensor.
-            all_neighbor_obs_size (int): The size of all neighbor observations.
-            batch_size (int): The size of the batch.
-
-        Returns:
-            Tensor: The final neighborhood embedding tensor.
-        """
-        obs_neighbors = obs[
-            :, self.self_obs_dim : self.self_obs_dim + all_neighbor_obs_size
-        ]
-        final_neighborhood_embedding = self.neighbor_mlp(obs_neighbors)
-        return final_neighborhood_embedding
-
-
-@dataclass
-class SwarmMultiHeadAttentionEncoder(nn.Module):
-    dim: int
-
-
-@dataclass
-class QuadSingleHeadAttentionEncoderSim2Real(SwarmMultiHeadAttentionEncoder):
-    obs_space: int
-    quads_obs_repr: Any
-    neighbor_hidden_size: int
-    quads_neighbor_hidden_size: int
-    use_obstacles: Any
-    quads_use_obstacles: Any
-    quads_neighbor_visible_num: int
-    num_use_neighbor_obs: int
-    quads_num_agents: int
-    quads_neighbor_obs_type: Any
-    rnn_size: int
-
-    def __post_init__(self):
-        if self.quads_obs_repr in QUADS_OBS_REPR:
-            self.self_obs_dim = QUADS_OBS_REPR[self.quads_obs_repr]
-        else:
-            raise NotImplementedError(
-                f"Unknown observation representation {self.quads_obs_repr}"
-            )
-
-        self.neighborbor_hidden_size = self.quads_neighbor_hidden_size
-        self.use_obstacles = self.quads_use_obstacles
-
-        if self.quads_neighbor_visible_num == 1:
-            self.num_use_neighbor_obs = self.quads_num_agents - 1
-        else:
-            self.num_use_neighbor_obs = self.quads_neighbor_visible_num
-
-        self.neighbor_obs_dim = QUADS_NEIGHBOR_OBS_TYPE[
-            self.quads_neighbor_obs_type
-        ]
-        self.all_neighbor_obs_dim = (
-            self.neighbor_obs_dim * self.num_use_neighbor_obs
-        )
-
-        self.self_embed_layer = nn.Sequential(
-            nn.Linear(self.self_obs_dim, self.rnn_size),
-            nn.ReLU(),
-        )
-        self.neighbor_embed_layer = nn.Sequential(
-            nn.Linear(self.all_neighbor_obs_dim, self.rnn_size),
-            nn.ReLU(),
-        )
-        self.obstacle_obs_dim = QUADS_OBSTACLE_OBS_TYPE[
-            self.quads_obstacle_obs_type
-        ]
-        self.obstacle_embed_layer = nn.Sequential(
-            nn.Linear(self.obstacle_obs_dim, self.rnn_size),
-            nn.ReLU(),
-        )
-        self.attn = OneHeadAttention(self.rnn_size)
-        self.encoder_output_size = self.rnn_size
-
-        self.ffn = FeedForward(
-            3 * self.rnn_size,
-            self.encoder_output_size,
-        )
diff --git a/swarms_torch/firefly.py b/swarms_torch/firefly.py
index b5cc063..71a9172 100644
--- a/swarms_torch/firefly.py
+++ b/swarms_torch/firefly.py
@@ -4,9 +4,10 @@ import torch
 
 from loguru import logger
 from torch import Tensor
+from torch import nn
 
 
-class FireflyOptimizer:
+class FireflyOptimizer(nn.Module):
     def __init__(
         self,
         cost_function: Callable[[Tensor], Tensor],
diff --git a/swarms_torch/mas_model.py b/swarms_torch/mas_model.py
new file mode 100644
index 0000000..dfdbfc9
--- /dev/null
+++ b/swarms_torch/mas_model.py
@@ -0,0 +1,200 @@
+from typing import List, Dict, Any
+import torch
+import torch.nn as nn
+import torch.optim as optim
+from loguru import logger
+
+# Set up logger
+logger.add("masi_log.log", rotation="500 MB")
+
+# Define device
+device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+logger.info(f"Using device: {device}")
+
+
+# Agent Base Class
+class Agent(nn.Module):
+    def __init__(self):
+        super(Agent, self).__init__()
+
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        raise NotImplementedError
+
+    def update_parameters(
+        self, shared_gradients: Dict[str, torch.Tensor]
+    ) -> None:
+        with torch.no_grad():
+            for name, param in self.named_parameters():
+                if param.grad is not None:
+                    param.grad = shared_gradients[name]
+        self.optimizer.step()
+        self.optimizer.zero_grad()
+
+
+# MLP Agent
+class MLPAgent(Agent):
+    def __init__(self, input_size: int, hidden_size: int, output_size: int):
+        super(MLPAgent, self).__init__()
+        self.model = nn.Sequential(
+            nn.Flatten(),  # Flatten the (batch, C, H, W) input to (batch, C*H*W)
+            nn.Linear(input_size, hidden_size),
+            nn.ReLU(),
+            nn.Linear(hidden_size, output_size),
+        )
+        self.to(device)
+        self.optimizer = optim.Adam(self.parameters(), lr=0.001)
+
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        logger.debug(f"MLPAgent input shape: {x.shape}")
+        output = self.model(x)
+        logger.debug(f"MLPAgent output shape: {output.shape}")
+        return output
+
+
+# CNN Agent
+class CNNAgent(Agent):
+    def __init__(self, input_channels: int, num_classes: int):
+        super(CNNAgent, self).__init__()
+        self.model = nn.Sequential(
+            nn.Conv2d(input_channels, 16, kernel_size=3, padding=1),
+            nn.ReLU(),
+            nn.Flatten(),
+            nn.Linear(16 * 28 * 28, num_classes),  # assumes 28x28 inputs (MNIST)
+        )
+        self.to(device)
+        self.optimizer = optim.Adam(self.parameters(), lr=0.001)
+
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        logger.debug(f"CNNAgent input shape: {x.shape}")
+        output = self.model(x)
+        logger.debug(f"CNNAgent output shape: {output.shape}")
+        return output
+
+
+# LSTM Agent
+class LSTMAgent(Agent):
+    def __init__(self, input_size: int, hidden_size: int, output_size: int):
+        super(LSTMAgent, self).__init__()
+        self.lstm = nn.LSTM(input_size, hidden_size, batch_first=True)
+        self.fc = nn.Linear(hidden_size, output_size)
+        self.to(device)
+        self.optimizer = optim.Adam(self.parameters(), lr=0.001)
+
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        logger.debug(f"LSTMAgent input shape: {x.shape}")
+        # Reshape input: (batch, channels, height, width) -> (batch, height, width * channels)
+        x = x.view(x.size(0), x.size(2), -1)
+        lstm_out, _ = self.lstm(x)
+        output = self.fc(lstm_out[:, -1, :])
+        logger.debug(f"LSTMAgent output shape: {output.shape}")
+        return output
+
+
+# Transformer Agent
+class TransformerAgent(Agent):
+    def __init__(
+        self, input_size: int, num_heads: int, num_layers: int, output_size: int
+    ):
+        super(TransformerAgent, self).__init__()
+        self.embedding = nn.Linear(input_size, 128)
+        encoder_layer = nn.TransformerEncoderLayer(d_model=128, nhead=num_heads)
+        self.transformer_encoder = nn.TransformerEncoder(
+            encoder_layer, num_layers=num_layers
+        )
+        self.fc = nn.Linear(128, output_size)
+        self.to(device)
+        self.optimizer = optim.Adam(self.parameters(), lr=0.001)
+
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        logger.debug(f"TransformerAgent input shape: {x.shape}")
+        # Reshape input: (batch, channels, height, width) -> (batch, height, width * channels)
+        x = x.view(x.size(0), x.size(2), -1)
+        x = self.embedding(x)
+        x = x.permute(1, 0, 2)  # (sequence_length, batch_size, embedding_dim)
+        transformer_out = self.transformer_encoder(x)
+        transformer_out = transformer_out.permute(
+            1, 0, 2
+        )  # Back to (batch_size, sequence_length, embedding_dim)
+        output = self.fc(transformer_out[:, -1, :])
+        logger.debug(f"TransformerAgent output shape: {output.shape}")
+        return output
+
+
+# Multi-Architecture Swarm Intelligence (MASI) class
+class MultiArchitectureSwarm(nn.Module):
+    def __init__(
+        self,
+        num_mlp_agents: int,
+        num_cnn_agents: int,
+        num_lstm_agents: int,
+        num_transformer_agents: int,
+        input_sizes: Dict[str, Any],
+        output_size: int,
+    ):
+        super(MultiArchitectureSwarm, self).__init__()
+
+        # ModuleList (not a plain list) so agents are registered as submodules
+        self.agents = nn.ModuleList()
+
+        # Initialize MLP Agents
+        for _ in range(num_mlp_agents):
+            agent = MLPAgent(
+                input_size=input_sizes["mlp"]["input_size"],
+                hidden_size=input_sizes["mlp"]["hidden_size"],
+                output_size=output_size,
+            )
+            self.agents.append(agent)
+
+        # Initialize CNN Agents
+        for _ in range(num_cnn_agents):
+            agent = CNNAgent(
+                input_channels=input_sizes["cnn"]["input_channels"],
+                num_classes=output_size,
+            )
+            self.agents.append(agent)
+
+        # Initialize LSTM Agents
+        for _ in range(num_lstm_agents):
+            agent = LSTMAgent(
+                input_size=input_sizes["lstm"]["input_size"],
+                hidden_size=input_sizes["lstm"]["hidden_size"],
+                output_size=output_size,
+            )
+            self.agents.append(agent)
+
+        # Initialize Transformer Agents
+        for _ in range(num_transformer_agents):
+            agent = TransformerAgent(
+                input_size=input_sizes["transformer"]["input_size"],
+                num_heads=input_sizes["transformer"]["num_heads"],
+                num_layers=input_sizes["transformer"]["num_layers"],
+                output_size=output_size,
+            )
+            self.agents.append(agent)
+
+        logger.info(f"Initialized {len(self.agents)} agents.")
+
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        agent_outputs = []
+
+        for agent in self.agents:
+            agent_output = agent(x)
+            agent_outputs.append(agent_output)
+
+        # Aggregate outputs (simple averaging for now)
+        global_output = self.aggregate_agent_outputs(agent_outputs)
+
+        return global_output
+
+    def aggregate_agent_outputs(
+        self, agent_outputs: List[torch.Tensor]
+    ) -> torch.Tensor:
+        # Stack outputs and calculate mean
+        logger.debug(f"Aggregating outputs from {len(agent_outputs)} agents.")
+        stacked_outputs = torch.stack(agent_outputs)
+        logger.debug(f"Stacked outputs shape: {stacked_outputs.shape}")
+        global_output = torch.mean(stacked_outputs, dim=0)
+        logger.debug(f"Global output shape: {global_output.shape}")
+        return global_output
diff --git a/swarms_torch/queen_bee.py b/swarms_torch/queen_bee.py
index b0c8719..4671c67 100644
--- a/swarms_torch/queen_bee.py
+++ b/swarms_torch/queen_bee.py
@@ -1,7 +1,8 @@
 import torch
+from torch import nn
 
 
-class QueenBeeGa:
+class QueenBeeGa(nn.Module):
     """
     Queen Bee evolution for genetic algos
diff --git a/swarms_torch/spiral_optimization.py b/swarms_torch/spiral_optimization.py
index 82db8cb..f07cf2a 100644
--- a/swarms_torch/spiral_optimization.py
+++ b/swarms_torch/spiral_optimization.py
@@ -1,7 +1,8 @@
 import torch
+from torch import nn
 
 
-class SPO:
+class SPO(nn.Module):
     """
     Spiral Optimization (SPO) Algorithm in PyTorch.
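Note on the nn.Module conversions above (AntColonyOptimization, FireflyOptimizer, QueenBeeGa, SPO): subclassing nn.Module is only safe if each class's __init__ calls super().__init__() before any parameter or submodule is assigned; otherwise instantiation fails with "cannot assign parameters before Module.__init__() call". The constructors fall outside these hunks, so whether they already do this cannot be confirmed from the diff. A minimal sketch of the required pattern (the constructor arguments here are illustrative, not the real signatures):

    import torch
    from torch import nn

    class SPO(nn.Module):
        # Illustrative constructor only; the real SPO signature is not shown in this diff.
        def __init__(self, goal: torch.Tensor, n_points: int = 50):
            super().__init__()  # must run before any nn.Parameter or submodule is assigned
            self.goal = goal
            self.n_points = n_points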
diff --git a/swarms_torch/structs/basic_nn.py b/swarms_torch/structs/basic_nn.py
deleted file mode 100644
index e69de29..0000000
diff --git a/test.py b/test.py
new file mode 100644
index 0000000..e7a9947
--- /dev/null
+++ b/test.py
@@ -0,0 +1,343 @@
+from typing import List, Dict, Any
+import torch
+import torch.nn as nn
+import torch.optim as optim
+from torch.utils.data import DataLoader
+from loguru import logger
+
+# Set up logger
+logger.add("masi_log.log", rotation="500 MB")
+
+# Define device
+device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+logger.info(f"Using device: {device}")
+
+
+# Agent Base Class
+class Agent(nn.Module):
+    def __init__(self):
+        super(Agent, self).__init__()
+
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        raise NotImplementedError
+
+    def update_parameters(
+        self, shared_gradients: Dict[str, torch.Tensor]
+    ) -> None:
+        with torch.no_grad():
+            for name, param in self.named_parameters():
+                if param.grad is not None:
+                    param.grad = shared_gradients[name]
+        self.optimizer.step()
+        self.optimizer.zero_grad()
+
+
+# MLP Agent
+class MLPAgent(Agent):
+    def __init__(self, input_size: int, hidden_size: int, output_size: int):
+        super(MLPAgent, self).__init__()
+        self.model = nn.Sequential(
+            nn.Flatten(),  # Flatten the (batch, C, H, W) input to (batch, C*H*W)
+            nn.Linear(input_size, hidden_size),
+            nn.ReLU(),
+            nn.Linear(hidden_size, output_size),
+        )
+        self.to(device)
+        self.optimizer = optim.Adam(self.parameters(), lr=0.001)
+
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        logger.debug(f"MLPAgent input shape: {x.shape}")
+        output = self.model(x)
+        logger.debug(f"MLPAgent output shape: {output.shape}")
+        return output
+
+
+# CNN Agent
+class CNNAgent(Agent):
+    def __init__(self, input_channels: int, num_classes: int):
+        super(CNNAgent, self).__init__()
+        self.model = nn.Sequential(
+            nn.Conv2d(input_channels, 16, kernel_size=3, padding=1),
+            nn.ReLU(),
+            nn.Flatten(),
+            nn.Linear(16 * 28 * 28, num_classes),  # assumes 28x28 inputs (MNIST)
+        )
+        self.to(device)
+        self.optimizer = optim.Adam(self.parameters(), lr=0.001)
+
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        logger.debug(f"CNNAgent input shape: {x.shape}")
+        output = self.model(x)
+        logger.debug(f"CNNAgent output shape: {output.shape}")
+        return output
+
+
+# LSTM Agent
+class LSTMAgent(Agent):
+    def __init__(self, input_size: int, hidden_size: int, output_size: int):
+        super(LSTMAgent, self).__init__()
+        self.lstm = nn.LSTM(input_size, hidden_size, batch_first=True)
+        self.fc = nn.Linear(hidden_size, output_size)
+        self.to(device)
+        self.optimizer = optim.Adam(self.parameters(), lr=0.001)
+
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        logger.debug(f"LSTMAgent input shape: {x.shape}")
+        # Reshape input: (batch, channels, height, width) -> (batch, height, width * channels)
+        x = x.view(x.size(0), x.size(2), -1)
+        lstm_out, _ = self.lstm(x)
+        output = self.fc(lstm_out[:, -1, :])
+        logger.debug(f"LSTMAgent output shape: {output.shape}")
+        return output
+
+
+# Transformer Agent
+class TransformerAgent(Agent):
+    def __init__(
+        self, input_size: int, num_heads: int, num_layers: int, output_size: int
+    ):
+        super(TransformerAgent, self).__init__()
+        self.embedding = nn.Linear(input_size, 128)
+        encoder_layer = nn.TransformerEncoderLayer(d_model=128, nhead=num_heads)
+        self.transformer_encoder = nn.TransformerEncoder(
+            encoder_layer, num_layers=num_layers
+        )
+        self.fc = nn.Linear(128, output_size)
+        self.to(device)
+        self.optimizer = optim.Adam(self.parameters(), lr=0.001)
+
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        logger.debug(f"TransformerAgent input shape: {x.shape}")
+        # Reshape input: (batch, channels, height, width) -> (batch, height, width * channels)
+        x = x.view(x.size(0), x.size(2), -1)
+        x = self.embedding(x)
+        x = x.permute(1, 0, 2)  # (sequence_length, batch_size, embedding_dim)
+        transformer_out = self.transformer_encoder(x)
+        transformer_out = transformer_out.permute(
+            1, 0, 2
+        )  # Back to (batch_size, sequence_length, embedding_dim)
+        output = self.fc(transformer_out[:, -1, :])
+        logger.debug(f"TransformerAgent output shape: {output.shape}")
+        return output
+
+
+# Initialize Agents
+def initialize_agents(
+    num_mlp_agents: int,
+    num_cnn_agents: int,
+    num_lstm_agents: int,
+    num_transformer_agents: int,
+    input_sizes: Dict[str, Any],
+    output_size: int,
+) -> List[Agent]:
+    agents: List[Agent] = []
+
+    # MLP Agents
+    for _ in range(num_mlp_agents):
+        agent = MLPAgent(
+            input_size=input_sizes["mlp"]["input_size"],
+            hidden_size=input_sizes["mlp"]["hidden_size"],
+            output_size=output_size,
+        )
+        agents.append(agent)
+
+    # CNN Agents
+    for _ in range(num_cnn_agents):
+        agent = CNNAgent(
+            input_channels=input_sizes["cnn"]["input_channels"],
+            num_classes=output_size,
+        )
+        agents.append(agent)
+
+    # LSTM Agents
+    for _ in range(num_lstm_agents):
+        agent = LSTMAgent(
+            input_size=input_sizes["lstm"]["input_size"],
+            hidden_size=input_sizes["lstm"]["hidden_size"],
+            output_size=output_size,
+        )
+        agents.append(agent)
+
+    # Transformer Agents
+    for _ in range(num_transformer_agents):
+        agent = TransformerAgent(
+            input_size=input_sizes["transformer"]["input_size"],
+            num_heads=input_sizes["transformer"]["num_heads"],
+            num_layers=input_sizes["transformer"]["num_layers"],
+            output_size=output_size,
+        )
+        agents.append(agent)
+
+    logger.info(f"Initialized {len(agents)} agents.")
+    return agents
+
+
+# Aggregate Outputs
+def aggregate_agent_outputs(agent_outputs: List[torch.Tensor]) -> torch.Tensor:
+    # Simple average of outputs
+    logger.debug(f"Aggregating outputs from {len(agent_outputs)} agents.")
+    stacked_outputs = torch.stack(agent_outputs)
+    logger.debug(f"Stacked outputs shape: {stacked_outputs.shape}")
+    global_output = torch.mean(stacked_outputs, dim=0)
+    logger.debug(f"Global output shape: {global_output.shape}")
+    return global_output
+
+
+# Compute Loss
+def compute_loss(
+    global_output: torch.Tensor, targets: torch.Tensor
+) -> torch.Tensor:
+    criterion = nn.CrossEntropyLoss()
+    loss = criterion(global_output, targets)
+    logger.debug(f"Computed loss: {loss.item()}")
+    return loss
+
+
+# Compute Agent-specific Loss (Optional)
+def compute_agent_loss(agent: Agent, loss: torch.Tensor) -> torch.Tensor:
+    # For simplicity, all agents share the same loss
+    return loss
+
+
+# Aggregate Gradients
+def aggregate_gradients(agents: List[Agent]) -> Dict[str, torch.Tensor]:
+    # Average gradients across all agents. Assumes homogeneous agents (matching
+    # parameter names); uses named_parameters(), since _parameters only holds a
+    # module's direct parameters, not those of nested submodules.
+    shared_gradients: Dict[str, torch.Tensor] = {}
+    num_agents = len(agents)
+    for name, param in agents[0].named_parameters():
+        if param.grad is not None:
+            shared_gradients[name] = param.grad.clone() / num_agents
+            for other_agent in agents[1:]:
+                other_params = dict(other_agent.named_parameters())
+                shared_gradients[name] += (
+                    other_params[name].grad.clone() / num_agents
+                )
+    logger.debug("Aggregated gradients.")
+    return shared_gradients
+
+
+# Evaluate Performance
+def evaluate_swarm_performance(
+    agents: List[Agent], validation_loader: DataLoader
+) -> None:
+    correct = 0
+    total = 0
+    with torch.no_grad():
+        for data in validation_loader:
+            inputs, labels = data
+            inputs, labels = inputs.to(device), labels.to(device)
+            agent_outputs = []
+            for agent in agents:
+                output = agent(inputs)
+                agent_outputs.append(output)
+            global_output = aggregate_agent_outputs(agent_outputs)
+            _, predicted = torch.max(global_output.data, 1)
+            total += labels.size(0)
+            correct += (predicted == labels).sum().item()
+    accuracy = 100 * correct / total
+    logger.info(f"Validation Accuracy: {accuracy:.2f}%")
+
+
+# Main Training Loop
+def train_swarm(
+    agents: List[Agent],
+    train_loader: DataLoader,
+    validation_loader: DataLoader,
+    num_epochs: int,
+    evaluation_interval: int,
+) -> None:
+    for epoch in range(num_epochs):
+        for i, data in enumerate(train_loader, 0):
+            inputs, targets = data
+            inputs, targets = inputs.to(device), targets.to(device)
+
+            agent_outputs = []
+            total_loss = 0
+
+            # Each agent processes the data
+            for agent in agents:
+                agent.optimizer.zero_grad()
+                agent_output = agent(inputs)
+                agent_outputs.append(agent_output)
+
+                # Compute individual agent loss
+                agent_loss = compute_loss(agent_output, targets)
+                total_loss += agent_loss.item()
+
+                # Backward pass and update for each agent
+                agent_loss.backward()
+                agent.optimizer.step()
+
+            # Aggregate outputs (for logging purposes)
+            global_output = aggregate_agent_outputs(agent_outputs)
+
+            # Log the average loss
+            avg_loss = total_loss / len(agents)
+            logger.debug(f"Batch [{i}] Average loss: {avg_loss:.4f}")
+
+        # Evaluate performance
+        if (epoch + 1) % evaluation_interval == 0:
+            logger.info(f"Epoch [{epoch + 1}/{num_epochs}]")
+            evaluate_swarm_performance(agents, validation_loader)
+
+
+# Example Usage
+if __name__ == "__main__":
+    # Hyperparameters
+    num_mlp_agents = 2
+    num_cnn_agents = 2
+    num_lstm_agents = 2
+    num_transformer_agents = 2
+    num_epochs = 10
+    evaluation_interval = 1
+    batch_size = 64
+    output_size = 10  # For example, number of classes in classification
+
+    # Input sizes for different agents
+    input_sizes = {
+        "mlp": {"input_size": 784, "hidden_size": 128},  # Example for MNIST
+        "cnn": {"input_channels": 1},
+        "lstm": {
+            "input_size": 28,
+            "hidden_size": 128,
+        },  # Sequence length for MNIST rows
+        "transformer": {"input_size": 28, "num_heads": 4, "num_layers": 2},
+    }
+
+    # Initialize agents
+    agents = initialize_agents(
+        num_mlp_agents,
+        num_cnn_agents,
+        num_lstm_agents,
+        num_transformer_agents,
+        input_sizes,
+        output_size,
+    )
+
+    # Load and preprocess data
+    from torchvision import datasets, transforms
+
+    transform = transforms.Compose([transforms.ToTensor()])
+
+    train_dataset = datasets.MNIST(
+        root="./data", train=True, download=True, transform=transform
+    )
+    validation_dataset = datasets.MNIST(
+        root="./data", train=False, download=True, transform=transform
+    )
+
+    train_loader = DataLoader(
+        train_dataset, batch_size=batch_size, shuffle=True
+    )
+    validation_loader = DataLoader(
+        validation_dataset, batch_size=batch_size, shuffle=False
+    )
+
+    # Train swarm
+    train_swarm(
+        agents, train_loader, validation_loader, num_epochs, evaluation_interval
+    )
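For reference, mas_model.py exposes these same agents through the MultiArchitectureSwarm module, which test.py never exercises directly. A minimal usage sketch under the same MNIST-shaped assumptions as the hyperparameters above (batch of 64 single-channel 28x28 images, 10 classes), assuming the file added in this diff is importable as swarms_torch.mas_model:

    import torch
    from swarms_torch.mas_model import MultiArchitectureSwarm, device

    input_sizes = {
        "mlp": {"input_size": 784, "hidden_size": 128},
        "cnn": {"input_channels": 1},
        "lstm": {"input_size": 28, "hidden_size": 128},
        "transformer": {"input_size": 28, "num_heads": 4, "num_layers": 2},
    }

    swarm = MultiArchitectureSwarm(
        num_mlp_agents=2,
        num_cnn_agents=2,
        num_lstm_agents=2,
        num_transformer_agents=2,
        input_sizes=input_sizes,
        output_size=10,
    )

    # Dummy MNIST-shaped batch; each agent reshapes it to fit its architecture.
    x = torch.randn(64, 1, 28, 28, device=device)
    logits = swarm(x)  # (64, 10): mean of the per-agent logits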