From 5e2a9a96a8971b824e6f97333a45f2daff11da06 Mon Sep 17 00:00:00 2001 From: Jeet Mukherjee Date: Mon, 15 Sep 2025 19:23:17 +0530 Subject: [PATCH 1/6] Added Two New Questions(Probability) --- build/1.json | 12 +++-- build/174.json | 51 +++++++++++++++++++ build/2.json | 8 +-- build/3.json | 8 +-- .../description.md | 3 ++ .../example.json | 5 ++ .../learn.md | 24 +++++++++ .../meta.json | 15 ++++++ .../solution.py | 14 +++++ .../starter_code.py | 7 +++ .../tests.json | 18 +++++++ .../description.md | 16 ++++++ .../example.json | 5 ++ .../183_pmf_normalization_constant 2/learn.md | 16 ++++++ .../meta.json | 15 ++++++ .../solution.py | 14 +++++ .../starter_code.py | 6 +++ .../tests.json | 10 ++++ 18 files changed, 235 insertions(+), 12 deletions(-) create mode 100644 build/174.json create mode 100644 questions/182_empirical_probability_mass_function_(pmf)/description.md create mode 100644 questions/182_empirical_probability_mass_function_(pmf)/example.json create mode 100644 questions/182_empirical_probability_mass_function_(pmf)/learn.md create mode 100644 questions/182_empirical_probability_mass_function_(pmf)/meta.json create mode 100644 questions/182_empirical_probability_mass_function_(pmf)/solution.py create mode 100644 questions/182_empirical_probability_mass_function_(pmf)/starter_code.py create mode 100644 questions/182_empirical_probability_mass_function_(pmf)/tests.json create mode 100644 questions/183_pmf_normalization_constant 2/description.md create mode 100644 questions/183_pmf_normalization_constant 2/example.json create mode 100644 questions/183_pmf_normalization_constant 2/learn.md create mode 100644 questions/183_pmf_normalization_constant 2/meta.json create mode 100644 questions/183_pmf_normalization_constant 2/solution.py create mode 100644 questions/183_pmf_normalization_constant 2/starter_code.py create mode 100644 questions/183_pmf_normalization_constant 2/tests.json diff --git a/build/1.json b/build/1.json index ad432627..09ddad8e 100644 --- a/build/1.json +++ b/build/1.json @@ -37,16 +37,20 @@ "expected_output": "[5.5, 10.0]" } ], - "tinygrad_starter_code": "from tinygrad.tensor import Tensor\n\ndef matrix_dot_vector_tg(a, b) -> Tensor:\n \"\"\"\n Compute the product of matrix `a` and vector `b` using tinygrad.\n Inputs can be Python lists, NumPy arrays, or tinygrad Tensors.\n Returns a 1-D Tensor of length m, or Tensor(-1) if dimensions mismatch.\n \"\"\"\n # Dimension mismatch check\n if len(a[0]) != len(b):\n return Tensor(-1)\n # Convert to Tensor\n a_t = Tensor(a)\n b_t = Tensor(b)\n # Your implementation here\n pass", - "tinygrad_solution": "from tinygrad.tensor import Tensor\n\ndef matrix_dot_vector_tg(a, b) -> Tensor:\n \"\"\"\n Compute the product of matrix `a` and vector `b` using tinygrad.\n Inputs can be Python lists, NumPy arrays, or tinygrad Tensors.\n Returns a 1-D Tensor of length m, or Tensor(-1) if dimensions mismatch.\n \"\"\"\n if len(a[0]) != len(b):\n return Tensor(-1)\n a_t = Tensor(a)\n b_t = Tensor(b)\n return a_t.matmul(b_t)", + "tinygrad_starter_code": "from tinygrad.tensor import Tensor\n\ndef matrix_dot_vector_tg(a:Tensor, b:Tensor) -> Tensor:\n \"\"\"\n Compute the product of matrix `a` and vector `b` using tinygrad.\n Will be tinygrad Tensors.\n Returns a 1-D Tensor of length m, or Tensor(-1) if dimensions mismatch.\n \"\"\"\n pass", + "tinygrad_solution": "from tinygrad.tensor import Tensor\n\ndef matrix_dot_vector_tg(a: Tensor, b: Tensor) -> Tensor:\n \"\"\"\n Compute the product of matrix `a` and vector `b` using tinygrad.\n Inputs 
will be tinygrad Tensors.\n Returns a 1-D Tensor of length m, or Tensor(-1) if dimensions mismatch.\n \"\"\"\n if len(a[0]) != len(b):\n return Tensor(-1)\n return a @ b", "tinygrad_test_cases": [ { - "test": "from tinygrad.tensor import Tensor\nres = matrix_dot_vector_tg(\n [[1,2,3],[2,4,5],[6,8,9]],\n [1,2,3]\n)\nprint(res.numpy().tolist())", + "test": "from tinygrad.tensor import Tensor\nres = matrix_dot_vector_tg(\n Tensor([[1,2,3],[2,4,5],[6,8,9]]),\n Tensor([1,2,3])\n)\nprint(res.numpy().tolist())", "expected_output": "[14.0, 25.0, 49.0]" }, { - "test": "from tinygrad.tensor import Tensor\nres = matrix_dot_vector_tg(\n [[1,2,3],[2,4,5]],\n [1,2]\n)\nprint(res.numpy().tolist())", + "test": "from tinygrad.tensor import Tensor\nres = matrix_dot_vector_tg(\n Tensor([[1,2,3],[2,4,5]]),\n Tensor([1,2])\n)\nprint(res.numpy().tolist())", "expected_output": "-1" + }, + { + "test": "from tinygrad.tensor import Tensor\nres = matrix_dot_vector_tg(\n Tensor([[1, 2], [2, 4]]),\n Tensor([1, 2])\n)\nprint(res.numpy().tolist())", + "expected_output": "[5, 10]" } ], "pytorch_starter_code": "import torch\n\ndef matrix_dot_vector(a, b) -> torch.Tensor:\n \"\"\"\n Compute the product of matrix `a` and vector `b` using PyTorch.\n Inputs can be Python lists, NumPy arrays, or torch Tensors.\n Returns a 1-D tensor of length m, or tensor(-1) if dimensions mismatch.\n \"\"\"\n a_t = torch.as_tensor(a, dtype=torch.float)\n b_t = torch.as_tensor(b, dtype=torch.float)\n # Dimension mismatch check\n if a_t.size(1) != b_t.size(0):\n return torch.tensor(-1)\n # Your implementation here\n pass", diff --git a/build/174.json b/build/174.json new file mode 100644 index 00000000..7e51f036 --- /dev/null +++ b/build/174.json @@ -0,0 +1,51 @@ +{ + "id": "174", + "title": "Train a Simple GAN on 1D Gaussian Data", + "difficulty": "hard", + "category": "Deep Learning", + "video": "", + "likes": "0", + "dislikes": "0", + "contributor": [ + { + "profile_link": "https://github.com/moe18", + "name": "moe" + } + ], + "pytorch_difficulty": "medium", + "description": "In this task, you will train a Generative Adversarial Network (GAN) to learn a one-dimensional Gaussian distribution. The GAN consists of a generator that produces samples from latent noise and a discriminator that estimates the probability that a given sample is real. Both networks should have one hidden layer with ReLU activation in the hidden layer. The generator’s output layer is linear, while the discriminator's output layer uses a sigmoid activation.\n\nYou must train the GAN using the standard non-saturating GAN loss for the generator and binary cross-entropy loss for the discriminator. In the NumPy version, parameters should be updated using vanilla gradient descent. In the PyTorch version, parameters should be updated using stochastic gradient descent (SGD) with the specified learning rate. The training loop should alternate between updating the discriminator and the generator each iteration.\n\nYour function must return the trained generator forward function `gen_forward(z)`, which produces generated samples given latent noise.", + "learn_section": "## Understanding GANs for 1D Gaussian Data\nA Generative Adversarial Network (GAN) consists of two neural networks - a **Generator** $G_\\theta$ and a **Discriminator** $D_\\phi$ - trained in a minimax game.\n\n### 1. 
The Roles\n- **Generator** $G_\\theta(z)$: Takes a latent noise vector $z \\sim \\mathcal{N}(0, I)$ and outputs a sample intended to resemble the real data.\n- **Discriminator** $D_\\phi(x)$: Outputs a probability $p \\in (0, 1)$ that the input $x$ came from the real data distribution rather than the generator.\n\n### 2. The Objective\nThe classical GAN objective is:\n$$\n\\min_{\\theta} \\; \\max_{\\phi} \\; \\mathbb{E}_{x \\sim p_{\\text{data}}} [\\log D_\\phi(x)] + \\mathbb{E}_{z \\sim p(z)} [\\log (1 - D_\\phi(G_\\theta(z)))]\n$$\nHere:\n- $p_{\\text{data}}$ is the real data distribution.\n- $p(z)$ is the prior distribution for the latent noise (often standard normal).\n\n### 3. Practical Losses\nIn implementation, we minimize:\n- **Discriminator loss**:\n$$\n\\mathcal{L}_D = - \\left( \\frac{1}{m} \\sum_{i=1}^m \\log D(x^{(i)}_{\\text{real}}) + \\log(1 - D(x^{(i)}_{\\text{fake}})) \\right)\n$$\n- **Generator loss** (non-saturating form):\n$$\n\\mathcal{L}_G = - \\frac{1}{m} \\sum_{i=1}^m \\log D(G(z^{(i)}))\n$$\n\n### 4. Forward/Backward Flow\n1. **Discriminator step**: Real samples $x_{\\text{real}}$ and fake samples $x_{\\text{fake}} = G(z)$ are passed through $D$, and $\\mathcal{L}_D$ is minimized w.r.t. $\\phi$.\n2. **Generator step**: Fresh $z$ is sampled, $x_{\\text{fake}} = G(z)$ is passed through $D$, and $\\mathcal{L}_G$ is minimized w.r.t. $\\theta$ while keeping $\\phi$ fixed.\n\n### 5. Architecture for This Task\n- **Generator**: Fully connected layer ($\\mathbb{R}^{\\text{latent\\_dim}} \\to \\mathbb{R}^{\\text{hidden\\_dim}}$) -> ReLU -> Fully connected layer ($\\mathbb{R}^{\\text{hidden\\_dim}} \\to \\mathbb{R}^1$).\n- **Discriminator**: Fully connected layer ($\\mathbb{R}^1 \\to \\mathbb{R}^{\\text{hidden\\_dim}}$) → ReLU → Fully connected layer ($\\mathbb{R}^{\\text{hidden\\_dim}} \\to \\mathbb{R}^1$) → Sigmoid.\n\n### 6. Numerical Tips\n- Initialize weights with a small Gaussian ($\\mathcal{N}(0, 0.01)$).\n- Add $10^{-8}$ to logs for numerical stability.\n- Use a consistent batch size $m$ for both real and fake samples.\n- Always sample fresh noise for the generator on each update.\n\n**Your Task**: Implement the training loop to learn the parameters $\\theta$ and $\\phi$, and return the trained `gen_forward(z)` function. 
The evaluation (mean/std of generated samples) will be handled in the test cases.", + "starter_code": "import numpy as np\n\ndef train_gan(mean_real: float, std_real: float, latent_dim: int = 1, hidden_dim: int = 16, learning_rate: float = 0.001, epochs: int = 5000, batch_size: int = 128, seed: int = 42):\n \"\"\"\n Train a simple GAN to learn a 1D Gaussian distribution.\n\n Args:\n mean_real: Mean of the target Gaussian\n std_real: Std of the target Gaussian\n latent_dim: Dimension of the noise input to the generator\n hidden_dim: Hidden layer size for both networks\n learning_rate: Learning rate for gradient descent\n epochs: Number of training epochs\n batch_size: Training batch size\n seed: Random seed for reproducibility\n\n Returns:\n gen_forward: A function that takes z and returns generated samples\n \"\"\"\n # Your code here\n pass", + "solution": "import numpy as np\n\ndef relu(x):\n return np.maximum(0, x)\n\ndef sigmoid(x):\n return 1 / (1 + np.exp(-x))\n\ndef train_gan(mean_real: float, std_real: float, latent_dim: int = 1, hidden_dim: int = 16, learning_rate: float = 0.001, epochs: int = 5000, batch_size: int = 128, seed: int = 42):\n np.random.seed(seed)\n data_dim = 1\n\n # Initialize generator weights\n w1_g = np.random.normal(0, 0.01, (latent_dim, hidden_dim))\n b1_g = np.zeros(hidden_dim)\n w2_g = np.random.normal(0, 0.01, (hidden_dim, data_dim))\n b2_g = np.zeros(data_dim)\n\n # Initialize discriminator weights\n w1_d = np.random.normal(0, 0.01, (data_dim, hidden_dim))\n b1_d = np.zeros(hidden_dim)\n w2_d = np.random.normal(0, 0.01, (hidden_dim, 1))\n b2_d = np.zeros(1)\n\n def disc_forward(x):\n h1 = np.dot(x, w1_d) + b1_d\n a1 = relu(h1)\n logit = np.dot(a1, w2_d) + b2_d\n p = sigmoid(logit)\n return p, logit, a1, h1\n\n def gen_forward(z):\n h1 = np.dot(z, w1_g) + b1_g\n a1 = relu(h1)\n x_gen = np.dot(a1, w2_g) + b2_g\n return x_gen, a1, h1\n\n for epoch in range(epochs):\n # Sample real data\n x_real = np.random.normal(mean_real, std_real, batch_size)[:, None]\n z = np.random.normal(0, 1, (batch_size, latent_dim))\n x_fake, _, _ = gen_forward(z)\n\n # Discriminator forward\n p_real, _, a1_real, h1_real = disc_forward(x_real)\n p_fake, _, a1_fake, h1_fake = disc_forward(x_fake)\n\n # Discriminator gradients\n grad_logit_real = - (1 - p_real) / batch_size\n grad_a1_real = grad_logit_real @ w2_d.T\n grad_h1_real = grad_a1_real * (h1_real > 0)\n grad_w1_d_real = x_real.T @ grad_h1_real\n grad_b1_d_real = np.sum(grad_h1_real, axis=0)\n grad_w2_d_real = a1_real.T @ grad_logit_real\n grad_b2_d_real = np.sum(grad_logit_real, axis=0)\n\n grad_logit_fake = p_fake / batch_size\n grad_a1_fake = grad_logit_fake @ w2_d.T\n grad_h1_fake = grad_a1_fake * (h1_fake > 0)\n grad_w1_d_fake = x_fake.T @ grad_h1_fake\n grad_b1_d_fake = np.sum(grad_h1_fake, axis=0)\n grad_w2_d_fake = a1_fake.T @ grad_logit_fake\n grad_b2_d_fake = np.sum(grad_logit_fake, axis=0)\n\n grad_w1_d = grad_w1_d_real + grad_w1_d_fake\n grad_b1_d = grad_b1_d_real + grad_b1_d_fake\n grad_w2_d = grad_w2_d_real + grad_w2_d_fake\n grad_b2_d = grad_b2_d_real + grad_b2_d_fake\n\n w1_d -= learning_rate * grad_w1_d\n b1_d -= learning_rate * grad_b1_d\n w2_d -= learning_rate * grad_w2_d\n b2_d -= learning_rate * grad_b2_d\n\n # Generator update\n z = np.random.normal(0, 1, (batch_size, latent_dim))\n x_fake, a1_g, h1_g = gen_forward(z)\n p_fake, _, a1_d, h1_d = disc_forward(x_fake)\n\n grad_logit_fake = - (1 - p_fake) / batch_size\n grad_a1_d = grad_logit_fake @ w2_d.T\n grad_h1_d = grad_a1_d * (h1_d > 0)\n grad_x_fake = 
grad_h1_d @ w1_d.T\n\n grad_a1_g = grad_x_fake @ w2_g.T\n grad_h1_g = grad_a1_g * (h1_g > 0)\n grad_w1_g = z.T @ grad_h1_g\n grad_b1_g = np.sum(grad_h1_g, axis=0)\n grad_w2_g = a1_g.T @ grad_x_fake\n grad_b2_g = np.sum(grad_x_fake, axis=0)\n\n w1_g -= learning_rate * grad_w1_g\n b1_g -= learning_rate * grad_b1_g\n w2_g -= learning_rate * grad_w2_g\n b2_g -= learning_rate * grad_b2_g\n\n return gen_forward", + "example": { + "input": "gen_forward = train_gan(4.0, 1.25, epochs=1000, seed=42)\nz = np.random.normal(0, 1, (500, 1))\nx_gen, _, _ = gen_forward(z)\n(round(np.mean(x_gen), 4), round(np.std(x_gen), 4))", + "output": "(0.0004, 0.0002)", + "reasoning": "The test cases call `gen_forward` after training, sample 500 points, and then compute the mean and std." + }, + "test_cases": [ + { + "test": "gen_forward = train_gan(4.0, 1.25, epochs=1000, seed=42)\nz = np.random.normal(0, 1, (500, 1))\nx_gen, _, _ = gen_forward(z)\nprint((round(np.mean(x_gen), 4), round(np.std(x_gen), 4)))", + "expected_output": "(0.0004, 0.0002)" + }, + { + "test": "gen_forward = train_gan(0.0, 1.0, epochs=500, seed=0)\nz = np.random.normal(0, 1, (300, 1))\nx_gen, _, _ = gen_forward(z)\nprint((round(np.mean(x_gen), 4), round(np.std(x_gen), 4)))", + "expected_output": "(-0.0002, 0.0002)" + }, + { + "test": "gen_forward = train_gan(-2.0, 0.5, epochs=1500, seed=123)\nz = np.random.normal(0, 1, (400, 1))\nx_gen, _, _ = gen_forward(z)\nprint((round(np.mean(x_gen), 4), round(np.std(x_gen), 4)))", + "expected_output": "(-0.0044, 0.0002)" + } + ], + "pytorch_starter_code": "import torch\nimport torch.nn as nn\nimport torch.optim as optim\n\ndef train_gan(mean_real: float, std_real: float, latent_dim: int = 1, hidden_dim: int = 16, learning_rate: float = 0.001, epochs: int = 5000, batch_size: int = 128, seed: int = 42):\n torch.manual_seed(seed)\n # Your PyTorch implementation here\n pass", + "pytorch_solution": "import torch\nimport torch.nn as nn\nimport torch.optim as optim\n\ndef train_gan(mean_real: float, std_real: float, latent_dim: int = 1, hidden_dim: int = 16, learning_rate: float = 0.001, epochs: int = 5000, batch_size: int = 128, seed: int = 42):\n torch.manual_seed(seed)\n\n class Generator(nn.Module):\n def __init__(self):\n super().__init__()\n self.net = nn.Sequential(\n nn.Linear(latent_dim, hidden_dim),\n nn.ReLU(),\n nn.Linear(hidden_dim, 1)\n )\n def forward(self, z):\n return self.net(z)\n\n class Discriminator(nn.Module):\n def __init__(self):\n super().__init__()\n self.net = nn.Sequential(\n nn.Linear(1, hidden_dim),\n nn.ReLU(),\n nn.Linear(hidden_dim, 1),\n nn.Sigmoid()\n )\n def forward(self, x):\n return self.net(x)\n\n G = Generator()\n D = Discriminator()\n\n # Use SGD as requested\n opt_G = optim.SGD(G.parameters(), lr=learning_rate)\n opt_D = optim.SGD(D.parameters(), lr=learning_rate)\n criterion = nn.BCELoss()\n\n for _ in range(epochs):\n # Real and fake batches\n real_data = torch.normal(mean_real, std_real, size=(batch_size, 1))\n noise = torch.randn(batch_size, latent_dim)\n fake_data = G(noise)\n\n # ----- Discriminator step -----\n opt_D.zero_grad()\n pred_real = D(real_data)\n pred_fake = D(fake_data.detach())\n loss_real = criterion(pred_real, torch.ones_like(pred_real))\n loss_fake = criterion(pred_fake, torch.zeros_like(pred_fake))\n loss_D = loss_real + loss_fake\n loss_D.backward()\n opt_D.step()\n\n # ----- Generator step -----\n opt_G.zero_grad()\n pred_fake = D(fake_data)\n # non-saturating generator loss: maximize log D(G(z)) -> minimize -log D(G(z))\n loss_G = 
criterion(pred_fake, torch.ones_like(pred_fake))\n loss_G.backward()\n opt_G.step()\n\n return G.forward", + "pytorch_test_cases": [ + { + "test": "gen_forward = train_gan(4.0, 1.25, epochs=100, seed=42)\nz = torch.randn(500, 1)\nx_gen = gen_forward(z)\nprint((round(x_gen.mean().item(), 4), round(x_gen.std().item(), 4)))", + "expected_output": "(0.4725, 0.3563)" + }, + { + "test": "gen_forward = train_gan(0.0, 1.0, epochs=50, seed=0)\nz = torch.randn(300, 1)\nx_gen = gen_forward(z)\nprint((round(x_gen.mean().item(), 4), round(x_gen.std().item(), 4)))", + "expected_output": "(0.0644, 0.244)" + } + ] +} \ No newline at end of file diff --git a/build/2.json b/build/2.json index 8faf4c09..e42cf6db 100644 --- a/build/2.json +++ b/build/2.json @@ -33,15 +33,15 @@ "expected_output": "[[1, 4], [2, 5], [3, 6]]" } ], - "tinygrad_starter_code": "from tinygrad.tensor import Tensor\n\ndef transpose_matrix_tg(a) -> Tensor:\n \"\"\"\n Transpose a 2D matrix `a` using tinygrad.\n Inputs can be Python lists, NumPy arrays, or tinygrad Tensors.\n Returns a transposed Tensor.\n \"\"\"\n # Convert to Tensor\n a_t = Tensor(a)\n # Your implementation here\n pass", - "tinygrad_solution": "from tinygrad.tensor import Tensor\n\ndef transpose_matrix_tg(a) -> Tensor:\n \"\"\"\n Transpose a 2D matrix `a` using tinygrad.\n Inputs can be Python lists, NumPy arrays, or tinygrad Tensors.\n Returns a transposed Tensor.\n \"\"\"\n a_t = Tensor(a)\n return a_t.transpose(0,1)", + "tinygrad_starter_code": "from tinygrad.tensor import Tensor\n\ndef transpose_matrix_tg(a:Tensor) -> Tensor:\n \"\"\"\n Transpose a 2D matrix `a` using tinygrad.\n Inputs are tinygrad Tensors.\n Returns a transposed Tensor.\n \"\"\"\n pass", + "tinygrad_solution": "from tinygrad.tensor import Tensor\n\ndef transpose_matrix_tg(a) -> Tensor:\n \"\"\"\n Transpose a 2D matrix `a` using tinygrad.\n Inputs are tinygrad Tensors.\n Returns a transposed Tensor.\n \"\"\"\n return a.T", "tinygrad_test_cases": [ { - "test": "from tinygrad.tensor import Tensor\nres = transpose_matrix_tg([[1,2,3],[4,5,6]])\nprint(res.numpy().tolist())", + "test": "from tinygrad.tensor import Tensor\nres = transpose_matrix_tg(Tensor([[1,2,3],[4,5,6]]))\nprint(res.numpy().tolist())", "expected_output": "[[1, 4], [2, 5], [3, 6]]" }, { - "test": "from tinygrad.tensor import Tensor\nres = transpose_matrix_tg([[1,2],[3,4]])\nprint(res.numpy().tolist())", + "test": "from tinygrad.tensor import Tensor\nres = transpose_matrix_tg(Tensor([[1,2],[3,4]]))\nprint(res.numpy().tolist())", "expected_output": "[[1, 3], [2, 4]]" } ], diff --git a/build/3.json b/build/3.json index 5aecd531..366f7b2a 100644 --- a/build/3.json +++ b/build/3.json @@ -46,15 +46,15 @@ "expected_output": "[[1, 2, 3, 4], [5, 6, 7, 8]]" } ], - "tinygrad_starter_code": "from tinygrad.tensor import Tensor\n\ndef reshape_matrix_tg(a, new_shape) -> Tensor:\n \"\"\"\n Reshape a 2D matrix `a` to shape `new_shape` using tinygrad.\n Inputs can be Python lists, NumPy arrays, or tinygrad Tensors.\n Returns a Tensor of shape `new_shape`, or an empty Tensor on mismatch.\n \"\"\"\n # Dimension check\n if len(a) * len(a[0]) != new_shape[0] * new_shape[1]:\n return Tensor([])\n # Convert to Tensor and reshape\n a_t = Tensor(a)\n # Your implementation here\n pass", - "tinygrad_solution": "from tinygrad.tensor import Tensor\n\ndef reshape_matrix_tg(a, new_shape) -> Tensor:\n \"\"\"\n Reshape a 2D matrix `a` to shape `new_shape` using tinygrad.\n Inputs can be Python lists, NumPy arrays, or tinygrad Tensors.\n Returns a Tensor of shape 
`new_shape`, or an empty Tensor on mismatch.\n \"\"\"\n # Dimension check\n if len(a) * len(a[0]) != new_shape[0] * new_shape[1]:\n return Tensor([])\n a_t = Tensor(a)\n return a_t.reshape(new_shape)", + "tinygrad_starter_code": "from tinygrad.tensor import Tensor\n\ndef reshape_matrix_tg(a:Tensor, new_shape:tuple) -> Tensor:\n \"\"\"\n Reshape a 2D matrix `a` to shape `new_shape` using tinygrad.\n Inputs are tinygrad Tensors.\n Returns a Tensor of shape `new_shape`, or an empty Tensor on mismatch.\n \"\"\"\n pass", + "tinygrad_solution": "from tinygrad.tensor import Tensor\n\ndef reshape_matrix_tg(a, new_shape) -> Tensor:\n \"\"\"\n Reshape a 2D matrix `a` to shape `new_shape` using tinygrad.\n Inputs are tinygrad Tensors.\n Returns a Tensor of shape `new_shape`, or an empty Tensor on mismatch.\n \"\"\"\n # Dimension check\n if len(a) * len(a[0]) != new_shape[0] * new_shape[1]:\n return Tensor([])\n return a.reshape(new_shape)", "tinygrad_test_cases": [ { - "test": "from tinygrad.tensor import Tensor\nres = reshape_matrix_tg(\n [[1,2,3],[4,5,6]],\n (3, 2)\n)\nprint(res.numpy().tolist())", + "test": "from tinygrad.tensor import Tensor\nres = reshape_matrix_tg(\n Tensor([[1,2,3],[4,5,6]]),\n (3, 2)\n)\nprint(res.numpy().tolist())", "expected_output": "[[1, 2], [3, 4], [5, 6]]" }, { - "test": "from tinygrad.tensor import Tensor\nres = reshape_matrix_tg(\n [[1,2],[3,4]],\n (3, 2)\n)\nprint(res.numpy().tolist())", + "test": "from tinygrad.tensor import Tensor\nres = reshape_matrix_tg(\n Tensor([[1,2],[3,4]]),\n (3, 2)\n)\nprint(res.numpy().tolist())", "expected_output": "[]" } ], diff --git a/questions/182_empirical_probability_mass_function_(pmf)/description.md b/questions/182_empirical_probability_mass_function_(pmf)/description.md new file mode 100644 index 00000000..9a49f526 --- /dev/null +++ b/questions/182_empirical_probability_mass_function_(pmf)/description.md @@ -0,0 +1,3 @@ +## Problem + +Given a list of integer samples drawn from a discrete distribution, implement a function to compute the empirical Probability Mass Function (PMF). The function should return a list of `(value, probability)` pairs sorted by the value in ascending order. If the input is empty, return an empty list. diff --git a/questions/182_empirical_probability_mass_function_(pmf)/example.json b/questions/182_empirical_probability_mass_function_(pmf)/example.json new file mode 100644 index 00000000..a00d5e97 --- /dev/null +++ b/questions/182_empirical_probability_mass_function_(pmf)/example.json @@ -0,0 +1,5 @@ +{ + "input": "samples = [1, 2, 2, 3, 3, 3]", + "output": "[(1, 0.16666666666666666), (2, 0.3333333333333333), (3, 0.5)]", + "reasoning": "Counts are {1:1, 2:2, 3:3} over 6 samples, so probabilities are 1/6, 2/6, and 3/6 respectively, returned sorted by value." +} diff --git a/questions/182_empirical_probability_mass_function_(pmf)/learn.md b/questions/182_empirical_probability_mass_function_(pmf)/learn.md new file mode 100644 index 00000000..923306e3 --- /dev/null +++ b/questions/182_empirical_probability_mass_function_(pmf)/learn.md @@ -0,0 +1,24 @@ + +# Learn Section + +# Probability Mass Function (PMF) — Simple Explanation + +A **probability mass function (PMF)** describes how probabilities are assigned to the possible outcomes of a **discrete random variable**. + +- It tells you the chance of each specific outcome. +- Each probability is non-negative. +- The total of all probabilities adds up to 1. 
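+
+A minimal sketch of these two properties, using a fair six-sided die as an assumed illustration (not part of the exercise itself):
+
+```python
+# PMF of a fair six-sided die: each face has probability 1/6
+pmf = {face: 1 / 6 for face in range(1, 7)}
+
+assert all(p >= 0 for p in pmf.values())      # every probability is non-negative
+assert abs(sum(pmf.values()) - 1.0) < 1e-12   # probabilities sum to 1
+```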
+ +## Estimating from data +If the true probabilities are unknown, you can estimate them with an **empirical PMF**: +- Count how often each outcome appears. +- Divide by the total number of observations. + +## Example +Observed sequence: `1, 2, 2, 3, 3, 3` (6 outcomes total) +- “1” appears once → estimated probability = 1/6 +- “2” appears twice → estimated probability = 2/6 = 1/3 +- “3” appears three times → estimated probability = 3/6 = 1/2 + + + \ No newline at end of file diff --git a/questions/182_empirical_probability_mass_function_(pmf)/meta.json b/questions/182_empirical_probability_mass_function_(pmf)/meta.json new file mode 100644 index 00000000..a5fc5556 --- /dev/null +++ b/questions/182_empirical_probability_mass_function_(pmf)/meta.json @@ -0,0 +1,15 @@ +{ + "id": "182", + "title": "Empirical Probability Mass Function (PMF)", + "difficulty": "easy", + "category": "Probability & Statistics", + "video": "", + "likes": "0", + "dislikes": "0", + "contributor": [ + { + "profile_link": "https://github.com/jeetmukherjee", + "name": "jeetmukherjee" + } + ] +} diff --git a/questions/182_empirical_probability_mass_function_(pmf)/solution.py b/questions/182_empirical_probability_mass_function_(pmf)/solution.py new file mode 100644 index 00000000..b54775fe --- /dev/null +++ b/questions/182_empirical_probability_mass_function_(pmf)/solution.py @@ -0,0 +1,14 @@ +from collections import Counter + +def empirical_pmf(samples): + """ + Given an iterable of integer samples, return a list of (value, probability) + pairs sorted by value ascending. + """ + samples = list(samples) + if not samples: + return [] + total = len(samples) + cnt = Counter(samples) + result = [(k, cnt[k] / total) for k in sorted(cnt.keys())] + return result \ No newline at end of file diff --git a/questions/182_empirical_probability_mass_function_(pmf)/starter_code.py b/questions/182_empirical_probability_mass_function_(pmf)/starter_code.py new file mode 100644 index 00000000..32b35c14 --- /dev/null +++ b/questions/182_empirical_probability_mass_function_(pmf)/starter_code.py @@ -0,0 +1,7 @@ +def empirical_pmf(samples): + """ + Given an iterable of integer samples, return a list of (value, probability) + pairs sorted by value ascending. 
+ """ + # TODO: Implement the function + pass diff --git a/questions/182_empirical_probability_mass_function_(pmf)/tests.json b/questions/182_empirical_probability_mass_function_(pmf)/tests.json new file mode 100644 index 00000000..d9cbb76b --- /dev/null +++ b/questions/182_empirical_probability_mass_function_(pmf)/tests.json @@ -0,0 +1,18 @@ +[ + { + "test": "print(empirical_pmf([1, 2, 2, 3, 3, 3]))", + "expected_output": "[(1, 0.16666666666666666), (2, 0.3333333333333333), (3, 0.5)]" + }, + { + "test": "print(empirical_pmf([5, 5, 5, 5]))", + "expected_output": "[(5, 1.0)]" + }, + { + "test": "print(empirical_pmf([]))", + "expected_output": "[]" + }, + { + "test": "print(empirical_pmf([0, 0, 1, 1, 1, 2]))", + "expected_output": "[(0, 0.3333333333333333), (1, 0.5), (2, 0.16666666666666666)]" + } +] diff --git a/questions/183_pmf_normalization_constant 2/description.md b/questions/183_pmf_normalization_constant 2/description.md new file mode 100644 index 00000000..7b00dfcc --- /dev/null +++ b/questions/183_pmf_normalization_constant 2/description.md @@ -0,0 +1,16 @@ +## Problem + +A discrete random variable `X` takes values 0 through 7 with probabilities: + +- P(X=0) = 0 +- P(X=1) = K +- P(X=2) = 2K +- P(X=3) = 2K +- P(X=4) = 3K +- P(X=5) = K^2 +- P(X=6) = 2K^2 +- P(X=7) = 7K^2 + K + +Find the value of the normalization constant `K` such that the above defines a valid PMF (i.e., probabilities are non‑negative and sum to 1). + +Implement a function `find_k()` that returns `K` as a Python float. diff --git a/questions/183_pmf_normalization_constant 2/example.json b/questions/183_pmf_normalization_constant 2/example.json new file mode 100644 index 00000000..6d92c364 --- /dev/null +++ b/questions/183_pmf_normalization_constant 2/example.json @@ -0,0 +1,5 @@ +{ + "input": "No input; call find_k()", + "output": "0.1", + "reasoning": "From the normalization condition, 10K^2 + 9K = 1 gives K = 0.1 (non-negative root)." +} diff --git a/questions/183_pmf_normalization_constant 2/learn.md b/questions/183_pmf_normalization_constant 2/learn.md new file mode 100644 index 00000000..a8835809 --- /dev/null +++ b/questions/183_pmf_normalization_constant 2/learn.md @@ -0,0 +1,16 @@ +## Solution Explanation + +For a valid PMF, probabilities must sum to 1. + +Sum all terms: + +- Linear in K: K + 2K + 2K + 3K + K = 9K +- Quadratic in K: K^2 + 2K^2 + 7K^2 = 10K^2 + +Therefore: 10K^2 + 9K = 1 => 10K^2 + 9K - 1 = 0 + +Solve the quadratic: K = [-9 ± sqrt(81 + 40)] / 20 = [-9 ± 11] / 20 + +Feasible solution (K ≥ 0): K = 2/20 = 0.1 + +So the normalization constant is K = 0.1. diff --git a/questions/183_pmf_normalization_constant 2/meta.json b/questions/183_pmf_normalization_constant 2/meta.json new file mode 100644 index 00000000..ddb93b38 --- /dev/null +++ b/questions/183_pmf_normalization_constant 2/meta.json @@ -0,0 +1,15 @@ +{ + "id": "183", + "title": "Find PMF Normalization Constant", + "difficulty": "easy", + "category": "Probability & Statistics", + "video": "", + "likes": "0", + "dislikes": "0", + "contributor": [ + { + "profile_link": "https://github.com/jeetmukherjee", + "name": "jeetmukherjee" + } + ] +} diff --git a/questions/183_pmf_normalization_constant 2/solution.py b/questions/183_pmf_normalization_constant 2/solution.py new file mode 100644 index 00000000..5836d3de --- /dev/null +++ b/questions/183_pmf_normalization_constant 2/solution.py @@ -0,0 +1,14 @@ +import math + +def find_k(): + """ + Solve 10*K^2 + 9*K - 1 = 0 and return the non-negative root. 
+ """ + a = 10.0 + b = 9.0 + c = -1.0 + discriminant = b * b - 4 * a * c + sqrt_disc = math.sqrt(discriminant) + k1 = (-b + sqrt_disc) / (2 * a) + k2 = (-b - sqrt_disc) / (2 * a) + return k1 if k1 >= 0 else k2 diff --git a/questions/183_pmf_normalization_constant 2/starter_code.py b/questions/183_pmf_normalization_constant 2/starter_code.py new file mode 100644 index 00000000..0df463f9 --- /dev/null +++ b/questions/183_pmf_normalization_constant 2/starter_code.py @@ -0,0 +1,6 @@ +def find_k(): + """ + Return the normalization constant K for the given PMF as a float. + """ + # TODO: Solve for K from 10*K**2 + 9*K - 1 = 0 and return the non-negative root + pass diff --git a/questions/183_pmf_normalization_constant 2/tests.json b/questions/183_pmf_normalization_constant 2/tests.json new file mode 100644 index 00000000..d6cb3f10 --- /dev/null +++ b/questions/183_pmf_normalization_constant 2/tests.json @@ -0,0 +1,10 @@ +[ + { + "test": "print(round(find_k(), 10))", + "expected_output": "0.1" + }, + { + "test": "k = find_k(); s = 0 + k + 2*k + 2*k + 3*k + k**2 + 2*k**2 + (7*k**2 + k); print(round(s, 10))", + "expected_output": "1.0" + } +] From 1c96b238d64555c9513779677167e5d821496122 Mon Sep 17 00:00:00 2001 From: Jeet Mukherjee Date: Tue, 7 Oct 2025 17:50:15 +0530 Subject: [PATCH 2/6] Added new ML Ops questions --- build/184.json | 72 +++++++++++++++++++ .../184_mlops-etl-pipeline/description.md | 14 ++++ questions/184_mlops-etl-pipeline/example.json | 5 ++ questions/184_mlops-etl-pipeline/learn.md | 24 +++++++ questions/184_mlops-etl-pipeline/meta.json | 12 ++++ questions/184_mlops-etl-pipeline/solution.py | 43 +++++++++++ .../184_mlops-etl-pipeline/starter_code.py | 9 +++ questions/184_mlops-etl-pipeline/tests.json | 14 ++++ 8 files changed, 193 insertions(+) create mode 100644 build/184.json create mode 100644 questions/184_mlops-etl-pipeline/description.md create mode 100644 questions/184_mlops-etl-pipeline/example.json create mode 100644 questions/184_mlops-etl-pipeline/learn.md create mode 100644 questions/184_mlops-etl-pipeline/meta.json create mode 100644 questions/184_mlops-etl-pipeline/solution.py create mode 100644 questions/184_mlops-etl-pipeline/starter_code.py create mode 100644 questions/184_mlops-etl-pipeline/tests.json diff --git a/build/184.json b/build/184.json new file mode 100644 index 00000000..b595ca27 --- /dev/null +++ b/build/184.json @@ -0,0 +1,72 @@ +{ + "id": "184", + "title": "Build a Simple ETL Pipeline (MLOps)", + "difficulty": "medium", + "category": "MLOps", + "video": "", + "likes": "0", + "dislikes": "0", + "contributor": [ + { + "profile_link": "https://github.com/Jeet009", + "name": "Jeet Mukherjee" + } + ], + "tinygrad_difficulty": "medium", + "pytorch_difficulty": "medium", + "description": "## Problem\n\nImplement a simple ETL (Extract-Transform-Load) pipeline for model-ready data preparation.\n\nGiven a CSV-like string containing user events with columns: `user_id,event_type,value` (header included), write a function `run_etl(csv_text)` that:\n\n1. Extracts rows from the raw CSV text.\n2. Transforms data by:\n\t- Filtering only rows where `event_type == \"purchase\"`.\n\t- Converting `value` to float and dropping invalid rows.\n\t- Aggregating total purchase `value` per `user_id`.\n3. 
Loads the transformed results by returning a list of `(user_id, total_value)` tuples sorted by `user_id` ascending.\n\nAssume small inputs (no external libs), handle extra whitespace, and ignore blank lines.", + "learn_section": "## Solution Explanation\n\nThis task mirrors a minimal MLOps ETL flow that prepares data for downstream modeling.\n\n### ETL breakdown\n- Extract: parse raw CSV text, ignore blanks, and split into header and rows.\n- Transform:\n\t- Filter only relevant records (event_type == \"purchase\").\n\t- Cast `value` to float; discard invalid rows to maintain data quality.\n\t- Aggregate total purchase value per user to create compact features.\n- Load: return a deterministic, sorted list of `(user_id, total_value)`.\n\n### Why this design?\n- Input sanitation prevents runtime errors and poor-quality features.\n- Aggregation compresses event-level logs into user-level features commonly used in models.\n- Sorting produces stable, testable outputs.\n\n### Complexity\n- For N rows, parsing and aggregation run in O(N); sorting unique users U costs O(U log U).\n\n### Extensions\n- Add schema validation and logging.\n- Write outputs to files or databases.\n- Schedule ETL runs and add monitoring for drift and freshness.", + "starter_code": "# Implement your function below.\n\ndef run_etl(csv_text: str) -> list[tuple[str, float]]:\n\t\"\"\"Run a simple ETL pipeline over CSV text with header user_id,event_type,value.\n\n\tReturns a sorted list of (user_id, total_value) for event_type == \"purchase\".\n\t\"\"\"\n\t# TODO: implement extract, transform, and load steps\n\traise NotImplementedError", + "solution": "from typing import List, Tuple\n\n\ndef run_etl(csv_text: str) -> List[Tuple[str, float]]:\n\t\"\"\"Reference ETL implementation.\n\n\t- Extract: parse CSV text, skip header, strip whitespace, ignore blanks\n\t- Transform: keep event_type == \"purchase\"; parse value as float; aggregate per user\n\t- Load: return sorted list of (user_id, total_value) by user_id asc\n\t\"\"\"\n\tlines = [line.strip() for line in csv_text.splitlines() if line.strip()]\n\tif not lines:\n\t\treturn []\n\t# header\n\theader = lines[0]\n\trows = lines[1:]\n\n\t# indices from header (allow varying order and case)\n\theaders = [h.strip().lower() for h in header.split(\",\")]\n\ttry:\n\t\tidx_user = headers.index(\"user_id\")\n\t\tidx_event = headers.index(\"event_type\")\n\t\tidx_value = headers.index(\"value\")\n\texcept ValueError:\n\t\t# header missing required columns\n\t\treturn []\n\n\taggregates: dict[str, float] = {}\n\tfor row in rows:\n\t\tparts = [c.strip() for c in row.split(\",\")]\n\t\tif len(parts) <= max(idx_user, idx_event, idx_value):\n\t\t\tcontinue\n\t\tuser_id = parts[idx_user]\n\t\tevent_type = parts[idx_event].lower()\n\t\tif event_type != \"purchase\":\n\t\t\tcontinue\n\t\ttry:\n\t\t\tvalue = float(parts[idx_value])\n\t\texcept ValueError:\n\t\t\tcontinue\n\t\taggregates[user_id] = aggregates.get(user_id, 0.0) + value\n\n\treturn sorted(aggregates.items(), key=lambda kv: kv[0])", + "example": { + "input": "run_etl(\"user_id,event_type,value\\n u1, purchase, 10.0\\n u2, view, 1.0\\n u1, purchase, 5\\n u3, purchase, not_a_number\\n u2, purchase, 3.5 \\n\\n\")", + "output": "[('u1', 15.0), ('u2', 3.5)]", + "reasoning": "Keep only purchases; convert values; drop invalid; aggregate per user; sort by user_id." 
+ }, + "test_cases": [ + { + "test": "from solution import run_etl; print(run_etl('user_id,event_type,value\n u1, purchase, 10.0\n u2, view, 1.0\n u1, purchase, 5\n u3, purchase, not_a_number\n u2, purchase, 3.5 \n'))", + "expected_output": "[('u1', 15.0), ('u2', 3.5)]" + }, + { + "test": "from solution import run_etl; print(run_etl('user_id,event_type,value\n'))", + "expected_output": "[]" + }, + { + "test": "from solution import run_etl; print(run_etl('value,event_type,user_id\n 1.0, purchase, u1\n 2.0, purchase, u1\n'))", + "expected_output": "[('u1', 3.0)]" + } + ], + "tinygrad_starter_code": "from typing import List, Tuple\n\n\ndef run_etl(csv_text: str) -> List[Tuple[str, float]]:\n\t\"\"\"Run a simple ETL pipeline and return (user_id, total_value) sorted.\"\"\"\n\traise NotImplementedError", + "tinygrad_solution": "from typing import List, Tuple\n\n\ndef run_etl(csv_text: str) -> List[Tuple[str, float]]:\n\tlines = [line.strip() for line in csv_text.splitlines() if line.strip()]\n\tif not lines:\n\t\treturn []\n\theaders = [h.strip().lower() for h in lines[0].split(\",\")]\n\ttry:\n\t\tidx_user = headers.index(\"user_id\")\n\t\tidx_event = headers.index(\"event_type\")\n\t\tidx_value = headers.index(\"value\")\n\texcept ValueError:\n\t\treturn []\n\n\taggregates: dict[str, float] = {}\n\tfor row in lines[1:]:\n\t\tparts = [c.strip() for c in row.split(\",\")]\n\t\tif len(parts) <= max(idx_user, idx_event, idx_value):\n\t\t\tcontinue\n\t\tif parts[idx_event].lower() != \"purchase\":\n\t\t\tcontinue\n\t\ttry:\n\t\t\tvalue = float(parts[idx_value])\n\t\texcept ValueError:\n\t\t\tcontinue\n\t\tuser_id = parts[idx_user]\n\t\taggregates[user_id] = aggregates.get(user_id, 0.0) + value\n\n\treturn sorted(aggregates.items(), key=lambda kv: kv[0])", + "tinygrad_test_cases": [ + { + "test": "from solution import run_etl; print(run_etl('user_id,event_type,value\n u1, purchase, 10.0\n u2, view, 1.0\n u1, purchase, 5\n u3, purchase, not_a_number\n u2, purchase, 3.5 \n'))", + "expected_output": "[('u1', 15.0), ('u2', 3.5)]" + }, + { + "test": "from solution import run_etl; print(run_etl('user_id,event_type,value\n'))", + "expected_output": "[]" + }, + { + "test": "from solution import run_etl; print(run_etl('value,event_type,user_id\n 1.0, purchase, u1\n 2.0, purchase, u1\n'))", + "expected_output": "[('u1', 3.0)]" + } + ], + "pytorch_starter_code": "from typing import List, Tuple\n\n\ndef run_etl(csv_text: str) -> List[Tuple[str, float]]:\n\t\"\"\"Run a simple ETL pipeline and return (user_id, total_value) sorted.\"\"\"\n\traise NotImplementedError", + "pytorch_solution": "from typing import List, Tuple\n\n\ndef run_etl(csv_text: str) -> List[Tuple[str, float]]:\n\tlines = [line.strip() for line in csv_text.splitlines() if line.strip()]\n\tif not lines:\n\t\treturn []\n\theaders = [h.strip().lower() for h in lines[0].split(\",\")]\n\ttry:\n\t\tidx_user = headers.index(\"user_id\")\n\t\tidx_event = headers.index(\"event_type\")\n\t\tidx_value = headers.index(\"value\")\n\texcept ValueError:\n\t\treturn []\n\n\taggregates: dict[str, float] = {}\n\tfor row in lines[1:]:\n\t\tparts = [c.strip() for c in row.split(\",\")]\n\t\tif len(parts) <= max(idx_user, idx_event, idx_value):\n\t\t\tcontinue\n\t\tif parts[idx_event].lower() != \"purchase\":\n\t\t\tcontinue\n\t\ttry:\n\t\t\tvalue = float(parts[idx_value])\n\t\texcept ValueError:\n\t\t\tcontinue\n\t\tuser_id = parts[idx_user]\n\t\taggregates[user_id] = aggregates.get(user_id, 0.0) + value\n\n\treturn sorted(aggregates.items(), key=lambda kv: 
kv[0])", + "pytorch_test_cases": [ + { + "test": "from solution import run_etl; print(run_etl('user_id,event_type,value\n u1, purchase, 10.0\n u2, view, 1.0\n u1, purchase, 5\n u3, purchase, not_a_number\n u2, purchase, 3.5 \n'))", + "expected_output": "[('u1', 15.0), ('u2', 3.5)]" + }, + { + "test": "from solution import run_etl; print(run_etl('user_id,event_type,value\n'))", + "expected_output": "[]" + }, + { + "test": "from solution import run_etl; print(run_etl('value,event_type,user_id\n 1.0, purchase, u1\n 2.0, purchase, u1\n'))", + "expected_output": "[('u1', 3.0)]" + } + ] +} \ No newline at end of file diff --git a/questions/184_mlops-etl-pipeline/description.md b/questions/184_mlops-etl-pipeline/description.md new file mode 100644 index 00000000..5adfa2df --- /dev/null +++ b/questions/184_mlops-etl-pipeline/description.md @@ -0,0 +1,14 @@ +## Problem + +Implement a simple ETL (Extract-Transform-Load) pipeline for model-ready data preparation. + +Given a CSV-like string containing user events with columns: `user_id,event_type,value` (header included), write a function `run_etl(csv_text)` that: + +1. Extracts rows from the raw CSV text. +2. Transforms data by: + - Filtering only rows where `event_type == "purchase"`. + - Converting `value` to float and dropping invalid rows. + - Aggregating total purchase `value` per `user_id`. +3. Loads the transformed results by returning a list of `(user_id, total_value)` tuples sorted by `user_id` ascending. + +Assume small inputs (no external libs), handle extra whitespace, and ignore blank lines. diff --git a/questions/184_mlops-etl-pipeline/example.json b/questions/184_mlops-etl-pipeline/example.json new file mode 100644 index 00000000..84952417 --- /dev/null +++ b/questions/184_mlops-etl-pipeline/example.json @@ -0,0 +1,5 @@ +{ + "input": "run_etl(\"user_id,event_type,value\\n u1, purchase, 10.0\\n u2, view, 1.0\\n u1, purchase, 5\\n u3, purchase, not_a_number\\n u2, purchase, 3.5 \\n\\n\")", + "output": "[('u1', 15.0), ('u2', 3.5)]", + "reasoning": "Keep only purchases; convert values; drop invalid; aggregate per user; sort by user_id." +} diff --git a/questions/184_mlops-etl-pipeline/learn.md b/questions/184_mlops-etl-pipeline/learn.md new file mode 100644 index 00000000..d523e6a1 --- /dev/null +++ b/questions/184_mlops-etl-pipeline/learn.md @@ -0,0 +1,24 @@ +## Solution Explanation + +This task mirrors a minimal MLOps ETL flow that prepares data for downstream modeling. + +### ETL breakdown +- Extract: parse raw CSV text, ignore blanks, and split into header and rows. +- Transform: + - Filter only relevant records (event_type == "purchase"). + - Cast `value` to float; discard invalid rows to maintain data quality. + - Aggregate total purchase value per user to create compact features. +- Load: return a deterministic, sorted list of `(user_id, total_value)`. + +### Why this design? +- Input sanitation prevents runtime errors and poor-quality features. +- Aggregation compresses event-level logs into user-level features commonly used in models. +- Sorting produces stable, testable outputs. + +### Complexity +- For N rows, parsing and aggregation run in O(N); sorting unique users U costs O(U log U). + +### Extensions +- Add schema validation and logging. +- Write outputs to files or databases. +- Schedule ETL runs and add monitoring for drift and freshness. 
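+
+### Usage sketch
+A quick end-to-end check, assuming `run_etl` has been implemented as described (the input mirrors the example in `example.json`):
+
+```python
+csv_text = (
+    "user_id,event_type,value\n"
+    " u1, purchase, 10.0\n"
+    " u2, view, 1.0\n"
+    " u1, purchase, 5\n"
+    " u3, purchase, not_a_number\n"
+    " u2, purchase, 3.5 \n"
+)
+
+# Only purchases are kept, the invalid value row is dropped, totals are
+# aggregated per user, and results come back sorted by user_id.
+print(run_etl(csv_text))  # [('u1', 15.0), ('u2', 3.5)]
+```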
diff --git a/questions/184_mlops-etl-pipeline/meta.json b/questions/184_mlops-etl-pipeline/meta.json new file mode 100644 index 00000000..83466df6 --- /dev/null +++ b/questions/184_mlops-etl-pipeline/meta.json @@ -0,0 +1,12 @@ +{ + "id": "184", + "title": "Build a Simple ETL Pipeline (MLOps)", + "difficulty": "medium", + "category": "MLOps", + "video": "", + "likes": "0", + "dislikes": "0", + "contributor": [ + { "profile_link": "https://github.com/Jeet009", "name": "Jeet Mukherjee" } + ] +} diff --git a/questions/184_mlops-etl-pipeline/solution.py b/questions/184_mlops-etl-pipeline/solution.py new file mode 100644 index 00000000..19b9a275 --- /dev/null +++ b/questions/184_mlops-etl-pipeline/solution.py @@ -0,0 +1,43 @@ +from typing import List, Tuple + + +def run_etl(csv_text: str) -> List[Tuple[str, float]]: + """Reference ETL implementation. + + - Extract: parse CSV text, skip header, strip whitespace, ignore blanks + - Transform: keep event_type == "purchase"; parse value as float; aggregate per user + - Load: return sorted list of (user_id, total_value) by user_id asc + """ + lines = [line.strip() for line in csv_text.splitlines() if line.strip()] + if not lines: + return [] + # header + header = lines[0] + rows = lines[1:] + + # indices from header (allow varying order and case) + headers = [h.strip().lower() for h in header.split(",")] + try: + idx_user = headers.index("user_id") + idx_event = headers.index("event_type") + idx_value = headers.index("value") + except ValueError: + # header missing required columns + return [] + + aggregates: dict[str, float] = {} + for row in rows: + parts = [c.strip() for c in row.split(",")] + if len(parts) <= max(idx_user, idx_event, idx_value): + continue + user_id = parts[idx_user] + event_type = parts[idx_event].lower() + if event_type != "purchase": + continue + try: + value = float(parts[idx_value]) + except ValueError: + continue + aggregates[user_id] = aggregates.get(user_id, 0.0) + value + + return sorted(aggregates.items(), key=lambda kv: kv[0]) diff --git a/questions/184_mlops-etl-pipeline/starter_code.py b/questions/184_mlops-etl-pipeline/starter_code.py new file mode 100644 index 00000000..65002026 --- /dev/null +++ b/questions/184_mlops-etl-pipeline/starter_code.py @@ -0,0 +1,9 @@ +# Implement your function below. + +def run_etl(csv_text: str) -> list[tuple[str, float]]: + """Run a simple ETL pipeline over CSV text with header user_id,event_type,value. + + Returns a sorted list of (user_id, total_value) for event_type == "purchase". 
+ """ + # TODO: implement extract, transform, and load steps + raise NotImplementedError diff --git a/questions/184_mlops-etl-pipeline/tests.json b/questions/184_mlops-etl-pipeline/tests.json new file mode 100644 index 00000000..379e5d8e --- /dev/null +++ b/questions/184_mlops-etl-pipeline/tests.json @@ -0,0 +1,14 @@ +[ + { + "test": "from solution import run_etl; print(run_etl('user_id,event_type,value\n u1, purchase, 10.0\n u2, view, 1.0\n u1, purchase, 5\n u3, purchase, not_a_number\n u2, purchase, 3.5 \n'))", + "expected_output": "[('u1', 15.0), ('u2', 3.5)]" + }, + { + "test": "from solution import run_etl; print(run_etl('user_id,event_type,value\n'))", + "expected_output": "[]" + }, + { + "test": "from solution import run_etl; print(run_etl('value,event_type,user_id\n 1.0, purchase, u1\n 2.0, purchase, u1\n'))", + "expected_output": "[('u1', 3.0)]" + } +] From 784bf192bb1a659c0b5ec023b721f0ee40059c64 Mon Sep 17 00:00:00 2001 From: Jeet Mukherjee Date: Tue, 7 Oct 2025 18:04:26 +0530 Subject: [PATCH 3/6] added new question on mlops --- build/184.json | 34 ----------------- build/185.json | 38 +++++++++++++++++++ questions/185_data-drift-basic/description.md | 13 +++++++ questions/185_data-drift-basic/example.json | 5 +++ questions/185_data-drift-basic/learn.md | 18 +++++++++ questions/185_data-drift-basic/meta.json | 12 ++++++ questions/185_data-drift-basic/solution.py | 24 ++++++++++++ .../185_data-drift-basic/starter_code.py | 10 +++++ questions/185_data-drift-basic/tests.json | 5 +++ 9 files changed, 125 insertions(+), 34 deletions(-) create mode 100644 build/185.json create mode 100644 questions/185_data-drift-basic/description.md create mode 100644 questions/185_data-drift-basic/example.json create mode 100644 questions/185_data-drift-basic/learn.md create mode 100644 questions/185_data-drift-basic/meta.json create mode 100644 questions/185_data-drift-basic/solution.py create mode 100644 questions/185_data-drift-basic/starter_code.py create mode 100644 questions/185_data-drift-basic/tests.json diff --git a/build/184.json b/build/184.json index b595ca27..a20aead7 100644 --- a/build/184.json +++ b/build/184.json @@ -12,8 +12,6 @@ "name": "Jeet Mukherjee" } ], - "tinygrad_difficulty": "medium", - "pytorch_difficulty": "medium", "description": "## Problem\n\nImplement a simple ETL (Extract-Transform-Load) pipeline for model-ready data preparation.\n\nGiven a CSV-like string containing user events with columns: `user_id,event_type,value` (header included), write a function `run_etl(csv_text)` that:\n\n1. Extracts rows from the raw CSV text.\n2. Transforms data by:\n\t- Filtering only rows where `event_type == \"purchase\"`.\n\t- Converting `value` to float and dropping invalid rows.\n\t- Aggregating total purchase `value` per `user_id`.\n3. 
Loads the transformed results by returning a list of `(user_id, total_value)` tuples sorted by `user_id` ascending.\n\nAssume small inputs (no external libs), handle extra whitespace, and ignore blank lines.", "learn_section": "## Solution Explanation\n\nThis task mirrors a minimal MLOps ETL flow that prepares data for downstream modeling.\n\n### ETL breakdown\n- Extract: parse raw CSV text, ignore blanks, and split into header and rows.\n- Transform:\n\t- Filter only relevant records (event_type == \"purchase\").\n\t- Cast `value` to float; discard invalid rows to maintain data quality.\n\t- Aggregate total purchase value per user to create compact features.\n- Load: return a deterministic, sorted list of `(user_id, total_value)`.\n\n### Why this design?\n- Input sanitation prevents runtime errors and poor-quality features.\n- Aggregation compresses event-level logs into user-level features commonly used in models.\n- Sorting produces stable, testable outputs.\n\n### Complexity\n- For N rows, parsing and aggregation run in O(N); sorting unique users U costs O(U log U).\n\n### Extensions\n- Add schema validation and logging.\n- Write outputs to files or databases.\n- Schedule ETL runs and add monitoring for drift and freshness.", "starter_code": "# Implement your function below.\n\ndef run_etl(csv_text: str) -> list[tuple[str, float]]:\n\t\"\"\"Run a simple ETL pipeline over CSV text with header user_id,event_type,value.\n\n\tReturns a sorted list of (user_id, total_value) for event_type == \"purchase\".\n\t\"\"\"\n\t# TODO: implement extract, transform, and load steps\n\traise NotImplementedError", @@ -36,37 +34,5 @@ "test": "from solution import run_etl; print(run_etl('value,event_type,user_id\n 1.0, purchase, u1\n 2.0, purchase, u1\n'))", "expected_output": "[('u1', 3.0)]" } - ], - "tinygrad_starter_code": "from typing import List, Tuple\n\n\ndef run_etl(csv_text: str) -> List[Tuple[str, float]]:\n\t\"\"\"Run a simple ETL pipeline and return (user_id, total_value) sorted.\"\"\"\n\traise NotImplementedError", - "tinygrad_solution": "from typing import List, Tuple\n\n\ndef run_etl(csv_text: str) -> List[Tuple[str, float]]:\n\tlines = [line.strip() for line in csv_text.splitlines() if line.strip()]\n\tif not lines:\n\t\treturn []\n\theaders = [h.strip().lower() for h in lines[0].split(\",\")]\n\ttry:\n\t\tidx_user = headers.index(\"user_id\")\n\t\tidx_event = headers.index(\"event_type\")\n\t\tidx_value = headers.index(\"value\")\n\texcept ValueError:\n\t\treturn []\n\n\taggregates: dict[str, float] = {}\n\tfor row in lines[1:]:\n\t\tparts = [c.strip() for c in row.split(\",\")]\n\t\tif len(parts) <= max(idx_user, idx_event, idx_value):\n\t\t\tcontinue\n\t\tif parts[idx_event].lower() != \"purchase\":\n\t\t\tcontinue\n\t\ttry:\n\t\t\tvalue = float(parts[idx_value])\n\t\texcept ValueError:\n\t\t\tcontinue\n\t\tuser_id = parts[idx_user]\n\t\taggregates[user_id] = aggregates.get(user_id, 0.0) + value\n\n\treturn sorted(aggregates.items(), key=lambda kv: kv[0])", - "tinygrad_test_cases": [ - { - "test": "from solution import run_etl; print(run_etl('user_id,event_type,value\n u1, purchase, 10.0\n u2, view, 1.0\n u1, purchase, 5\n u3, purchase, not_a_number\n u2, purchase, 3.5 \n'))", - "expected_output": "[('u1', 15.0), ('u2', 3.5)]" - }, - { - "test": "from solution import run_etl; print(run_etl('user_id,event_type,value\n'))", - "expected_output": "[]" - }, - { - "test": "from solution import run_etl; print(run_etl('value,event_type,user_id\n 1.0, purchase, u1\n 2.0, purchase, u1\n'))", - 
"expected_output": "[('u1', 3.0)]" - } - ], - "pytorch_starter_code": "from typing import List, Tuple\n\n\ndef run_etl(csv_text: str) -> List[Tuple[str, float]]:\n\t\"\"\"Run a simple ETL pipeline and return (user_id, total_value) sorted.\"\"\"\n\traise NotImplementedError", - "pytorch_solution": "from typing import List, Tuple\n\n\ndef run_etl(csv_text: str) -> List[Tuple[str, float]]:\n\tlines = [line.strip() for line in csv_text.splitlines() if line.strip()]\n\tif not lines:\n\t\treturn []\n\theaders = [h.strip().lower() for h in lines[0].split(\",\")]\n\ttry:\n\t\tidx_user = headers.index(\"user_id\")\n\t\tidx_event = headers.index(\"event_type\")\n\t\tidx_value = headers.index(\"value\")\n\texcept ValueError:\n\t\treturn []\n\n\taggregates: dict[str, float] = {}\n\tfor row in lines[1:]:\n\t\tparts = [c.strip() for c in row.split(\",\")]\n\t\tif len(parts) <= max(idx_user, idx_event, idx_value):\n\t\t\tcontinue\n\t\tif parts[idx_event].lower() != \"purchase\":\n\t\t\tcontinue\n\t\ttry:\n\t\t\tvalue = float(parts[idx_value])\n\t\texcept ValueError:\n\t\t\tcontinue\n\t\tuser_id = parts[idx_user]\n\t\taggregates[user_id] = aggregates.get(user_id, 0.0) + value\n\n\treturn sorted(aggregates.items(), key=lambda kv: kv[0])", - "pytorch_test_cases": [ - { - "test": "from solution import run_etl; print(run_etl('user_id,event_type,value\n u1, purchase, 10.0\n u2, view, 1.0\n u1, purchase, 5\n u3, purchase, not_a_number\n u2, purchase, 3.5 \n'))", - "expected_output": "[('u1', 15.0), ('u2', 3.5)]" - }, - { - "test": "from solution import run_etl; print(run_etl('user_id,event_type,value\n'))", - "expected_output": "[]" - }, - { - "test": "from solution import run_etl; print(run_etl('value,event_type,user_id\n 1.0, purchase, u1\n 2.0, purchase, u1\n'))", - "expected_output": "[('u1', 3.0)]" - } ] } \ No newline at end of file diff --git a/build/185.json b/build/185.json new file mode 100644 index 00000000..98cd23f8 --- /dev/null +++ b/build/185.json @@ -0,0 +1,38 @@ +{ + "id": "185", + "title": "Basic Data Drift Check: Mean and Variance Thresholds", + "difficulty": "easy", + "category": "MLOps", + "video": "", + "likes": "0", + "dislikes": "0", + "contributor": [ + { + "profile_link": "https://github.com/Jeet009", + "name": "Jeet Mukherjee" + } + ], + "description": "## Problem\n\nImplement a basic data drift check comparing two numeric datasets (reference vs. current).\n\nWrite a function `check_drift(ref, cur, mean_threshold, var_threshold)` that:\n\n- Accepts two lists of numbers `ref` and `cur`.\n- Computes the absolute difference in means and variances.\n- Returns a tuple `(mean_drift, var_drift)` where each element is a boolean indicating whether drift exceeds the corresponding threshold:\n\t- `mean_drift = abs(mean(ref) - mean(cur)) > mean_threshold`\n\t- `var_drift = abs(var(ref) - var(cur)) > var_threshold`\n\nAssume population variance (divide by N). Handle empty inputs by returning `(False, False)`.", + "learn_section": "## Solution Explanation\n\nWe compare two numeric samples (reference vs. current) using mean and variance with user-defined thresholds.\n\n### Definitions\n- Mean: \\( \\mu = \\frac{1}{N}\\sum_i x_i \\)\n- Population variance: \\( \\sigma^2 = \\frac{1}{N}\\sum_i (x_i - \\mu)^2 \\)\n\n### Drift rules\n- Mean drift if \\(|\\mu_{ref} - \\mu_{cur}| > \\tau_{mean}\\)\n- Variance drift if \\(|\\sigma^2_{ref} - \\sigma^2_{cur}| > \\tau_{var}\\)\n\n### Edge cases\n- If either sample is empty, return `(False, False)` to avoid false alarms.\n- Population vs. 
sample variance: we use population here to match many monitoring setups. Either is fine if used consistently.\n\n### Complexity\n- O(N + M) to compute stats; O(1) extra space.", + "starter_code": "from typing import List, Tuple\n\n\ndef check_drift(ref: List[float], cur: List[float], mean_threshold: float, var_threshold: float) -> Tuple[bool, bool]:\n\t\"\"\"Return (mean_drift, var_drift) comparing ref vs cur with given thresholds.\n\n\tUse population variance.\n\t\"\"\"\n\t# TODO: handle empty inputs; compute means and variances; compare with thresholds\n\traise NotImplementedError", + "solution": "from typing import List, Tuple\n\n\ndef _mean(xs: List[float]) -> float:\n\treturn sum(xs) / len(xs) if xs else 0.0\n\n\ndef _var(xs: List[float]) -> float:\n\tif not xs:\n\t\treturn 0.0\n\tm = _mean(xs)\n\treturn sum((x - m) * (x - m) for x in xs) / len(xs)\n\n\ndef check_drift(ref: List[float], cur: List[float], mean_threshold: float, var_threshold: float) -> Tuple[bool, bool]:\n\tif not ref or not cur:\n\t\treturn (False, False)\n\tmean_ref = _mean(ref)\n\tmean_cur = _mean(cur)\n\tvar_ref = _var(ref)\n\tvar_cur = _var(cur)\n\tmean_drift = abs(mean_ref - mean_cur) > mean_threshold\n\tvar_drift = abs(var_ref - var_cur) > var_threshold\n\treturn (mean_drift, var_drift)", + "example": { + "input": "check_drift([1, 2, 3], [1.1, 2.2, 3.3], 0.05, 0.1)", + "output": "(True, True)", + "reasoning": "Mean(ref)=2.0, Mean(cur)=2.2 → |Δ|=0.2>0.05. Var(ref)=2/3≈0.667; Var(cur)=1.21×0.667≈0.807 → |Δ|≈0.14>0.1." + }, + "test_cases": [ + { + "test": "from solution import check_drift; print(check_drift([1,2,3], [1.1,2.2,3.3], 0.05, 0.1))", + "expected_output": "(True, True)" + }, + { + "test": "from solution import check_drift; print(check_drift([0,0,0], [0,0,0], 0.01, 0.01))", + "expected_output": "(False, False)" + }, + { + "test": "from solution import check_drift; print(check_drift([], [1,2,3], 0.01, 0.01))", + "expected_output": "(False, False)" + } + ] +} \ No newline at end of file diff --git a/questions/185_data-drift-basic/description.md b/questions/185_data-drift-basic/description.md new file mode 100644 index 00000000..9b0815d3 --- /dev/null +++ b/questions/185_data-drift-basic/description.md @@ -0,0 +1,13 @@ +## Problem + +Implement a basic data drift check comparing two numeric datasets (reference vs. current). + +Write a function `check_drift(ref, cur, mean_threshold, var_threshold)` that: + +- Accepts two lists of numbers `ref` and `cur`. +- Computes the absolute difference in means and variances. +- Returns a tuple `(mean_drift, var_drift)` where each element is a boolean indicating whether drift exceeds the corresponding threshold: + - `mean_drift = abs(mean(ref) - mean(cur)) > mean_threshold` + - `var_drift = abs(var(ref) - var(cur)) > var_threshold` + +Assume population variance (divide by N). Handle empty inputs by returning `(False, False)`. diff --git a/questions/185_data-drift-basic/example.json b/questions/185_data-drift-basic/example.json new file mode 100644 index 00000000..960d233a --- /dev/null +++ b/questions/185_data-drift-basic/example.json @@ -0,0 +1,5 @@ +{ + "input": "check_drift([1, 2, 3], [1.1, 2.2, 3.3], 0.05, 0.1)", + "output": "(True, True)", + "reasoning": "Mean(ref)=2.0, Mean(cur)=2.2 → |Δ|=0.2>0.05. Var(ref)=2/3≈0.667; Var(cur)=1.21×0.667≈0.807 → |Δ|≈0.14>0.1." 
+} diff --git a/questions/185_data-drift-basic/learn.md b/questions/185_data-drift-basic/learn.md new file mode 100644 index 00000000..13d4baa5 --- /dev/null +++ b/questions/185_data-drift-basic/learn.md @@ -0,0 +1,18 @@ +## Solution Explanation + +We compare two numeric samples (reference vs. current) using mean and variance with user-defined thresholds. + +### Definitions +- Mean: \( \mu = \frac{1}{N}\sum_i x_i \) +- Population variance: \( \sigma^2 = \frac{1}{N}\sum_i (x_i - \mu)^2 \) + +### Drift rules +- Mean drift if \(|\mu_{ref} - \mu_{cur}| > \tau_{mean}\) +- Variance drift if \(|\sigma^2_{ref} - \sigma^2_{cur}| > \tau_{var}\) + +### Edge cases +- If either sample is empty, return `(False, False)` to avoid false alarms. +- Population vs. sample variance: we use population here to match many monitoring setups. Either is fine if used consistently. + +### Complexity +- O(N + M) to compute stats; O(1) extra space. diff --git a/questions/185_data-drift-basic/meta.json b/questions/185_data-drift-basic/meta.json new file mode 100644 index 00000000..a70df836 --- /dev/null +++ b/questions/185_data-drift-basic/meta.json @@ -0,0 +1,12 @@ +{ + "id": "185", + "title": "Basic Data Drift Check: Mean and Variance Thresholds", + "difficulty": "easy", + "category": "MLOps", + "video": "", + "likes": "0", + "dislikes": "0", + "contributor": [ + { "profile_link": "https://github.com/Jeet009", "name": "Jeet Mukherjee" } + ] +} diff --git a/questions/185_data-drift-basic/solution.py b/questions/185_data-drift-basic/solution.py new file mode 100644 index 00000000..867bdbbe --- /dev/null +++ b/questions/185_data-drift-basic/solution.py @@ -0,0 +1,24 @@ +from typing import List, Tuple + + +def _mean(xs: List[float]) -> float: + return sum(xs) / len(xs) if xs else 0.0 + + +def _var(xs: List[float]) -> float: + if not xs: + return 0.0 + m = _mean(xs) + return sum((x - m) * (x - m) for x in xs) / len(xs) + + +def check_drift(ref: List[float], cur: List[float], mean_threshold: float, var_threshold: float) -> Tuple[bool, bool]: + if not ref or not cur: + return (False, False) + mean_ref = _mean(ref) + mean_cur = _mean(cur) + var_ref = _var(ref) + var_cur = _var(cur) + mean_drift = abs(mean_ref - mean_cur) > mean_threshold + var_drift = abs(var_ref - var_cur) > var_threshold + return (mean_drift, var_drift) diff --git a/questions/185_data-drift-basic/starter_code.py b/questions/185_data-drift-basic/starter_code.py new file mode 100644 index 00000000..d47c256f --- /dev/null +++ b/questions/185_data-drift-basic/starter_code.py @@ -0,0 +1,10 @@ +from typing import List, Tuple + + +def check_drift(ref: List[float], cur: List[float], mean_threshold: float, var_threshold: float) -> Tuple[bool, bool]: + """Return (mean_drift, var_drift) comparing ref vs cur with given thresholds. + + Use population variance. 
+ """ + # TODO: handle empty inputs; compute means and variances; compare with thresholds + raise NotImplementedError diff --git a/questions/185_data-drift-basic/tests.json b/questions/185_data-drift-basic/tests.json new file mode 100644 index 00000000..cd22070d --- /dev/null +++ b/questions/185_data-drift-basic/tests.json @@ -0,0 +1,5 @@ +[ + { "test": "from solution import check_drift; print(check_drift([1,2,3], [1.1,2.2,3.3], 0.05, 0.1))", "expected_output": "(True, True)" }, + { "test": "from solution import check_drift; print(check_drift([0,0,0], [0,0,0], 0.01, 0.01))", "expected_output": "(False, False)" }, + { "test": "from solution import check_drift; print(check_drift([], [1,2,3], 0.01, 0.01))", "expected_output": "(False, False)" } +] From 1ea7032bf12358dbb541abba595f7eb6e06196d1 Mon Sep 17 00:00:00 2001 From: Jeet Mukherjee Date: Fri, 10 Oct 2025 11:53:57 +0530 Subject: [PATCH 4/6] data drift problem --- .../description.md | 16 ------- .../example.json | 5 --- .../183_pmf_normalization_constant 2/learn.md | 16 ------- .../meta.json | 15 ------- .../solution.py | 14 ------ .../starter_code.py | 6 --- .../tests.json | 10 ----- .../184_mlops-etl-pipeline/description.md | 14 ------ questions/184_mlops-etl-pipeline/example.json | 5 --- questions/184_mlops-etl-pipeline/learn.md | 24 ----------- questions/184_mlops-etl-pipeline/meta.json | 12 ------ questions/184_mlops-etl-pipeline/solution.py | 43 ------------------- .../184_mlops-etl-pipeline/starter_code.py | 9 ---- questions/184_mlops-etl-pipeline/tests.json | 14 ------ 14 files changed, 203 deletions(-) delete mode 100644 questions/183_pmf_normalization_constant 2/description.md delete mode 100644 questions/183_pmf_normalization_constant 2/example.json delete mode 100644 questions/183_pmf_normalization_constant 2/learn.md delete mode 100644 questions/183_pmf_normalization_constant 2/meta.json delete mode 100644 questions/183_pmf_normalization_constant 2/solution.py delete mode 100644 questions/183_pmf_normalization_constant 2/starter_code.py delete mode 100644 questions/183_pmf_normalization_constant 2/tests.json delete mode 100644 questions/184_mlops-etl-pipeline/description.md delete mode 100644 questions/184_mlops-etl-pipeline/example.json delete mode 100644 questions/184_mlops-etl-pipeline/learn.md delete mode 100644 questions/184_mlops-etl-pipeline/meta.json delete mode 100644 questions/184_mlops-etl-pipeline/solution.py delete mode 100644 questions/184_mlops-etl-pipeline/starter_code.py delete mode 100644 questions/184_mlops-etl-pipeline/tests.json diff --git a/questions/183_pmf_normalization_constant 2/description.md b/questions/183_pmf_normalization_constant 2/description.md deleted file mode 100644 index 7b00dfcc..00000000 --- a/questions/183_pmf_normalization_constant 2/description.md +++ /dev/null @@ -1,16 +0,0 @@ -## Problem - -A discrete random variable `X` takes values 0 through 7 with probabilities: - -- P(X=0) = 0 -- P(X=1) = K -- P(X=2) = 2K -- P(X=3) = 2K -- P(X=4) = 3K -- P(X=5) = K^2 -- P(X=6) = 2K^2 -- P(X=7) = 7K^2 + K - -Find the value of the normalization constant `K` such that the above defines a valid PMF (i.e., probabilities are non‑negative and sum to 1). - -Implement a function `find_k()` that returns `K` as a Python float. 
diff --git a/questions/183_pmf_normalization_constant 2/example.json b/questions/183_pmf_normalization_constant 2/example.json deleted file mode 100644 index 6d92c364..00000000 --- a/questions/183_pmf_normalization_constant 2/example.json +++ /dev/null @@ -1,5 +0,0 @@ -{ - "input": "No input; call find_k()", - "output": "0.1", - "reasoning": "From the normalization condition, 10K^2 + 9K = 1 gives K = 0.1 (non-negative root)." -} diff --git a/questions/183_pmf_normalization_constant 2/learn.md b/questions/183_pmf_normalization_constant 2/learn.md deleted file mode 100644 index a8835809..00000000 --- a/questions/183_pmf_normalization_constant 2/learn.md +++ /dev/null @@ -1,16 +0,0 @@ -## Solution Explanation - -For a valid PMF, probabilities must sum to 1. - -Sum all terms: - -- Linear in K: K + 2K + 2K + 3K + K = 9K -- Quadratic in K: K^2 + 2K^2 + 7K^2 = 10K^2 - -Therefore: 10K^2 + 9K = 1 => 10K^2 + 9K - 1 = 0 - -Solve the quadratic: K = [-9 ± sqrt(81 + 40)] / 20 = [-9 ± 11] / 20 - -Feasible solution (K ≥ 0): K = 2/20 = 0.1 - -So the normalization constant is K = 0.1. diff --git a/questions/183_pmf_normalization_constant 2/meta.json b/questions/183_pmf_normalization_constant 2/meta.json deleted file mode 100644 index ddb93b38..00000000 --- a/questions/183_pmf_normalization_constant 2/meta.json +++ /dev/null @@ -1,15 +0,0 @@ -{ - "id": "183", - "title": "Find PMF Normalization Constant", - "difficulty": "easy", - "category": "Probability & Statistics", - "video": "", - "likes": "0", - "dislikes": "0", - "contributor": [ - { - "profile_link": "https://github.com/jeetmukherjee", - "name": "jeetmukherjee" - } - ] -} diff --git a/questions/183_pmf_normalization_constant 2/solution.py b/questions/183_pmf_normalization_constant 2/solution.py deleted file mode 100644 index 5836d3de..00000000 --- a/questions/183_pmf_normalization_constant 2/solution.py +++ /dev/null @@ -1,14 +0,0 @@ -import math - -def find_k(): - """ - Solve 10*K^2 + 9*K - 1 = 0 and return the non-negative root. - """ - a = 10.0 - b = 9.0 - c = -1.0 - discriminant = b * b - 4 * a * c - sqrt_disc = math.sqrt(discriminant) - k1 = (-b + sqrt_disc) / (2 * a) - k2 = (-b - sqrt_disc) / (2 * a) - return k1 if k1 >= 0 else k2 diff --git a/questions/183_pmf_normalization_constant 2/starter_code.py b/questions/183_pmf_normalization_constant 2/starter_code.py deleted file mode 100644 index 0df463f9..00000000 --- a/questions/183_pmf_normalization_constant 2/starter_code.py +++ /dev/null @@ -1,6 +0,0 @@ -def find_k(): - """ - Return the normalization constant K for the given PMF as a float. - """ - # TODO: Solve for K from 10*K**2 + 9*K - 1 = 0 and return the non-negative root - pass diff --git a/questions/183_pmf_normalization_constant 2/tests.json b/questions/183_pmf_normalization_constant 2/tests.json deleted file mode 100644 index d6cb3f10..00000000 --- a/questions/183_pmf_normalization_constant 2/tests.json +++ /dev/null @@ -1,10 +0,0 @@ -[ - { - "test": "print(round(find_k(), 10))", - "expected_output": "0.1" - }, - { - "test": "k = find_k(); s = 0 + k + 2*k + 2*k + 3*k + k**2 + 2*k**2 + (7*k**2 + k); print(round(s, 10))", - "expected_output": "1.0" - } -] diff --git a/questions/184_mlops-etl-pipeline/description.md b/questions/184_mlops-etl-pipeline/description.md deleted file mode 100644 index 5adfa2df..00000000 --- a/questions/184_mlops-etl-pipeline/description.md +++ /dev/null @@ -1,14 +0,0 @@ -## Problem - -Implement a simple ETL (Extract-Transform-Load) pipeline for model-ready data preparation. 
- -Given a CSV-like string containing user events with columns: `user_id,event_type,value` (header included), write a function `run_etl(csv_text)` that: - -1. Extracts rows from the raw CSV text. -2. Transforms data by: - - Filtering only rows where `event_type == "purchase"`. - - Converting `value` to float and dropping invalid rows. - - Aggregating total purchase `value` per `user_id`. -3. Loads the transformed results by returning a list of `(user_id, total_value)` tuples sorted by `user_id` ascending. - -Assume small inputs (no external libs), handle extra whitespace, and ignore blank lines. diff --git a/questions/184_mlops-etl-pipeline/example.json b/questions/184_mlops-etl-pipeline/example.json deleted file mode 100644 index 84952417..00000000 --- a/questions/184_mlops-etl-pipeline/example.json +++ /dev/null @@ -1,5 +0,0 @@ -{ - "input": "run_etl(\"user_id,event_type,value\\n u1, purchase, 10.0\\n u2, view, 1.0\\n u1, purchase, 5\\n u3, purchase, not_a_number\\n u2, purchase, 3.5 \\n\\n\")", - "output": "[('u1', 15.0), ('u2', 3.5)]", - "reasoning": "Keep only purchases; convert values; drop invalid; aggregate per user; sort by user_id." -} diff --git a/questions/184_mlops-etl-pipeline/learn.md b/questions/184_mlops-etl-pipeline/learn.md deleted file mode 100644 index d523e6a1..00000000 --- a/questions/184_mlops-etl-pipeline/learn.md +++ /dev/null @@ -1,24 +0,0 @@ -## Solution Explanation - -This task mirrors a minimal MLOps ETL flow that prepares data for downstream modeling. - -### ETL breakdown -- Extract: parse raw CSV text, ignore blanks, and split into header and rows. -- Transform: - - Filter only relevant records (event_type == "purchase"). - - Cast `value` to float; discard invalid rows to maintain data quality. - - Aggregate total purchase value per user to create compact features. -- Load: return a deterministic, sorted list of `(user_id, total_value)`. - -### Why this design? -- Input sanitation prevents runtime errors and poor-quality features. -- Aggregation compresses event-level logs into user-level features commonly used in models. -- Sorting produces stable, testable outputs. - -### Complexity -- For N rows, parsing and aggregation run in O(N); sorting unique users U costs O(U log U). - -### Extensions -- Add schema validation and logging. -- Write outputs to files or databases. -- Schedule ETL runs and add monitoring for drift and freshness. diff --git a/questions/184_mlops-etl-pipeline/meta.json b/questions/184_mlops-etl-pipeline/meta.json deleted file mode 100644 index 83466df6..00000000 --- a/questions/184_mlops-etl-pipeline/meta.json +++ /dev/null @@ -1,12 +0,0 @@ -{ - "id": "184", - "title": "Build a Simple ETL Pipeline (MLOps)", - "difficulty": "medium", - "category": "MLOps", - "video": "", - "likes": "0", - "dislikes": "0", - "contributor": [ - { "profile_link": "https://github.com/Jeet009", "name": "Jeet Mukherjee" } - ] -} diff --git a/questions/184_mlops-etl-pipeline/solution.py b/questions/184_mlops-etl-pipeline/solution.py deleted file mode 100644 index 19b9a275..00000000 --- a/questions/184_mlops-etl-pipeline/solution.py +++ /dev/null @@ -1,43 +0,0 @@ -from typing import List, Tuple - - -def run_etl(csv_text: str) -> List[Tuple[str, float]]: - """Reference ETL implementation. 
- - - Extract: parse CSV text, skip header, strip whitespace, ignore blanks - - Transform: keep event_type == "purchase"; parse value as float; aggregate per user - - Load: return sorted list of (user_id, total_value) by user_id asc - """ - lines = [line.strip() for line in csv_text.splitlines() if line.strip()] - if not lines: - return [] - # header - header = lines[0] - rows = lines[1:] - - # indices from header (allow varying order and case) - headers = [h.strip().lower() for h in header.split(",")] - try: - idx_user = headers.index("user_id") - idx_event = headers.index("event_type") - idx_value = headers.index("value") - except ValueError: - # header missing required columns - return [] - - aggregates: dict[str, float] = {} - for row in rows: - parts = [c.strip() for c in row.split(",")] - if len(parts) <= max(idx_user, idx_event, idx_value): - continue - user_id = parts[idx_user] - event_type = parts[idx_event].lower() - if event_type != "purchase": - continue - try: - value = float(parts[idx_value]) - except ValueError: - continue - aggregates[user_id] = aggregates.get(user_id, 0.0) + value - - return sorted(aggregates.items(), key=lambda kv: kv[0]) diff --git a/questions/184_mlops-etl-pipeline/starter_code.py b/questions/184_mlops-etl-pipeline/starter_code.py deleted file mode 100644 index 65002026..00000000 --- a/questions/184_mlops-etl-pipeline/starter_code.py +++ /dev/null @@ -1,9 +0,0 @@ -# Implement your function below. - -def run_etl(csv_text: str) -> list[tuple[str, float]]: - """Run a simple ETL pipeline over CSV text with header user_id,event_type,value. - - Returns a sorted list of (user_id, total_value) for event_type == "purchase". - """ - # TODO: implement extract, transform, and load steps - raise NotImplementedError diff --git a/questions/184_mlops-etl-pipeline/tests.json b/questions/184_mlops-etl-pipeline/tests.json deleted file mode 100644 index 379e5d8e..00000000 --- a/questions/184_mlops-etl-pipeline/tests.json +++ /dev/null @@ -1,14 +0,0 @@ -[ - { - "test": "from solution import run_etl; print(run_etl('user_id,event_type,value\n u1, purchase, 10.0\n u2, view, 1.0\n u1, purchase, 5\n u3, purchase, not_a_number\n u2, purchase, 3.5 \n'))", - "expected_output": "[('u1', 15.0), ('u2', 3.5)]" - }, - { - "test": "from solution import run_etl; print(run_etl('user_id,event_type,value\n'))", - "expected_output": "[]" - }, - { - "test": "from solution import run_etl; print(run_etl('value,event_type,user_id\n 1.0, purchase, u1\n 2.0, purchase, u1\n'))", - "expected_output": "[('u1', 3.0)]" - } -] From 406782dc38be5ebbc6a14dbc39271b8b26ce39cd Mon Sep 17 00:00:00 2001 From: Jeet Mukherjee Date: Sun, 12 Oct 2025 22:25:21 +0530 Subject: [PATCH 5/6] final commit --- .../description.md | 3 --- .../example.json | 5 ---- .../learn.md | 24 ------------------- .../meta.json | 15 ------------ .../solution.py | 14 ----------- .../starter_code.py | 7 ------ .../tests.json | 18 -------------- 7 files changed, 86 deletions(-) delete mode 100644 questions/182_empirical_probability_mass_function_(pmf)/description.md delete mode 100644 questions/182_empirical_probability_mass_function_(pmf)/example.json delete mode 100644 questions/182_empirical_probability_mass_function_(pmf)/learn.md delete mode 100644 questions/182_empirical_probability_mass_function_(pmf)/meta.json delete mode 100644 questions/182_empirical_probability_mass_function_(pmf)/solution.py delete mode 100644 questions/182_empirical_probability_mass_function_(pmf)/starter_code.py delete mode 100644 
questions/182_empirical_probability_mass_function_(pmf)/tests.json diff --git a/questions/182_empirical_probability_mass_function_(pmf)/description.md b/questions/182_empirical_probability_mass_function_(pmf)/description.md deleted file mode 100644 index 9a49f526..00000000 --- a/questions/182_empirical_probability_mass_function_(pmf)/description.md +++ /dev/null @@ -1,3 +0,0 @@ -## Problem - -Given a list of integer samples drawn from a discrete distribution, implement a function to compute the empirical Probability Mass Function (PMF). The function should return a list of `(value, probability)` pairs sorted by the value in ascending order. If the input is empty, return an empty list. diff --git a/questions/182_empirical_probability_mass_function_(pmf)/example.json b/questions/182_empirical_probability_mass_function_(pmf)/example.json deleted file mode 100644 index a00d5e97..00000000 --- a/questions/182_empirical_probability_mass_function_(pmf)/example.json +++ /dev/null @@ -1,5 +0,0 @@ -{ - "input": "samples = [1, 2, 2, 3, 3, 3]", - "output": "[(1, 0.16666666666666666), (2, 0.3333333333333333), (3, 0.5)]", - "reasoning": "Counts are {1:1, 2:2, 3:3} over 6 samples, so probabilities are 1/6, 2/6, and 3/6 respectively, returned sorted by value." -} diff --git a/questions/182_empirical_probability_mass_function_(pmf)/learn.md b/questions/182_empirical_probability_mass_function_(pmf)/learn.md deleted file mode 100644 index 923306e3..00000000 --- a/questions/182_empirical_probability_mass_function_(pmf)/learn.md +++ /dev/null @@ -1,24 +0,0 @@ - -# Learn Section - -# Probability Mass Function (PMF) — Simple Explanation - -A **probability mass function (PMF)** describes how probabilities are assigned to the possible outcomes of a **discrete random variable**. - -- It tells you the chance of each specific outcome. -- Each probability is non-negative. -- The total of all probabilities adds up to 1. - -## Estimating from data -If the true probabilities are unknown, you can estimate them with an **empirical PMF**: -- Count how often each outcome appears. -- Divide by the total number of observations. - -## Example -Observed sequence: `1, 2, 2, 3, 3, 3` (6 outcomes total) -- “1” appears once → estimated probability = 1/6 -- “2” appears twice → estimated probability = 2/6 = 1/3 -- “3” appears three times → estimated probability = 3/6 = 1/2 - - - \ No newline at end of file diff --git a/questions/182_empirical_probability_mass_function_(pmf)/meta.json b/questions/182_empirical_probability_mass_function_(pmf)/meta.json deleted file mode 100644 index a5fc5556..00000000 --- a/questions/182_empirical_probability_mass_function_(pmf)/meta.json +++ /dev/null @@ -1,15 +0,0 @@ -{ - "id": "182", - "title": "Empirical Probability Mass Function (PMF)", - "difficulty": "easy", - "category": "Probability & Statistics", - "video": "", - "likes": "0", - "dislikes": "0", - "contributor": [ - { - "profile_link": "https://github.com/jeetmukherjee", - "name": "jeetmukherjee" - } - ] -} diff --git a/questions/182_empirical_probability_mass_function_(pmf)/solution.py b/questions/182_empirical_probability_mass_function_(pmf)/solution.py deleted file mode 100644 index b54775fe..00000000 --- a/questions/182_empirical_probability_mass_function_(pmf)/solution.py +++ /dev/null @@ -1,14 +0,0 @@ -from collections import Counter - -def empirical_pmf(samples): - """ - Given an iterable of integer samples, return a list of (value, probability) - pairs sorted by value ascending. 
- """ - samples = list(samples) - if not samples: - return [] - total = len(samples) - cnt = Counter(samples) - result = [(k, cnt[k] / total) for k in sorted(cnt.keys())] - return result \ No newline at end of file diff --git a/questions/182_empirical_probability_mass_function_(pmf)/starter_code.py b/questions/182_empirical_probability_mass_function_(pmf)/starter_code.py deleted file mode 100644 index 32b35c14..00000000 --- a/questions/182_empirical_probability_mass_function_(pmf)/starter_code.py +++ /dev/null @@ -1,7 +0,0 @@ -def empirical_pmf(samples): - """ - Given an iterable of integer samples, return a list of (value, probability) - pairs sorted by value ascending. - """ - # TODO: Implement the function - pass diff --git a/questions/182_empirical_probability_mass_function_(pmf)/tests.json b/questions/182_empirical_probability_mass_function_(pmf)/tests.json deleted file mode 100644 index d9cbb76b..00000000 --- a/questions/182_empirical_probability_mass_function_(pmf)/tests.json +++ /dev/null @@ -1,18 +0,0 @@ -[ - { - "test": "print(empirical_pmf([1, 2, 2, 3, 3, 3]))", - "expected_output": "[(1, 0.16666666666666666), (2, 0.3333333333333333), (3, 0.5)]" - }, - { - "test": "print(empirical_pmf([5, 5, 5, 5]))", - "expected_output": "[(5, 1.0)]" - }, - { - "test": "print(empirical_pmf([]))", - "expected_output": "[]" - }, - { - "test": "print(empirical_pmf([0, 0, 1, 1, 1, 2]))", - "expected_output": "[(0, 0.3333333333333333), (1, 0.5), (2, 0.16666666666666666)]" - } -] From 76131c649be89cfb76512f76b99411234b951960 Mon Sep 17 00:00:00 2001 From: Jeet Mukherjee Date: Thu, 16 Oct 2025 18:07:47 +0530 Subject: [PATCH 6/6] fixed test cases --- questions/185_data-drift-basic/tests.json | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/questions/185_data-drift-basic/tests.json b/questions/185_data-drift-basic/tests.json index cd22070d..744583cd 100644 --- a/questions/185_data-drift-basic/tests.json +++ b/questions/185_data-drift-basic/tests.json @@ -1,5 +1,5 @@ [ - { "test": "from solution import check_drift; print(check_drift([1,2,3], [1.1,2.2,3.3], 0.05, 0.1))", "expected_output": "(True, True)" }, - { "test": "from solution import check_drift; print(check_drift([0,0,0], [0,0,0], 0.01, 0.01))", "expected_output": "(False, False)" }, - { "test": "from solution import check_drift; print(check_drift([], [1,2,3], 0.01, 0.01))", "expected_output": "(False, False)" } + { "test": "check_drift([1,2,3], [1.1,2.2,3.3], 0.05, 0.1)", "expected_output": "(True, True)" }, + { "test": "check_drift([0,0,0], [0,0,0], 0.01, 0.01)", "expected_output": "(False, False)" }, + { "test": "check_drift([], [1,2,3], 0.01, 0.01)", "expected_output": "(False, False)" } ]
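
For question 185, the (True, True) outcome quoted in example.json and tests.json can be re-derived by hand. The snippet below is a minimal standalone sketch, not part of the patch itself, that mirrors the population-variance convention used in questions/185_data-drift-basic/solution.py and reproduces the numbers cited in the example's reasoning:

    # Standalone sanity check for the question-185 worked example.
    # Helper names echo solution.py but are redefined here for illustration.

    def _mean(xs):
        return sum(xs) / len(xs)

    def _var(xs):
        # Population variance: divide by N, as the problem statement requires.
        m = _mean(xs)
        return sum((x - m) ** 2 for x in xs) / len(xs)

    ref, cur = [1, 2, 3], [1.1, 2.2, 3.3]
    mean_diff = abs(_mean(ref) - _mean(cur))   # 0.2, exceeds mean_threshold = 0.05
    var_diff = abs(_var(ref) - _var(cur))      # about 0.14, exceeds var_threshold = 0.1
    print(mean_diff > 0.05, var_diff > 0.1)    # prints: True True

The mean gap of 0.2 clears the 0.05 threshold and the variance gap of roughly 0.14 clears the 0.1 threshold, so check_drift([1,2,3], [1.1,2.2,3.3], 0.05, 0.1) evaluates to (True, True), matching the expected output in the updated tests.json.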