#Installing the libraries with the specified versions
!pip install numpy==2.0.2 pandas==2.2.2 scikit-learn==1.6.1 matplotlib==3.10.0 seaborn==0.13.2 joblib==1.4.2 xgboost==2.1.4 requests==2.32.3 huggingface_hub==0.30.1 -q

import warnings
warnings.filterwarnings("ignore")         # Suppress warnings for cleaner output

# -------------------- Data Handling --------------------
import numpy as np                        # Numerical operations
import pandas as pd                       # Data manipulation and analysis

# -------------------- Visualization --------------------
import matplotlib.pyplot as plt           # Basic plotting
import seaborn as sns                     # Statistical visualization

# -------------------- Statistics --------------------
from scipy.stats import pearsonr          # For calculating Pearson correlation

# -------------------- Model Development --------------------
from sklearn.model_selection import train_test_split       # Train-test split
from sklearn.ensemble import RandomForestRegressor         # Random Forest model
from xgboost import XGBRegressor                           # XGBoost model
from sklearn.tree import DecisionTreeRegressor             # Decision tree model

# -------------------- Evaluation Metrics --------------------
from sklearn.metrics import (
    mean_squared_error,              # RMSE
    mean_absolute_error,             # MAE
    r2_score,                        # R-squared
    mean_absolute_percentage_error   # MAPE
)
from sklearn import metrics          # For scoring in model selection

# -------------------- Preprocessing & Pipeline --------------------
from sklearn.compose import make_column_transformer              # Column-wise transformations
from sklearn.pipeline import make_pipeline, Pipeline             # Pipeline creation
from sklearn.preprocessing import StandardScaler, OneHotEncoder  # Preprocessing tools

# -------------------- Model Tuning --------------------
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV  # Hyperparameter tuning

# -------------------- Deployment --------------------
from flask import Flask, request, jsonify  # Flask API for backend deployment
import joblib                              # Model serialization
import os                                  # Directory management
import requests                            # For frontend-backend API calls
import time                                # Timing operations

# -------------------- Hugging Face Deployment --------------------
from huggingface_hub import login, HfApi   # Hugging Face authentication & uploads

# -------------------- Google Colab Utility --------------------
from google.colab import files             # Downloading files from Colab

# Mount Google Drive at /content/drive/ so files under 'My Drive' can be read by path
from google.colab import drive
drive.mount('/content/drive/')

Mounted at /content/drive/

# Load the dataset from Google Drive (adjust path if needed)
data_path = "/content/drive/My Drive/Colab Notebooks/Project 7/SuperKart.csv"
kart_data = pd.read_csv(data_path)

# Work on a copy so that later in-place edits never touch the raw data held in kart_data
dataset = kart_data.copy()

# Display the first 5 rows of the dataset
dataset.head()

# Display the last 5 rows of the dataset
dataset.tail()

# Print the number of rows and columns in the dataset
print(f"The dataset contains {dataset.shape[0]} rows and {dataset.shape[1]} columns.")

The dataset contains 8763 rows and 12 columns.

# Display column names, data types, non-null counts, and memory usage
dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8763 entries, 0 to 8762
Data columns (total 12 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   Product_Id                 8763 non-null   object 
 1   Product_Weight             8763 non-null   float64
 2   Product_Sugar_Content      8763 non-null   object 
 3   Product_Allocated_Area     8763 non-null   float64
 4   Product_Type               8763 non-null   object 
 5   Product_MRP                8763 non-null   float64
 6   Store_Id                   8763 non-null   object 
 7   Store_Establishment_Year   8763 non-null   int64  
 8   Store_Size                 8763 non-null   object 
 9   Store_Location_City_Type   8763 non-null   object 
 10  Store_Type                 8763 non-null   object 
 11  Product_Store_Sales_Total  8763 non-null   float64
dtypes: float64(4), int64(1), object(7)
memory usage: 821.7+ KB

# Count missing values per column (one total per column)
dataset.isnull().sum()

# Count rows that are exact duplicates of an earlier row
duplicate_count = dataset.duplicated().sum()
print(f"Number of duplicate rows: {duplicate_count}")

Number of duplicate rows: 0

# Count the number of unique values in each column
dataset.nunique()

# List the distinct labels in Product_Sugar_Content to spot inconsistent spellings
dataset['Product_Sugar_Content'].unique()

array(['Low Sugar', 'Regular', 'No Sugar', 'reg'], dtype=object)

# Standardize inconsistent category labels: fold the 'Reg' / 'reg' variants into 'Regular'.
# Value counts are printed before and after so the merge can be verified by eye.
print("Before cleanup:", dataset['Product_Sugar_Content'].value_counts(dropna=False))
dataset['Product_Sugar_Content'] = dataset['Product_Sugar_Content'].replace({'Reg': 'Regular', 'reg': 'Regular'})
print("After cleanup:", dataset['Product_Sugar_Content'].value_counts(dropna=False))

Before cleanup: Product_Sugar_Content
Low Sugar    4885
Regular      2251
No Sugar     1519
reg           108
Name: count, dtype: int64
After cleanup: Product_Sugar_Content
Low Sugar    4885
Regular      2359
No Sugar     1519
Name: count, dtype: int64

# Collect the names of all object-dtype columns (treated as categorical here)
categorical_cols = dataset.select_dtypes(include='object').columns

# For each of those columns, print a header followed by its category frequencies
for col in categorical_cols:
    col_counts = dataset[col].value_counts()
    print(f"\nValue counts for '{col}':\n")
    print(col_counts)

Value counts for 'Product_Id':

Product_Id
FD306     1
FD6114    1
FD7839    1
FD5075    1
FD8233    1
         ..
FD1387    1
FD1231    1
FD5276    1
FD8553    1
FD6027    1
Name: count, Length: 8763, dtype: int64

Value counts for 'Product_Sugar_Content':

Product_Sugar_Content
Low Sugar    4885
Regular      2359
No Sugar     1519
Name: count, dtype: int64

Value counts for 'Product_Type':

Product_Type
Fruits and Vegetables    1249
Snack Foods              1149
Frozen Foods              811
Dairy                     796
Household                 740
Baking Goods              716
Canned                    677
Health and Hygiene        628
Meat                      618
Soft Drinks               519
Breads                    200
Hard Drinks               186
Others                    151
Starchy Foods             141
Breakfast                 106
Seafood                    76
Name: count, dtype: int64

Value counts for 'Store_Id':

Store_Id
OUT004    4676
OUT001    1586
OUT003    1349
OUT002    1152
Name: count, dtype: int64

Value counts for 'Store_Size':

Store_Size
Medium    6025
High      1586
Small     1152
Name: count, dtype: int64

Value counts for 'Store_Location_City_Type':

Store_Location_City_Type
Tier 2    6262
Tier 1    1349
Tier 3    1152
Name: count, dtype: int64

Value counts for 'Store_Type':

Store_Type
Supermarket Type2     4676
Supermarket Type1     1586
Departmental Store    1349
Food Mart             1152
Name: count, dtype: int64

# Display descriptive statistics for all columns (numeric and non-numeric)
# .T transposes the table to make it easier to read with column names as row headers.
dataset.describe(include="all").T

# Define the target variable for prediction
target = 'Product_Store_Sales_Total'

# Manually selected numerical features (excluding identifier columns)
numeric_features = [
    'Product_Weight',
    'Product_Allocated_Area',
    'Product_MRP',
    'Store_Establishment_Year'  # NOTE(review): a Store_Age transform was planned, but the visible code drops this column before modeling - confirm
]

# Manually selected categorical features based on domain knowledge
categorical_features = [
    'Product_Sugar_Content',
    'Product_Type',
    'Store_Size',
    'Store_Location_City_Type',
    'Store_Type'
]

# List of all numeric columns in the dataset, target included (for quick analysis)
cols_list = dataset.select_dtypes(include=['int64', 'float64']).columns.tolist()

# Print the selected and detected columns for verification
print("All numeric columns detected:", cols_list)
print("-" * 100)
print("Target variable:", target)
print("-" * 100)
print("Manually selected numeric features:", numeric_features)
print("-" * 100)
print("Categorical features:", categorical_features)

All numeric columns detected: ['Product_Weight', 'Product_Allocated_Area', 'Product_MRP', 'Store_Establishment_Year', 'Product_Store_Sales_Total']
----------------------------------------------------------------------------------------------------
Target variable: Product_Store_Sales_Total
----------------------------------------------------------------------------------------------------
Manually selected numeric features: ['Product_Weight', 'Product_Allocated_Area', 'Product_MRP', 'Store_Establishment_Year']
----------------------------------------------------------------------------------------------------
Categorical features: ['Product_Sugar_Content', 'Product_Type', 'Store_Size', 'Store_Location_City_Type', 'Store_Type']

# Utility function to plot histogram and boxplot for a given numerical column
def plot_univariate(column_name, df):
    """
    Draw a histogram (with KDE overlay) and a boxplot side by side for one column.

    Parameters:
    column_name (str): Name of the numerical column to plot
    df (pd.DataFrame): DataFrame expected to contain the column

    If the column is not present, a message is printed and nothing is plotted.
    """
    # Guard clause: bail out early on an unknown column name
    has_column = column_name in df.columns
    if not has_column:
        print(f"❌ Column '{column_name}' not found in the DataFrame.")
        return

    values = df[column_name]

    # One figure, two panels: distribution on the left, boxplot on the right
    fig, (hist_ax, box_ax) = plt.subplots(1, 2, figsize=(12, 4))
    fig.suptitle(f'Univariate Analysis: {column_name}', fontsize=16, fontweight='bold', y=1.05)

    # Left panel: histogram with kernel density estimate
    sns.histplot(values, kde=True, bins=30, color='skyblue', ax=hist_ax)
    hist_ax.set_title("Distribution")
    hist_ax.set_xlabel(column_name)

    # Right panel: boxplot to make outliers visible
    sns.boxplot(x=values, color='salmon', ax=box_ax)
    box_ax.set_title("Boxplot")
    box_ax.set_xlabel(column_name)

    plt.tight_layout()
    plt.show()

# Analyze the distribution and outliers of every numeric feature, one figure each,
# in the order listed in numeric_features: Product_Weight, Product_Allocated_Area,
# Product_MRP, Store_Establishment_Year
for numeric_col in numeric_features:
    plot_univariate(numeric_col, dataset)

# Define a utility function to graph categorical features using countplots
def plot_categorical_eda(data, col):
    """
    Show how often each category of a column occurs: first as a countplot
    (bars ordered from most to least frequent), then as printed value counts.

    Parameters:
    - data (pd.DataFrame): The dataset
    - col (str): Column name of the categorical feature
    """
    # Frequencies, most common first; reused for the bar order and the printout
    frequencies = data[col].value_counts()

    plt.figure(figsize=(10, 5))
    sns.countplot(data=data, x=col, order=frequencies.index)
    plt.title(f"Category Counts for '{col}'", fontsize=14)
    plt.xlabel(col)
    plt.ylabel("Count")
    plt.xticks(rotation=45)
    plt.tight_layout()
    plt.show()

    # Raw counts underneath the figure
    print(f"\nValue Counts for '{col}':")
    print(frequencies)

# Countplot and value counts for 'Product_Sugar_Content'
plot_categorical_eda(dataset, 'Product_Sugar_Content')

Value Counts for 'Product_Sugar_Content':
Product_Sugar_Content
Low Sugar    4885
Regular      2359
No Sugar     1519
Name: count, dtype: int64

# Countplot and value counts for 'Product_Type'
plot_categorical_eda(dataset, 'Product_Type')

Value Counts for 'Product_Type':
Product_Type
Fruits and Vegetables    1249
Snack Foods              1149
Frozen Foods              811
Dairy                     796
Household                 740
Baking Goods              716
Canned                    677
Health and Hygiene        628
Meat                      618
Soft Drinks               519
Breads                    200
Hard Drinks               186
Others                    151
Starchy Foods             141
Breakfast                 106
Seafood                    76
Name: count, dtype: int64

# Countplot and value counts for 'Store_Size'
plot_categorical_eda(dataset, 'Store_Size')

Value Counts for 'Store_Size':
Store_Size
Medium    6025
High      1586
Small     1152
Name: count, dtype: int64

# Countplot and value counts for 'Store_Location_City_Type'
plot_categorical_eda(dataset, 'Store_Location_City_Type')

Value Counts for 'Store_Location_City_Type':
Store_Location_City_Type
Tier 2    6262
Tier 1    1349
Tier 3    1152
Name: count, dtype: int64

# Countplot and value counts for 'Store_Type'
plot_categorical_eda(dataset, 'Store_Type')

Value Counts for 'Store_Type':
Store_Type
Supermarket Type2     4676
Supermarket Type1     1586
Departmental Store    1349
Food Mart             1152
Name: count, dtype: int64

# Plot the correlation matrix of all numeric columns, with rows and columns
# ordered by their correlation with the target `Product_Store_Sales_Total`
target = 'Product_Store_Sales_Total'
corr_matrix = dataset[cols_list].corr()

# Rank every numeric column by its correlation with the target (strongest first)
sorted_corr = corr_matrix[[target]].sort_values(by=target, ascending=False)

# Keep all numeric columns, in ranked order, as the heatmap axes
top_features = list(sorted_corr.index)
ranked_corr = corr_matrix.loc[top_features, top_features]

heatmap_style = dict(
    annot=True, fmt=".2f", cmap="Blues", vmin=-1, vmax=1, linewidths=0.5, linecolor="gray"
)

plt.figure(figsize=(10, 6))
sns.heatmap(ranked_corr, **heatmap_style)
plt.title("Correlation Matrix (Sorted by Target)")
plt.show()

# Scatter plots of numeric features vs target
# Let's start with defining a utility function
def plot_scatter_vs_target(df, feature, target='Product_Store_Sales_Total'):
    """
    Scatter one numeric feature against the target and report their Pearson
    correlation coefficient in the plot title.

    Parameters:
    df (DataFrame): The dataset
    feature (str): The numeric column to plot against the target
    target (str): The target variable (default is 'Product_Store_Sales_Total')
    """
    # Keep only rows where both values are present so the correlation is defined
    pairs = df[[feature, target]].dropna()

    # First element of the pearsonr result is the coefficient; the p-value is not used
    corr_coef = pearsonr(pairs[feature], pairs[target])[0]

    plt.figure(figsize=(6, 4))
    sns.scatterplot(data=pairs, x=feature, y=target, alpha=0.6)
    plt.title(f'{feature} vs {target}\n(Pearson r = {corr_coef:.2f})')
    plt.xlabel(feature)
    plt.ylabel(target)
    plt.grid(True)
    plt.tight_layout()
    plt.show()

# Display a scatter plot of the target against every numeric feature, in the order
# listed in numeric_features: Product_Weight, Product_Allocated_Area, Product_MRP,
# Store_Establishment_Year
for numeric_col in numeric_features:
    plot_scatter_vs_target(dataset, numeric_col)

# Define a utility function to graph a categorical feature against the target
# using a barplot of means followed by a boxplot of distributions.
def plot_categorical_insights(df, cat_col, target='Product_Store_Sales_Total'):
    """
    Display a barplot of the mean target per category, followed by a boxplot of
    the target's distribution per category.

    Both figures list the categories in the same order (descending mean target),
    so a bar and its box can be compared position by position.

    Parameters:
    - df (DataFrame): The dataset
    - cat_col (str): The name of the categorical column
    - target (str): The target variable to analyze
    """
    # Mean target per category, highest first; this also fixes the category order
    mean_values = df.groupby(cat_col)[target].mean().sort_values(ascending=False)
    category_order = mean_values.index.tolist()

    # Barplot (Mean Aggregation)
    plt.figure(figsize=(10, 5))
    sns.barplot(x=mean_values.index, y=mean_values.values)
    plt.title(f'Average {target} by {cat_col} (Barplot)')
    plt.ylabel(f'Mean {target}')
    plt.xticks(rotation=45)
    plt.tight_layout()
    plt.show()

    # Spacer
    print("\n")  # adds a blank line in output

    # Boxplot, using the same category order as the barplot above
    plt.figure(figsize=(10, 5))
    sns.boxplot(x=cat_col, y=target, data=df, order=category_order)
    plt.title(f'Distribution of {target} by {cat_col} (Boxplot)')
    plt.xticks(rotation=45)
    plt.tight_layout()
    plt.show()

# Display the barplot and boxplot of the target against each categorical column of interest
insight_columns = (
    'Store_Type',
    'Store_Size',
    'Store_Location_City_Type',
    'Product_Type',
    'Product_Sugar_Content',
    'Store_Id',
)
for insight_col in insight_columns:
    plot_categorical_insights(dataset, insight_col)

# Per-store summary: store size, store type, and mean sales per row,
# sorted by mean sales (highest first)
store_avg_info = (
    dataset.groupby('Store_Id')
    .agg(
        Store_Size=('Store_Size', 'first'),
        Store_Type=('Store_Type', 'first'),
        Product_Store_Sales_Total=('Product_Store_Sales_Total', 'mean'),
    )
    .reset_index()
    .sort_values(by='Product_Store_Sales_Total', ascending=False)
)

# Render the mean sales as whole numbers with thousands separators.
# After this step the column holds strings, so it is for display only.
store_avg_info['Product_Store_Sales_Total'] = store_avg_info['Product_Store_Sales_Total'].map(lambda x: f"{x:,.0f}")

# Show the DataFrame as a rendered table
from IPython.display import display
display(store_avg_info)

# Table of summed sales revenue with Product Type as rows and Store as columns;
# combinations with no sales are filled with 0
pivot_sales_revenue = pd.pivot_table(
    dataset,
    values='Product_Store_Sales_Total',
    index='Product_Type',
    columns='Store_Id',
    aggfunc='sum',
    fill_value=0,
)

# Sort rows in descending order using the store columns, left to right, as sort keys
store_columns = list(pivot_sales_revenue.columns)
pivot_sales_revenue = pivot_sales_revenue.sort_values(by=store_columns, ascending=False)
display(pivot_sales_revenue)

# Heatmap of summed sales revenue by Product Type (rows) and Store (columns)

# Build the Product_Type x Store_Id table of summed sales (0 where there are no sales)
pivot_table = pd.pivot_table(
    dataset,
    values='Product_Store_Sales_Total',
    index='Product_Type',
    columns='Store_Id',
    aggfunc='sum',
    fill_value=0,
)

# Draw the heatmap with each cell annotated by its rounded total
plt.figure(figsize=(14, 10))
sns.heatmap(
    pivot_table,
    annot=True,
    fmt=".0f",
    cmap="YlGnBu",
    linewidths=0.5,
    linecolor='gray',
)
plt.title("Heatmap of Total Sales by Product Type and Store")
plt.xlabel("Store ID")
plt.ylabel("Product Type")
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()

# Define a utility function to output each store's data.
# Create barplot of Total Sales by Product Type
def explore_store(store_id):
    """
    Print and plot a profile of a single store taken from the global `dataset`.

    Shows, in order: descriptive statistics for the store's rows, the number of
    rows per product type, the summed sales per product type (highest first),
    and a horizontal barplot of summed sales per product type.

    Parameters:
    - store_id (str): Value of the 'Store_Id' column identifying the store
    """
    divider = "-" * 100
    rows_for_store = dataset[dataset["Store_Id"] == store_id]

    # Overall summary table for the store
    print(f"🔍 Summary for Store: {store_id}")
    summary = rows_for_store.describe(include="all").T
    display(summary)

    print(divider)

    # How many rows the store has for each product type
    print("\n📊 Product Type Distribution:")
    type_counts = rows_for_store["Product_Type"].value_counts()
    display(type_counts)

    print(divider)

    # Summed sales per product type, largest first
    print("\n📈 Revenue Distribution by Product Type:")
    sales_by_type = rows_for_store.groupby("Product_Type")["Product_Store_Sales_Total"].sum()
    display(sales_by_type.sort_values(ascending=False))

    print(divider)

    # Horizontal barplot of summed sales per product type
    plt.figure(figsize=(10, 4))
    sns.barplot(
        data=rows_for_store,
        x="Product_Store_Sales_Total",
        y="Product_Type",
        estimator=sum,
        errorbar=None,
    )
    plt.title(f"Total Sales by Product Type for {store_id}")
    plt.tight_layout()
    plt.show()

# Profile store OUT001: summary table, product-type counts, sales by product type, barplot
explore_store("OUT001")

🔍 Summary for Store: OUT001

----------------------------------------------------------------------------------------------------

📊 Product Type Distribution:

----------------------------------------------------------------------------------------------------

📈 Revenue Distribution by Product Type:

----------------------------------------------------------------------------------------------------

# Profile store OUT002: summary table, product-type counts, sales by product type, barplot
explore_store("OUT002")

🔍 Summary for Store: OUT002

----------------------------------------------------------------------------------------------------

📊 Product Type Distribution:

----------------------------------------------------------------------------------------------------

📈 Revenue Distribution by Product Type:

----------------------------------------------------------------------------------------------------

# Profile store OUT003: summary table, product-type counts, sales by product type, barplot
explore_store("OUT003")

🔍 Summary for Store: OUT003

----------------------------------------------------------------------------------------------------

📊 Product Type Distribution:

----------------------------------------------------------------------------------------------------

📈 Revenue Distribution by Product Type:

----------------------------------------------------------------------------------------------------

# Profile store OUT004: summary table, product-type counts, sales by product type, barplot
explore_store("OUT004")

🔍 Summary for Store: OUT004

----------------------------------------------------------------------------------------------------

📊 Product Type Distribution:

----------------------------------------------------------------------------------------------------

📈 Revenue Distribution by Product Type:

----------------------------------------------------------------------------------------------------

# For every selected categorical feature, print each category's share of rows
# as a percentage (rounded to one decimal place)
for col in categorical_features:
    print(f"\nColumn: {col}")
    category_share = dataset[col].value_counts(normalize=True)
    print(category_share.round(3) * 100)

Column: Product_Sugar_Content
Product_Sugar_Content
Low Sugar    55.7
Regular      26.9
No Sugar     17.3
Name: proportion, dtype: float64

Column: Product_Type
Product_Type
Fruits and Vegetables    14.3
Snack Foods              13.1
Frozen Foods              9.3
Dairy                     9.1
Household                 8.4
Baking Goods              8.2
Canned                    7.7
Health and Hygiene        7.2
Meat                      7.1
Soft Drinks               5.9
Breads                    2.3
Hard Drinks               2.1
Others                    1.7
Starchy Foods             1.6
Breakfast                 1.2
Seafood                   0.9
Name: proportion, dtype: float64

Column: Store_Size
Store_Size
Medium    68.8
High      18.1
Small     13.1
Name: proportion, dtype: float64

Column: Store_Location_City_Type
Store_Location_City_Type
Tier 2    71.5
Tier 1    15.4
Tier 3    13.1
Name: proportion, dtype: float64

Column: Store_Type
Store_Type
Supermarket Type2     53.4
Supermarket Type1     18.1
Departmental Store    15.4
Food Mart             13.1
Name: proportion, dtype: float64

# Count IQR-rule outliers for every numeric feature
for col in numeric_features:
    # Quartiles (25th and 75th percentile) and the interquartile range between them
    q1, q3 = dataset[col].quantile(0.25), dataset[col].quantile(0.75)
    iqr = q3 - q1

    # Rows lying more than 1.5 * IQR below Q1 or above Q3
    too_low = dataset[col] < q1 - 1.5 * iqr
    too_high = dataset[col] > q3 + 1.5 * iqr
    outliers = dataset[too_low | too_high]

    # Report how many rows were flagged
    print(f"{col}: {len(outliers)} outliers")

Product_Weight: 54 outliers
Product_Allocated_Area: 104 outliers
Product_MRP: 57 outliers
Store_Establishment_Year: 0 outliers

# Re-check for missing values and report only the columns that have any
missing_data = dataset.isnull().sum()
missing_data = missing_data.loc[missing_data > 0]
if missing_data.empty:
    print("No missing data found.")
else:
    print(missing_data)

No missing data found.

# Check skewness for the numeric features and the target, most positively skewed first
# Values > 1 or < -1 are considered highly skewed
dataset[numeric_features + [target]].skew().sort_values(ascending=False)

# Log-transform 'Product_Allocated_Area' to reduce right skewness.
# log1p is used instead of log to handle zero values safely.
# The guard makes this cell safe to re-run: once the original column has been
# replaced, running it again is a no-op instead of raising a KeyError.
if 'Product_Allocated_Area' in dataset.columns:
    # pop() removes the original skewed column and hands it to log1p in one step;
    # the log-transformed column is appended at the end of the DataFrame
    dataset['Product_Allocated_Area_Log'] = np.log1p(dataset.pop('Product_Allocated_Area'))

# Display remaining columns to confirm changes
print("Current columns in dataset:")
print(dataset.columns)

Current columns in dataset:
Index(['Product_Id', 'Product_Weight', 'Product_Sugar_Content', 'Product_Type',
       'Product_MRP', 'Store_Id', 'Store_Establishment_Year', 'Store_Size',
       'Store_Location_City_Type', 'Store_Type', 'Product_Store_Sales_Total',
       'Product_Allocated_Area_Log'],
      dtype='object')

# Collapse Product_Type into two groups: Perishables vs. Non Perishables
perishable_types = [
    'Dairy', 'Fruits and Vegetables', 'Meat', 'Breads', 'Seafood'
]

# Flag rows whose product type is perishable, then turn the flag into the
# label stored in the new 'Product_Type_Category' column
dataset['Product_Type_Category'] = (
    dataset['Product_Type']
    .isin(perishable_types)
    .map({True: 'Perishables', False: 'Non Perishables'})
)

# Remove the original high-cardinality column in place
del dataset['Product_Type']

# Confirm the transformation
print(dataset['Product_Type_Category'].value_counts())

Product_Type_Category
Non Perishables    5824
Perishables        2939
Name: count, dtype: int64

# Re-display the first 5 rows of the dataset after the transformations above
dataset.head()

# Build the modeling frame by dropping the two identifier columns and
# Store_Establishment_Year ('Product_Type' was already dropped earlier).
# `dataset` itself is left unchanged; the result is a new DataFrame.
data = dataset.drop(
    ["Product_Id", "Store_Id", "Store_Establishment_Year"],
    axis=1,
    errors='ignore'  # ignores columns that aren't found
)

# Print the number of rows and columns
print("Shape of the data:", data.shape)
print("-" * 100)  # Print a divider line

# Display the first 5 rows of the data
data.head()

Shape of the data: (8763, 9)
----------------------------------------------------------------------------------------------------

# Separate features and target variable
X = data.drop("Product_Store_Sales_Total", axis=1)  # Feature set
y = data["Product_Store_Sales_Total"]               # Target variable

# Split the dataset into training and testing sets (80% train, 20% test).
# Rows are shuffled before splitting; random_state fixes the shuffle so the split is repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, shuffle=True
)

# Check the shape of the splits
print(f"X_train shape: {X_train.shape}")
print(f"X_test shape: {X_test.shape}")
print(f"y_train shape: {y_train.shape}")
print(f"y_test shape: {y_test.shape}")

X_train shape: (7010, 8)
X_test shape: (1753, 8)
y_train shape: (7010,)
y_test shape: (1753,)

# Automatically detect categorical features by dtype (object or category).
# This replaces the manually written categorical_features list defined earlier.
# Detection runs on `data`, which still contains the target column.
categorical_features = data.select_dtypes(include=['object', 'category']).columns.tolist()

# Display detected categorical features
print("Categorical Features:", categorical_features)

Categorical Features: ['Product_Sugar_Content', 'Store_Size', 'Store_Location_City_Type', 'Store_Type', 'Product_Type_Category']

# Encoding step for the categorical columns: a one-step pipeline holding a
# OneHotEncoder configured with handle_unknown='ignore' and dense output
one_hot_steps = Pipeline(steps=[
    ('encoder', OneHotEncoder(handle_unknown='ignore', sparse_output=False)),
])

# Preprocessor: one-hot encode the categorical features and pass every other
# column (the numerical ones) through unchanged
preprocessor = make_column_transformer(
    (one_hot_steps, categorical_features),
    remainder='passthrough',
)

# function to compute adjusted R-squared
def adj_r2_score(predictors, targets, predictions):
    """
    Compute adjusted R-squared: 1 - (1 - R^2) * (n - 1) / (n - k - 1).

    predictors: 2-D feature table; only its shape is used
                (n = number of rows, k = number of columns as passed in)
    targets: true values of the dependent variable
    predictions: model predictions for the same rows

    Returns the adjusted R-squared as a float. When n - k - 1 <= 0 the
    statistic is undefined, so NaN is returned instead of raising
    ZeroDivisionError or reporting a meaningless number.
    """
    r2 = r2_score(targets, predictions)
    n = predictors.shape[0]
    k = predictors.shape[1]

    # Residual degrees of freedom; must be positive for the adjustment to make sense
    dof = n - k - 1
    if dof <= 0:
        return float("nan")

    return 1 - ((1 - r2) * (n - 1) / dof)


# function to compute different metrics to check performance of a regression model
def model_performance_regression(model, predictors, target):
    """
    Score a regressor on one dataset and return the metrics as a one-row table

    model: regressor (must expose predict)
    predictors: independent variables
    target: dependent variable

    Returns a DataFrame with the columns RMSE, MAE, R-squared,
    Adj. R-squared and MAPE.
    """
    # predictions for the supplied rows
    y_hat = model.predict(predictors)

    # one entry per metric; insertion order fixes the column order
    scores = {
        "RMSE": np.sqrt(mean_squared_error(target, y_hat)),
        "MAE": mean_absolute_error(target, y_hat),
        "R-squared": r2_score(target, y_hat),
        "Adj. R-squared": adj_r2_score(predictors, target, y_hat),
        "MAPE": mean_absolute_percentage_error(target, y_hat),
    }

    return pd.DataFrame(scores, index=[0])

# Baseline Random Forest with default hyperparameters (seeded for reproducibility)
rf_model = RandomForestRegressor(random_state=42)

# Chain the shared preprocessor and the regressor into a single estimator
rf_pipeline = make_pipeline(preprocessor, rf_model)

# Fit preprocessing and model together on the training split
rf_pipeline.fit(X=X_train, y=y_train)

Pipeline(steps=[('columntransformer',
                 ColumnTransformer(remainder='passthrough',
                                   transformers=[('pipeline',
                                                  Pipeline(steps=[('encoder',
                                                                   OneHotEncoder(handle_unknown='ignore',
                                                                                 sparse_output=False))]),
                                                  ['Product_Sugar_Content',
                                                   'Store_Size',
                                                   'Store_Location_City_Type',
                                                   'Store_Type',
                                                   'Product_Type_Category'])])),
                ('randomforestregressor',
                 RandomForestRegressor(random_state=42))])

Pipeline(steps=[('columntransformer',
                 ColumnTransformer(remainder='passthrough',
                                   transformers=[('pipeline',
                                                  Pipeline(steps=[('encoder',
                                                                   OneHotEncoder(handle_unknown='ignore',
                                                                                 sparse_output=False))]),
                                                  ['Product_Sugar_Content',
                                                   'Store_Size',
                                                   'Store_Location_City_Type',
                                                   'Store_Type',
                                                   'Product_Type_Category'])])),
                ('randomforestregressor',
                 RandomForestRegressor(random_state=42))])

ColumnTransformer(remainder='passthrough',
                  transformers=[('pipeline',
                                 Pipeline(steps=[('encoder',
                                                  OneHotEncoder(handle_unknown='ignore',
                                                                sparse_output=False))]),
                                 ['Product_Sugar_Content', 'Store_Size',
                                  'Store_Location_City_Type', 'Store_Type',
                                  'Product_Type_Category'])])

['Product_Sugar_Content', 'Store_Size', 'Store_Location_City_Type', 'Store_Type', 'Product_Type_Category']

OneHotEncoder(handle_unknown='ignore', sparse_output=False)

['Product_Weight', 'Product_MRP', 'Product_Allocated_Area_Log']

passthrough

RandomForestRegressor(random_state=42)

# Score the baseline Random Forest pipeline on the training split
rf_estimator_model_train_perf = model_performance_regression(
    model=rf_pipeline, predictors=X_train, target=y_train
)
print("Training performance \n")
rf_estimator_model_train_perf

Training performance

# Score the baseline Random Forest pipeline on the held-out test split
rf_estimator_model_test_perf = model_performance_regression(
    model=rf_pipeline, predictors=X_test, target=y_test
)
print("Testing performance \n")
rf_estimator_model_test_perf

Testing performance

# Start from an untuned regressor; the grid search below picks its hyperparameters
rf_tuned = RandomForestRegressor(random_state=42)

# Create pipeline with preprocessing and RandomForestRegressor model
rf_pipeline = make_pipeline(preprocessor, rf_tuned)

# Grid of parameters to choose from. The "randomforestregressor__" prefix routes
# each value to the pipeline step that make_pipeline named after the class.
parameters = {
    'randomforestregressor__max_depth': [3, 4, 5, 6],
    'randomforestregressor__max_features': ['sqrt', 'log2', None],
    'randomforestregressor__n_estimators': [50, 75, 100, 125, 150],
}

# Type of scoring used to compare parameter combinations
scorer = metrics.make_scorer(metrics.r2_score)

# Run the grid search with 5-fold cross-validation
grid_obj = GridSearchCV(rf_pipeline, parameters, scoring=scorer, cv=5)
grid_obj = grid_obj.fit(X_train, y_train)

# Keep the best pipeline. With the default refit=True, GridSearchCV has already
# refit it on the whole training set, so fitting it again would only repeat work.
rf_tuned = grid_obj.best_estimator_

# Display the tuned pipeline
rf_tuned

Pipeline(steps=[('columntransformer',
                 ColumnTransformer(remainder='passthrough',
                                   transformers=[('pipeline',
                                                  Pipeline(steps=[('encoder',
                                                                   OneHotEncoder(handle_unknown='ignore',
                                                                                 sparse_output=False))]),
                                                  ['Product_Sugar_Content',
                                                   'Store_Size',
                                                   'Store_Location_City_Type',
                                                   'Store_Type',
                                                   'Product_Type_Category'])])),
                ('randomforestregressor',
                 RandomForestRegressor(max_depth=6, max_features=None,
                                       n_estimators=150, random_state=42))])

Pipeline(steps=[('columntransformer',
                 ColumnTransformer(remainder='passthrough',
                                   transformers=[('pipeline',
                                                  Pipeline(steps=[('encoder',
                                                                   OneHotEncoder(handle_unknown='ignore',
                                                                                 sparse_output=False))]),
                                                  ['Product_Sugar_Content',
                                                   'Store_Size',
                                                   'Store_Location_City_Type',
                                                   'Store_Type',
                                                   'Product_Type_Category'])])),
                ('randomforestregressor',
                 RandomForestRegressor(max_depth=6, max_features=None,
                                       n_estimators=150, random_state=42))])

ColumnTransformer(remainder='passthrough',
                  transformers=[('pipeline',
                                 Pipeline(steps=[('encoder',
                                                  OneHotEncoder(handle_unknown='ignore',
                                                                sparse_output=False))]),
                                 ['Product_Sugar_Content', 'Store_Size',
                                  'Store_Location_City_Type', 'Store_Type',
                                  'Product_Type_Category'])])

['Product_Sugar_Content', 'Store_Size', 'Store_Location_City_Type', 'Store_Type', 'Product_Type_Category']

OneHotEncoder(handle_unknown='ignore', sparse_output=False)

['Product_Weight', 'Product_MRP', 'Product_Allocated_Area_Log']

passthrough

RandomForestRegressor(max_depth=6, max_features=None, n_estimators=150,
                      random_state=42)

# Score the tuned Random Forest pipeline on the training split
rf_tuned_model_train_perf = model_performance_regression(
    model=rf_tuned, predictors=X_train, target=y_train
)
print("Training performance \n")
rf_tuned_model_train_perf

Training performance

# Score the tuned Random Forest pipeline on the held-out test split
rf_tuned_model_test_perf = model_performance_regression(
    model=rf_tuned, predictors=X_test, target=y_test
)
print("Testing performance \n")
rf_tuned_model_test_perf

Testing performance

# Baseline XGBoost regressor with default hyperparameters (seeded for reproducibility)
xgb_model = XGBRegressor(random_state=42)

# Chain the shared preprocessor and the regressor into a single estimator
xgb_pipeline = make_pipeline(preprocessor, xgb_model)

# Fit preprocessing and model together on the training split
xgb_pipeline.fit(X=X_train, y=y_train)

Pipeline(steps=[('columntransformer',
                 ColumnTransformer(remainder='passthrough',
                                   transformers=[('pipeline',
                                                  Pipeline(steps=[('encoder',
                                                                   OneHotEncoder(handle_unknown='ignore',
                                                                                 sparse_output=False))]),
                                                  ['Product_Sugar_Content',
                                                   'Store_Size',
                                                   'Store_Location_City_Type',
                                                   'Store_Type',
                                                   'Product_Type_Category'])])),
                ('xgbregressor',
                 XGBRegressor(base_score=None, boos...
                              feature_types=None, gamma=None, grow_policy=None,
                              importance_type=None,
                              interaction_constraints=None, learning_rate=None,
                              max_bin=None, max_cat_threshold=None,
                              max_cat_to_onehot=None, max_delta_step=None,
                              max_depth=None, max_leaves=None,
                              min_child_weight=None, missing=nan,
                              monotone_constraints=None, multi_strategy=None,
                              n_estimators=None, n_jobs=None,
                              num_parallel_tree=None, random_state=42, ...))])

Pipeline(steps=[('columntransformer',
                 ColumnTransformer(remainder='passthrough',
                                   transformers=[('pipeline',
                                                  Pipeline(steps=[('encoder',
                                                                   OneHotEncoder(handle_unknown='ignore',
                                                                                 sparse_output=False))]),
                                                  ['Product_Sugar_Content',
                                                   'Store_Size',
                                                   'Store_Location_City_Type',
                                                   'Store_Type',
                                                   'Product_Type_Category'])])),
                ('xgbregressor',
                 XGBRegressor(base_score=None, boos...
                              feature_types=None, gamma=None, grow_policy=None,
                              importance_type=None,
                              interaction_constraints=None, learning_rate=None,
                              max_bin=None, max_cat_threshold=None,
                              max_cat_to_onehot=None, max_delta_step=None,
                              max_depth=None, max_leaves=None,
                              min_child_weight=None, missing=nan,
                              monotone_constraints=None, multi_strategy=None,
                              n_estimators=None, n_jobs=None,
                              num_parallel_tree=None, random_state=42, ...))])

ColumnTransformer(remainder='passthrough',
                  transformers=[('pipeline',
                                 Pipeline(steps=[('encoder',
                                                  OneHotEncoder(handle_unknown='ignore',
                                                                sparse_output=False))]),
                                 ['Product_Sugar_Content', 'Store_Size',
                                  'Store_Location_City_Type', 'Store_Type',
                                  'Product_Type_Category'])])

['Product_Sugar_Content', 'Store_Size', 'Store_Location_City_Type', 'Store_Type', 'Product_Type_Category']

OneHotEncoder(handle_unknown='ignore', sparse_output=False)

['Product_Weight', 'Product_MRP', 'Product_Allocated_Area_Log']

passthrough

XGBRegressor(base_score=None, booster=None, callbacks=None,
             colsample_bylevel=None, colsample_bynode=None,
             colsample_bytree=None, device=None, early_stopping_rounds=None,
             enable_categorical=False, eval_metric=None, feature_types=None,
             gamma=None, grow_policy=None, importance_type=None,
             interaction_constraints=None, learning_rate=None, max_bin=None,
             max_cat_threshold=None, max_cat_to_onehot=None,
             max_delta_step=None, max_depth=None, max_leaves=None,
             min_child_weight=None, missing=nan, monotone_constraints=None,
             multi_strategy=None, n_estimators=None, n_jobs=None,
             num_parallel_tree=None, random_state=42, ...)

# Score the baseline XGBoost pipeline on the training split
xgb_estimator_model_train_perf = model_performance_regression(
    model=xgb_pipeline, predictors=X_train, target=y_train
)
print("Training performance \n")
xgb_estimator_model_train_perf

Training performance

# Score the baseline XGBoost pipeline on the held-out test split
xgb_estimator_model_test_perf = model_performance_regression(
    model=xgb_pipeline, predictors=X_test, target=y_test
)
print("Testing performance \n")
xgb_estimator_model_test_perf

Testing performance

import time  # imported here so the timing code below does not rely on an earlier cell

from sklearn.model_selection import RandomizedSearchCV

# Start from an untuned regressor; the randomized search below picks its hyperparameters
xgb_tuned = XGBRegressor(random_state=42)

# Create pipeline with preprocessing and XGBoost model
xgb_pipeline = make_pipeline(preprocessor, xgb_tuned)

# Candidate values to sample from. The "xgbregressor__" prefix routes each value
# to the pipeline step that make_pipeline named after the class.
param_grid = {
    'xgbregressor__n_estimators': [50, 100, 150, 200],    # number of trees to build
    'xgbregressor__max_depth': [2, 3, 4],    # maximum depth of each tree
    'xgbregressor__colsample_bytree': [0.4, 0.5, 0.6],    # percentage of attributes to be considered (randomly) for each tree
    'xgbregressor__colsample_bylevel': [0.4, 0.5, 0.6],    # percentage of attributes to be considered (randomly) for each level of a tree
    'xgbregressor__learning_rate': [0.01, 0.05, 0.1],    # learning rate
    'xgbregressor__reg_lambda': [0.4, 0.5, 0.6],    # L2 regularization factor
}

# Type of scoring used to compare parameter combinations
scorer = metrics.make_scorer(metrics.r2_score)

# Randomized search: 100 sampled combinations, 3-fold CV, all CPU cores
grid_obj = RandomizedSearchCV(
    xgb_pipeline,
    param_distributions=param_grid,
    n_iter=100,
    scoring=scorer,
    cv=3,
    n_jobs=-1,
    random_state=42
)

# Start timing (perf_counter is monotonic, so the interval cannot be skewed by clock changes)
start = time.perf_counter()

# Fit the randomized search
grid_obj = grid_obj.fit(X_train, y_train)

# End timing
end = time.perf_counter()
print(f"Tuning completed in {end - start:.2f} seconds")

# Best hyperparameters
print("Best hyperparameters found:")
print(grid_obj.best_params_)
print("-" * 80)

# Keep the best pipeline. With the default refit=True, the search has already
# refit it on the whole training set, so fitting it again would only repeat work.
xgb_tuned = grid_obj.best_estimator_

# Display the tuned pipeline
xgb_tuned

Tuning completed in 21.09 seconds
Best hyperparameters found:
{'xgbregressor__reg_lambda': 0.5, 'xgbregressor__n_estimators': 150, 'xgbregressor__max_depth': 4, 'xgbregressor__learning_rate': 0.1, 'xgbregressor__colsample_bytree': 0.6, 'xgbregressor__colsample_bylevel': 0.6}
--------------------------------------------------------------------------------

Pipeline(steps=[('columntransformer',
                 ColumnTransformer(remainder='passthrough',
                                   transformers=[('pipeline',
                                                  Pipeline(steps=[('encoder',
                                                                   OneHotEncoder(handle_unknown='ignore',
                                                                                 sparse_output=False))]),
                                                  ['Product_Sugar_Content',
                                                   'Store_Size',
                                                   'Store_Location_City_Type',
                                                   'Store_Type',
                                                   'Product_Type_Category'])])),
                ('xgbregressor',
                 XGBRegressor(base_score=None, boos...
                              feature_types=None, gamma=None, grow_policy=None,
                              importance_type=None,
                              interaction_constraints=None, learning_rate=0.1,
                              max_bin=None, max_cat_threshold=None,
                              max_cat_to_onehot=None, max_delta_step=None,
                              max_depth=4, max_leaves=None,
                              min_child_weight=None, missing=nan,
                              monotone_constraints=None, multi_strategy=None,
                              n_estimators=150, n_jobs=None,
                              num_parallel_tree=None, random_state=42, ...))])

Pipeline(steps=[('columntransformer',
                 ColumnTransformer(remainder='passthrough',
                                   transformers=[('pipeline',
                                                  Pipeline(steps=[('encoder',
                                                                   OneHotEncoder(handle_unknown='ignore',
                                                                                 sparse_output=False))]),
                                                  ['Product_Sugar_Content',
                                                   'Store_Size',
                                                   'Store_Location_City_Type',
                                                   'Store_Type',
                                                   'Product_Type_Category'])])),
                ('xgbregressor',
                 XGBRegressor(base_score=None, boos...
                              feature_types=None, gamma=None, grow_policy=None,
                              importance_type=None,
                              interaction_constraints=None, learning_rate=0.1,
                              max_bin=None, max_cat_threshold=None,
                              max_cat_to_onehot=None, max_delta_step=None,
                              max_depth=4, max_leaves=None,
                              min_child_weight=None, missing=nan,
                              monotone_constraints=None, multi_strategy=None,
                              n_estimators=150, n_jobs=None,
                              num_parallel_tree=None, random_state=42, ...))])

ColumnTransformer(remainder='passthrough',
                  transformers=[('pipeline',
                                 Pipeline(steps=[('encoder',
                                                  OneHotEncoder(handle_unknown='ignore',
                                                                sparse_output=False))]),
                                 ['Product_Sugar_Content', 'Store_Size',
                                  'Store_Location_City_Type', 'Store_Type',
                                  'Product_Type_Category'])])

['Product_Sugar_Content', 'Store_Size', 'Store_Location_City_Type', 'Store_Type', 'Product_Type_Category']

OneHotEncoder(handle_unknown='ignore', sparse_output=False)

['Product_Weight', 'Product_MRP', 'Product_Allocated_Area_Log']

passthrough

XGBRegressor(base_score=None, booster=None, callbacks=None,
             colsample_bylevel=0.6, colsample_bynode=None, colsample_bytree=0.6,
             device=None, early_stopping_rounds=None, enable_categorical=False,
             eval_metric=None, feature_types=None, gamma=None, grow_policy=None,
             importance_type=None, interaction_constraints=None,
             learning_rate=0.1, max_bin=None, max_cat_threshold=None,
             max_cat_to_onehot=None, max_delta_step=None, max_depth=4,
             max_leaves=None, min_child_weight=None, missing=nan,
             monotone_constraints=None, multi_strategy=None, n_estimators=150,
             n_jobs=None, num_parallel_tree=None, random_state=42, ...)

# Score the tuned XGBoost pipeline on the training split
xgb_tuned_model_train_perf = model_performance_regression(
    model=xgb_tuned, predictors=X_train, target=y_train
)
print("Training performance \n")
xgb_tuned_model_train_perf

Training performance

# Score the tuned XGBoost pipeline on the held-out test split
xgb_tuned_model_test_perf = model_performance_regression(
    model=xgb_tuned, predictors=X_test, target=y_test
)
print("Testing performance \n")
xgb_tuned_model_test_perf

Testing performance

# training performance comparison

# Map each display name to its one-row metrics table (insertion order = column order)
train_perf_by_model = {
    "Random Forest Estimator": rf_estimator_model_train_perf,
    "Random Forest Tuned": rf_tuned_model_train_perf,
    "XGBoost": xgb_estimator_model_train_perf,
    "XGBoost Tuned": xgb_tuned_model_train_perf,
}

# Transpose each table so metrics become rows, then place the models side by side
models_train_comp_df = pd.concat(
    [perf.T for perf in train_perf_by_model.values()],
    axis=1,
)
models_train_comp_df.columns = list(train_perf_by_model)

print("Training performance comparison:")
models_train_comp_df

Training performance comparison:

# Testing performance comparison

# Map each display name to its one-row metrics table (insertion order = column order)
test_perf_by_model = {
    "Random Forest Estimator": rf_estimator_model_test_perf,
    "Random Forest Tuned": rf_tuned_model_test_perf,
    "XGBoost": xgb_estimator_model_test_perf,
    "XGBoost Tuned": xgb_tuned_model_test_perf,
}

# Transpose each table so metrics become rows, then place the models side by side
models_test_comp_df = pd.concat(
    [perf.T for perf in test_perf_by_model.values()],
    axis=1,
)
models_test_comp_df.columns = list(test_perf_by_model)

print("Testing performance comparison:")
models_test_comp_df

Testing performance comparison:

# Display the gap between the R-squared of Train vs. Test for each model.
# The row is selected by its label rather than by position so the result stays
# correct even if metrics are added or reordered in model_performance_regression.
(models_train_comp_df - models_test_comp_df).loc["R-squared"]

# Step 1: Fit the pipeline that is about to be serialized
# NOTE(review): rf_pipeline was rebuilt for the grid search, and GridSearchCV fits
# clones of the estimator it is given, so this call trains a Random Forest with
# default hyperparameters (not rf_tuned) — confirm this is the model intended
# for deployment.
rf_pipeline = rf_pipeline.fit(X_train, y_train)

# Step 2: Set Google Drive path for deployment-ready files
# (exist_ok=True makes re-running this cell safe)
drive_path = "/content/drive/My Drive/Colab Notebooks/Project 7/deployment_files"
os.makedirs(drive_path, exist_ok=True)

# Step 3: Define model filename and save path
model_filename = "superkart_sales_forecast_model_v1_0.joblib"
saved_model_path = os.path.join(drive_path, model_filename)

# Step 4: Save the trained model pipeline (preprocessing + regressor in one object)
joblib.dump(rf_pipeline, saved_model_path)
print(f"✅ Model saved successfully at: {saved_model_path}")

# Step 5: Download to local (optional)
files.download(saved_model_path)

# Step 6: Reload the model to test deserialization
loaded_model = joblib.load(saved_model_path)
print("✅ Model loaded successfully.")

# Step 7: Run a prediction test with the reloaded object and show the first five results
sample_preds = loaded_model.predict(X_test)
print("📈 Sample predictions from reloaded model:", sample_preds[:5])

✅ Model saved successfully at: /content/drive/My Drive/Colab Notebooks/Project 7/deployment_files/superkart_sales_forecast_model_v1_0.joblib

✅ Model loaded successfully.
📈 Sample predictions from reloaded model: [3426.8403 3365.5683 2444.6257 1899.3955 4706.1235]

# Create the backend app.py file
%%writefile "/content/drive/My Drive/Colab Notebooks/Project 7/deployment_files/backend_files/app.py"

import numpy as np
import pandas as pd
import joblib
from flask import Flask, request, jsonify
from flask_cors import CORS

# Initialize Flask app
superkart_api = Flask("superkart_sales_api")
CORS(superkart_api)  # enable cross-origin requests on all routes

# Load the trained model pipeline (preprocessing + model).
# The Google Drive path only exists inside Colab, so the model location is
# resolved in this order:
#   1. the MODEL_PATH environment variable, if set
#   2. a copy of the model file sitting next to this app.py (Docker image layout)
#   3. the original Google Drive path (running from the notebook)
import os

MODEL_FILENAME = "superkart_sales_forecast_model_v1_0.joblib"
_bundled_model_path = os.path.join(
    os.path.dirname(os.path.abspath(__file__)), MODEL_FILENAME
)
_drive_model_path = (
    "/content/drive/My Drive/Colab Notebooks/Project 7/deployment_files/"
    + MODEL_FILENAME
)
model_path = os.environ.get("MODEL_PATH") or (
    _bundled_model_path if os.path.exists(_bundled_model_path) else _drive_model_path
)
model = joblib.load(model_path)

# Health check route
@superkart_api.get('/')
def home():
    """Return a static greeting so callers can confirm the service is reachable."""
    greeting = "✅ Welcome to the SuperKart Sales Prediction API"
    return greeting

# Prediction route
@superkart_api.post('/v1/predict')
def predict_sales():
    """
    Predict sales for a single product/store combination.

    Expects a JSON object that contains every name in ``required_fields``.

    Returns:
        200 with ``{"Predicted_Sales": <number>}`` on success,
        400 with ``{"error": ...}`` when the payload is missing, malformed,
            incomplete or holds values that cannot be converted,
        500 with ``{"error": ...}`` when the model itself fails.
    """
    # silent=True yields None instead of raising on a missing or malformed body,
    # so a bad payload is answered with a JSON 400 below rather than a 500
    data = request.get_json(silent=True)
    print("Raw incoming data:", data)
    if not isinstance(data, dict):
        return jsonify({'error': "Request body must be a JSON object"}), 400

    # Validate expected fields
    # NOTE(review): the fitted pipeline printed in the notebook lists no
    # Store_Age_Years column — confirm whether the model actually uses it.
    required_fields = [
        'Product_Weight',
        'Product_Sugar_Content',
        'Product_Allocated_Area',
        'Product_MRP',
        'Store_Size',
        'Store_Location_City_Type',
        'Store_Type',
        'Store_Age_Years',
        'Product_Type_Category'
    ]
    missing_fields = [f for f in required_fields if f not in data]
    if missing_fields:
        return jsonify({'error': f"Missing fields: {missing_fields}"}), 400

    # Convert and transform input; conversion problems are the caller's fault (400)
    try:
        allocated_area = float(data['Product_Allocated_Area'])
        if allocated_area <= -1:
            # log1p is undefined at and below -1 and would feed inf/NaN to the model
            raise ValueError("Product_Allocated_Area must be greater than -1")

        sample = {
            'Product_Weight': float(data['Product_Weight']),
            'Product_Sugar_Content': data['Product_Sugar_Content'],
            'Product_Allocated_Area_Log': np.log1p(allocated_area),  # transform here
            'Product_MRP': float(data['Product_MRP']),
            'Store_Size': data['Store_Size'],
            'Store_Location_City_Type': data['Store_Location_City_Type'],
            'Store_Type': data['Store_Type'],
            'Store_Age_Years': int(data['Store_Age_Years']),
            'Product_Type_Category': data['Product_Type_Category']
        }
    except (TypeError, ValueError) as e:
        return jsonify({'error': f"Invalid field value: {e}"}), 400

    try:
        input_df = pd.DataFrame([sample])
        print("Transformed input for model:\n", input_df)

        # Make prediction (single row in, single value out)
        prediction = model.predict(input_df).tolist()[0]
        return jsonify({'Predicted_Sales': prediction})

    except Exception as e:
        # Top-level boundary: report any model failure as a JSON 500
        print("❌ Error during prediction:", str(e))
        return jsonify({'error': f"Prediction failed: {str(e)}"}), 500

# Run the app (for local testing only)
# NOTE: debug=True turns on Flask's debug mode, which must never be exposed
# publicly; the Docker image starts the app through gunicorn, which imports this
# module and therefore does not execute this branch.
if __name__ == '__main__':
    superkart_api.run(debug=True)

Overwriting /content/drive/My Drive/Colab Notebooks/Project 7/deployment_files/backend_files/app.py

# Write requirements file to my Google Drive
%%writefile "/content/drive/My Drive/Colab Notebooks/Project 7/deployment_files/backend_files/requirements.txt"
# Core libraries
pandas==2.2.2
numpy==2.0.2
scikit-learn==1.6.1
seaborn==0.13.2
joblib==1.4.2
xgboost==2.1.4

# Flask web server
flask==2.2.2
flask-cors==3.0.10
gunicorn==20.1.0
Werkzeug==2.2.2

# For API testing
requests==2.32.3

Overwriting /content/drive/My Drive/Colab Notebooks/Project 7/deployment_files/backend_files/requirements.txt

%%writefile "/content/drive/My Drive/Colab Notebooks/Project 7/deployment_files/backend_files/Dockerfile"
# Use a slim Python 3.9 base image
FROM python:3.9-slim

# Set the working directory inside the container
WORKDIR /app

# Copy all files from the current directory to the container's working directory
# NOTE(review): app.py loads the .joblib model as soon as it is imported, so the
# model file must be reachable inside the image — confirm it is uploaded with
# these files and that app.py points at it.
COPY . .

# Install dependencies from the requirements file without using cache to reduce image size
RUN pip install --no-cache-dir --upgrade -r requirements.txt

# Define the command to start the application using Gunicorn with 4 worker processes
# - `-w 4`: Uses 4 worker processes for handling requests
# - `-b 0.0.0.0:7860`: Binds the server to port 7860 on all network interfaces
# - `app:superkart_api`: Runs the Flask app (Flask app instance is named `superkart_api` inside app.py)
CMD ["gunicorn", "-w", "4", "-b", "0.0.0.0:7860", "app:superkart_api"]

Overwriting /content/drive/My Drive/Colab Notebooks/Project 7/deployment_files/backend_files/Dockerfile

# Import the login function from the huggingface_hub library
from huggingface_hub import login

# Never hard-code an access token in the notebook: every export or commit of the
# file leaks the credential. Read it from the HF_TOKEN environment variable, or
# prompt for it without echoing the input.
# NOTE: the token that used to be embedded in this cell is exposed and should be
# revoked in the Hugging Face account settings.
import os
from getpass import getpass

login(token=os.environ.get("HF_TOKEN") or getpass("Hugging Face token: "))

# Import the create_repo function from the huggingface_hub library
from huggingface_hub import create_repo

# Create the repository for the Hugging Face Space.
# exist_ok=True makes re-running this cell a no-op when the Space already exists,
# which is more reliable than searching the exception text for a class name.
try:
    create_repo(
        "ThomasH007/superkart-sales-forecast-backend2",  # Define the repo
        repo_type="space",  # We're creating a space
        space_sdk="docker",  # Because we're using a Docker backend
        private=False,  # Public Space; set to True to make it private
        exist_ok=True,
    )
except Exception as e:
    # Best-effort: report the failure (e.g. authentication or network problems)
    # without stopping the notebook
    print(f"❌ Error creating repository: {e}")

from huggingface_hub import HfApi, login

# Hugging Face access token (needs write access). It is read from the HF_TOKEN
# environment variable, or prompted for without echo, so that no credential is
# ever stored in the notebook.
import os
from getpass import getpass

access_key = os.environ.get("HF_TOKEN") or getpass("Hugging Face token: ")
repo_id = "ThomasH007/superkart-sales-forecast-backend2"  # Your Hugging Face Space ID

# Login to Hugging Face
login(token=access_key)

# Initialize the API
api = HfApi()

# Upload my files to the Hugging Face Space repo
api.upload_folder(
    folder_path="/content/drive/My Drive/Colab Notebooks/Project 7/deployment_files/backend_files",
    repo_id=repo_id,
    repo_type="space"
)

print("✅ Files uploaded to Hugging Face successfully.")

# NOTE: app.py loads the model from a Google Drive path, which does not exist
# inside the Space. After uploading, make sure the .joblib file is part of the
# Space repo and that app.py loads it from a path inside the repo.

✅ Files uploaded to Hugging Face successfully.

# Create a folder on Google Drive for storing the files needed for frontend UI deployment
# (exist_ok=True makes re-running this cell safe when the folder is already there)
drive_path = "/content/drive/My Drive/Colab Notebooks/Project 7/deployment_files/frontend_files"
os.makedirs(drive_path, exist_ok=True)

%%writefile "/content/drive/My Drive/Colab Notebooks/Project 7/deployment_files/frontend_files/app.py"

# Streamlit Web App for SuperKart Sales Forecasting
import streamlit as st
import requests
import numpy as np

# Add Logo (loaded from an external image URL)
st.image("https://i.postimg.cc/2yM4LgJM/Superkart-notebook-cover-image.png", width=400)

# App Title
st.title("🛒 SuperKart Sales Forecasting App")

# Instructions
st.markdown("🔍 Enter product and store attributes to forecast **monthly product sales revenue**.\n\n_All sales are reported in ($) USD._")

# User Inputs — one widget per field expected by the backend's /v1/predict route.
# Numeric inputs are restricted to non-negative values.
# NOTE(review): the select-box options presumably mirror the category values
# seen in training — verify against the dataset.
Product_Weight = st.number_input("Product Weight (oz)", min_value=0.0, value=12.66)
Product_Sugar_Content = st.selectbox("Product Sugar Content", ["Low Sugar", "Regular", "No Sugar"])
Product_Allocated_Area = st.number_input("Product Allocated Area (linear in.)", min_value=0.0, value=100.0)
Product_MRP = st.number_input("Maximum Retail Price (USD)", min_value=0.0, value=150.0)
Store_Size = st.selectbox("Store Size", ["Small", "Medium", "High"])
Store_Location_City_Type = st.selectbox("Store Location City Type", ["Tier 1", "Tier 2", "Tier 3"])
Store_Type = st.selectbox("Store Type", ["Supermarket Type1", "Supermarket Type2", "Departmental Store", "Food Mart"])
Store_Age_Years = st.slider("Store Age (years)", min_value=0, max_value=30, value=10)
Product_Type_Category = st.selectbox("Product Type Category", ["Perishables", "Non Perishables"])

# NOTE: the raw allocated area is sent as-is. The backend applies the log1p
# transform itself before calling the model, so it must not be applied here too.

# Prepare JSON payload for the backend (numeric values are sent as strings and
# converted back to numbers by the API)
product_data = {
    "Product_Weight": str(Product_Weight),
    "Product_Sugar_Content": Product_Sugar_Content,
    "Product_Allocated_Area": str(Product_Allocated_Area),
    "Product_MRP": str(Product_MRP),
    "Store_Size": Store_Size,
    "Store_Location_City_Type": Store_Location_City_Type,
    "Store_Type": Store_Type,
    "Store_Age_Years": str(Store_Age_Years),
    "Product_Type_Category": Product_Type_Category
}

# Trigger Prediction
if st.button("Predict", type='primary'):
    try:
        # requests has no default timeout; without one a stalled backend would
        # leave the app waiting indefinitely
        response = requests.post(
            "https://thomash007-superkart-sales-forecast-backend2.hf.space/v1/predict",
            json=product_data,
            timeout=60,
        )
        if response.status_code == 200:
            result = response.json()
            predicted_sales = result["Predicted_Sales"]
            st.success(f"📈 Predicted Monthly Sales: **${predicted_sales:,.2f} USD**")
        else:
            st.error("❌ API Error: Please verify input values or try again later.")
    except Exception as e:
        # Covers network failures and timeouts as well as unexpected response bodies
        st.error(f"⚠️ Connection error: {e}")

Overwriting /content/drive/My Drive/Colab Notebooks/Project 7/deployment_files/frontend_files/app.py

# Create the dependencies file for the front end
%%writefile "/content/drive/My Drive/Colab Notebooks/Project 7/deployment_files/frontend_files/requirements.txt"
requests==2.32.3
streamlit==1.45.0

Overwriting /content/drive/My Drive/Colab Notebooks/Project 7/deployment_files/frontend_files/requirements.txt

# Create the docker file for the front end
%%writefile "/content/drive/My Drive/Colab Notebooks/Project 7/deployment_files/frontend_files/Dockerfile"
# Use a minimal base image with Python 3.9 installed
FROM python:3.9-slim

# Set the working directory inside the container to /app
WORKDIR /app

# Copy all files from the current directory on the host to the container's /app directory
COPY . .

# Install Python dependencies listed in requirements.txt
RUN pip3 install -r requirements.txt

# Define the command to run the Streamlit app on port 7860 and make it accessible externally
CMD ["streamlit", "run", "app.py", "--server.port=7860", "--server.address=0.0.0.0", "--server.enableXsrfProtection=false"]

# NOTE: XSRF protection is disabled in the command above to allow easier external
# access (e.g. for batch predictions); this removes a request-forgery safeguard,
# so re-enable it if that access is not needed.

Overwriting /content/drive/My Drive/Colab Notebooks/Project 7/deployment_files/frontend_files/Dockerfile

# Upload files to my Hugging Face frontend Space
# The access token (write mode) is read from the HF_TOKEN environment variable,
# or prompted for without echo, so that no credential is stored in the notebook.
import os
from getpass import getpass

access_key = os.environ.get("HF_TOKEN") or getpass("Hugging Face token: ")
repo_id = "ThomasH007/superkart-sales-forecast-frontend"  # My Hugging Face space id

# Login to Hugging Face platform with the access token
login(token=access_key)

# Initialize the API
api = HfApi()

# Upload the Streamlit app files stored in the frontend_files folder
api.upload_folder(
    folder_path="/content/drive/My Drive/Colab Notebooks/Project 7/deployment_files/frontend_files/",  # Google Drive folder path
    repo_id=repo_id,  # Hugging face space id
    repo_type="space",  # Hugging face repo type "space"
)

CommitInfo(commit_url='https://huggingface.co/spaces/ThomasH007/superkart-sales-forecast-frontend/commit/fbe1af33bc72ecb8eb69cb03a71bfacc17993728', commit_message='Upload folder using huggingface_hub', commit_description='', oid='fbe1af33bc72ecb8eb69cb03a71bfacc17993728', pr_url=None, repo_url=RepoUrl('https://huggingface.co/spaces/ThomasH007/superkart-sales-forecast-frontend', endpoint='https://huggingface.co', repo_type='space', repo_id='ThomasH007/superkart-sales-forecast-frontend'), pr_revision=None, pr_num=None)

import json  # To handle JSON formatting for API requests and responses
import requests  # To send HTTP requests to the deployed Flask API

import pandas as pd  # For data manipulation and analysis
import numpy as np  # For numerical computations

model_root_url = "https://ThomasH007-superkart-sales-forecast-backend2.hf.space"  # Base URL of the deployed backend Flask API on Hugging Face Space

model_url = model_root_url + "/v1/predict"  # Endpoint for online inference

# Single-record test payload for the online prediction endpoint.
# NOTE(review): numeric fields are sent as strings; presumably the backend
# casts them before prediction - confirm against the Flask app.
payload = {
    "Product_Weight": "12.66",
    "Product_Sugar_Content": "Low Sugar",
    "Product_Allocated_Area": "100.00",
    "Product_MRP": "150.00",
    "Store_Size": "Small",
    "Store_Location_City_Type": "Tier 1",
    "Store_Type": "Food Mart",
    "Store_Age_Years": "10",
    "Product_Type_Category": "Perishables"
}

# This payload dictionary includes all the necessary features in the expected
# format for online forecast prediction, ensuring consistency with the
# model's training data.

# Sending a POST request to the model endpoint with the test payload.
# An explicit timeout (seconds, applied to both connect and read) stops the
# cell from hanging indefinitely if the backend is unreachable or unresponsive.
response = requests.post(model_url, json=payload, timeout=60)

# Display the response object (shows the HTTP status code in the notebook)
response

<Response [200]>

# Decode the JSON body returned by the backend and show the prediction
api_result = response.json()
print(api_result)

{'Predicted_Sales': 2670.9885999999997}

# Show the contents of the Project 7 folder on the mounted Google Drive
import os

project_dir = "/content/drive/MyDrive/Colab Notebooks/Project 7/"
os.listdir(project_dir)

['SuperKart.csv',
 'deployment_files',
 'Thomas_Hall_Full_Code_SuperKart_Model_Deployment_Notebook.ipynb']

# Export this notebook to HTML with nbconvert (shell escape), with the
# ClearMetadataPreprocessor enabled so notebook metadata is cleared first.
# The HTML file is written next to the .ipynb in the Project 7 folder.
!jupyter nbconvert --ClearMetadataPreprocessor.enabled=True \
--to html \
"/content/drive/MyDrive/Colab Notebooks/Project 7/Thomas_Hall_Full_Code_SuperKart_Model_Deployment_Notebook.ipynb"

[NbConvertApp] Converting notebook /content/drive/MyDrive/Colab Notebooks/Project 7/Thomas_Hall_Full_Code_SuperKart_Model_Deployment_Notebook.ipynb to html
[NbConvertApp] WARNING | Alternative text is missing on 33 image(s).
[NbConvertApp] Writing 3037563 bytes to /content/drive/MyDrive/Colab Notebooks/Project 7/Thomas_Hall_Full_Code_SuperKart_Model_Deployment_Notebook.html

# Offer the exported HTML report as a browser download (Colab only).
from google.colab import files

html_report_path = '/content/drive/MyDrive/Colab Notebooks/Project 7/Thomas_Hall_Full_Code_SuperKart_Model_Deployment_Notebook.html'
files.download(html_report_path)

	count	unique	top	freq	mean	std	min	25%	50%	75%	max
Product_Id	8763	8763	FD306	1	NaN	NaN	NaN	NaN	NaN	NaN	NaN
Product_Weight	8763.0	NaN	NaN	NaN	12.653792	2.21732	4.0	11.15	12.66	14.18	22.0
Product_Sugar_Content	8763	3	Low Sugar	4885	NaN	NaN	NaN	NaN	NaN	NaN	NaN
Product_Allocated_Area	8763.0	NaN	NaN	NaN	0.068786	0.048204	0.004	0.031	0.056	0.096	0.298
Product_Type	8763	16	Fruits and Vegetables	1249	NaN	NaN	NaN	NaN	NaN	NaN	NaN
Product_MRP	8763.0	NaN	NaN	NaN	147.032539	30.69411	31.0	126.16	146.74	167.585	266.0
Store_Id	8763	4	OUT004	4676	NaN	NaN	NaN	NaN	NaN	NaN	NaN
Store_Establishment_Year	8763.0	NaN	NaN	NaN	2002.032751	8.388381	1987.0	1998.0	2009.0	2009.0	2009.0
Store_Size	8763	3	Medium	6025	NaN	NaN	NaN	NaN	NaN	NaN	NaN
Store_Location_City_Type	8763	3	Tier 2	6262	NaN	NaN	NaN	NaN	NaN	NaN	NaN
Store_Type	8763	4	Supermarket Type2	4676	NaN	NaN	NaN	NaN	NaN	NaN	NaN
Product_Store_Sales_Total	8763.0	NaN	NaN	NaN	3464.00364	1065.630494	33.0	2761.715	3452.34	4145.165	8000.0

Store_Id	OUT001	OUT002	OUT003	OUT004
Product_Type
Snack Foods	806142.24	255317.57	918510.44	2009026.70
Fruits and Vegetables	792992.59	298503.56	897437.46	2311899.66
Dairy	598767.62	178888.18	715814.94	1318447.30
Frozen Foods	558556.81	180295.95	597608.42	1473519.65
Household	531371.38	184665.65	523981.64	1324721.50
Baking Goods	525131.04	169860.50	491908.20	1266086.26
Meat	505867.28	151800.01	520939.68	950604.97
Canned	449016.38	151467.66	452445.17	1247153.50
Health and Hygiene	435005.31	164660.81	439139.18	1124901.91
Soft Drinks	410548.69	103808.35	365046.30	917641.38
Hard Drinks	152920.74	54281.85	110760.30	307851.73
Others	123977.09	32835.73	159963.75	224719.73
Breads	121274.09	43419.47	175391.93	374856.75
Starchy Foods	120443.98	20044.98	143538.60	234746.89
Seafood	52936.84	17663.35	65337.48	136466.37
Breakfast	38161.10	23396.10	95634.08	204939.13

	count	unique	top	freq	mean	std	min	25%	50%	75%	max
Product_Id	1586	1586	NC7187	1	NaN	NaN	NaN	NaN	NaN	NaN	NaN
Product_Weight	1586.0	NaN	NaN	NaN	13.458865	2.064975	6.16	12.0525	13.96	14.95	17.97
Product_Sugar_Content	1586	3	Low Sugar	845	NaN	NaN	NaN	NaN	NaN	NaN	NaN
Product_Allocated_Area	1586.0	NaN	NaN	NaN	0.068768	0.047131	0.004	0.033	0.0565	0.094	0.295
Product_Type	1586	16	Snack Foods	202	NaN	NaN	NaN	NaN	NaN	NaN	NaN
Product_MRP	1586.0	NaN	NaN	NaN	160.514054	30.359059	71.35	141.72	168.32	182.9375	226.59
Store_Id	1586	1	OUT001	1586	NaN	NaN	NaN	NaN	NaN	NaN	NaN
Store_Establishment_Year	1586.0	NaN	NaN	NaN	1987.0	0.0	1987.0	1987.0	1987.0	1987.0	1987.0
Store_Size	1586	1	High	1586	NaN	NaN	NaN	NaN	NaN	NaN	NaN
Store_Location_City_Type	1586	1	Tier 2	1586	NaN	NaN	NaN	NaN	NaN	NaN	NaN
Store_Type	1586	1	Supermarket Type1	1586	NaN	NaN	NaN	NaN	NaN	NaN	NaN
Product_Store_Sales_Total	1586.0	NaN	NaN	NaN	3923.778802	904.62901	2300.56	3285.51	4139.645	4639.4	4997.63

	count	unique	top	freq	mean	std	min	25%	50%	75%	max
Product_Id	1152	1152	NC2769	1	NaN	NaN	NaN	NaN	NaN	NaN	NaN
Product_Weight	1152.0	NaN	NaN	NaN	9.911241	1.799846	4.0	8.7675	9.795	10.89	19.82
Product_Sugar_Content	1152	3	Low Sugar	658	NaN	NaN	NaN	NaN	NaN	NaN	NaN
Product_Allocated_Area	1152.0	NaN	NaN	NaN	0.067747	0.047567	0.006	0.031	0.0545	0.09525	0.292
Product_Type	1152	16	Fruits and Vegetables	168	NaN	NaN	NaN	NaN	NaN	NaN	NaN
Product_MRP	1152.0	NaN	NaN	NaN	107.080634	24.912333	31.0	92.8275	104.675	117.8175	224.93
Store_Id	1152	1	OUT002	1152	NaN	NaN	NaN	NaN	NaN	NaN	NaN
Store_Establishment_Year	1152.0	NaN	NaN	NaN	1998.0	0.0	1998.0	1998.0	1998.0	1998.0	1998.0
Store_Size	1152	1	Small	1152	NaN	NaN	NaN	NaN	NaN	NaN	NaN
Store_Location_City_Type	1152	1	Tier 3	1152	NaN	NaN	NaN	NaN	NaN	NaN	NaN
Store_Type	1152	1	Food Mart	1152	NaN	NaN	NaN	NaN	NaN	NaN	NaN
Product_Store_Sales_Total	1152.0	NaN	NaN	NaN	1762.942465	462.862431	33.0	1495.4725	1889.495	2133.6225	2299.63

	count	unique	top	freq	mean	std	min	25%	50%	75%	max
Product_Id	1349	1349	NC522	1	NaN	NaN	NaN	NaN	NaN	NaN	NaN
Product_Weight	1349.0	NaN	NaN	NaN	15.103692	1.893531	7.35	14.02	15.18	16.35	22.0
Product_Sugar_Content	1349	3	Low Sugar	750	NaN	NaN	NaN	NaN	NaN	NaN	NaN
Product_Allocated_Area	1349.0	NaN	NaN	NaN	0.068637	0.048708	0.004	0.031	0.057	0.094	0.298
Product_Type	1349	16	Snack Foods	186	NaN	NaN	NaN	NaN	NaN	NaN	NaN
Product_MRP	1349.0	NaN	NaN	NaN	181.358725	24.796429	85.88	166.92	179.67	198.07	266.0
Store_Id	1349	1	OUT003	1349	NaN	NaN	NaN	NaN	NaN	NaN	NaN
Store_Establishment_Year	1349.0	NaN	NaN	NaN	1999.0	0.0	1999.0	1999.0	1999.0	1999.0	1999.0
Store_Size	1349	1	Medium	1349	NaN	NaN	NaN	NaN	NaN	NaN	NaN
Store_Location_City_Type	1349	1	Tier 1	1349	NaN	NaN	NaN	NaN	NaN	NaN	NaN
Store_Type	1349	1	Departmental Store	1349	NaN	NaN	NaN	NaN	NaN	NaN	NaN
Product_Store_Sales_Total	1349.0	NaN	NaN	NaN	4946.966323	677.539953	3069.24	4355.39	4958.29	5366.59	8000.0

	Product_Id	Product_Weight	Product_Sugar_Content	Product_Allocated_Area	Product_Type	Product_MRP	Store_Id	Store_Establishment_Year	Store_Size	Store_Location_City_Type	Store_Type	Product_Store_Sales_Total
0	FD6114	12.66	Low Sugar	0.027	Frozen Foods	117.08	OUT004	2009	Medium	Tier 2	Supermarket Type2	2842.40
1	FD7839	16.54	Low Sugar	0.144	Dairy	171.43	OUT003	1999	Medium	Tier 1	Departmental Store	4830.02
2	FD5075	14.28	Regular	0.031	Canned	162.08	OUT001	1987	High	Tier 2	Supermarket Type1	4130.16
3	FD8233	12.10	Low Sugar	0.112	Baking Goods	186.31	OUT001	1987	High	Tier 2	Supermarket Type1	4132.18
4	NC1180	9.57	No Sugar	0.010	Health and Hygiene	123.67	OUT002	1998	Small	Tier 3	Food Mart	2279.36

	Product_Id	Product_Weight	Product_Sugar_Content	Product_Allocated_Area	Product_Type	Product_MRP	Store_Id	Store_Establishment_Year	Store_Size	Store_Location_City_Type	Store_Type	Product_Store_Sales_Total
8758	NC7546	14.80	No Sugar	0.016	Health and Hygiene	140.53	OUT004	2009	Medium	Tier 2	Supermarket Type2	3806.53
8759	NC584	14.06	No Sugar	0.142	Household	144.51	OUT004	2009	Medium	Tier 2	Supermarket Type2	5020.74
8760	NC2471	13.48	No Sugar	0.017	Health and Hygiene	88.58	OUT001	1987	High	Tier 2	Supermarket Type1	2443.42
8761	NC7187	13.89	No Sugar	0.193	Household	168.44	OUT001	1987	High	Tier 2	Supermarket Type1	4171.82
8762	FD306	14.73	Low Sugar	0.177	Snack Foods	224.93	OUT002	1998	Small	Tier 3	Food Mart	2186.08

	0
Product_Allocated_Area	1.128093
Product_Store_Sales_Total	0.092024
Product_MRP	0.036513
Product_Weight	0.017514
Store_Establishment_Year	-0.758061

	Random Forest Estimator	Random Forest Tuned	XGBoost	XGBoost Tuned
RMSE	106.523083	293.487667	136.324944	271.116684
MAE	40.007379	155.034843	62.233689	115.814980
R-squared	0.989994	0.924046	0.983612	0.935184
Adj. R-squared	0.989983	0.923959	0.983593	0.935110
MAPE	0.014839	0.055601	0.022171	0.045280

	Random Forest Estimator	Random Forest Tuned	XGBoost	XGBoost Tuned
RMSE	284.781996	315.238346	308.354445	297.858585
MAE	109.399474	168.180693	136.366693	129.831540
R-squared	0.928923	0.912907	0.916669	0.922245
Adj. R-squared	0.928597	0.912507	0.916287	0.921889
MAPE	0.039265	0.059355	0.049794	0.047845

	R-squared
Random Forest Estimator	0.061071
Random Forest Tuned	0.011139
XGBoost	0.066943
XGBoost Tuned	0.012939

Executive Summary¶

Problem Statement¶

Business Context¶

Objective¶

Data Description¶

Installing and Importing the necessary libraries¶

Connect to Google Drive¶

Loading the dataset¶

Data Overview¶

View the first and last 5 rows of the dataset¶

View the shape of the dataset¶

Dataset Schema and Null Values¶

Check for duplicate data¶

Check unique values per column¶

Data Cleansing for Product_Sugar_Content Prior to EDA¶

Value Counts for Categorical Columns¶

Statistical Summary of the Dataset¶

Exploratory Data Analysis (EDA)¶

Section: Univariate Analysis — Numerical Features¶

Product_Weight - Univariate Analysis¶

Product_Allocated_Area - Univariate Analysis¶

Product_MRP - Univariate Analysis¶

Store_Establishment_Year - Univariate Analysis¶

Section: Univariate Analysis — Categorical Features¶

Product_Sugar_Content¶

Product_Type - Univariate Analysis¶

Store_Size - Univariate Analysis¶

Store_Location_City_Type - Univariate Analysis¶

Store_Type - Univariate Analysis¶

Section: Bivariate Analysis¶

Visualizing Relationships Between Numeric Features and Sales¶

List of numerical features in the dataset (excluding ID columns)¶

Checking the distribution of our target variable against categorical variables for additional insight.¶

List of Categorical features in the dataset (excluding ID columns)¶

Define a utility function for our Boxplots and Barplots for our categorical features¶

- Display Which Store Types Drive the Highest Sales Revenue?¶

- Display Which Store Sizes Drive the Highest Sales Revenue?¶

- Display Which Store Location City Types Drive the Highest Sales Revenue?¶

- Display Which Product Types Drive the Highest Sales Revenue?¶

- Display Whether Product_Sugar_Content Drives Sales Revenue?¶

- Display Which Store Drives the Most Sales Revenue?¶

- Let's view the details of each store, sorted by average total sales per store¶

- Let's view the total sales revenue by Product Type per Store¶

- Let's view the same data in a Heatmap¶

Let's do a deeper analysis of each store¶

Store OUT001¶

Store OUT002¶

Store OUT003¶

Store OUT004¶

Final Data Checks¶

1. Check for Class Imbalance (Categorical Features)¶

2. Check for Outliers (Numerical Features)¶

3. Re-Check for any missing data¶

4. Skewness Check for Target and Predictors¶

Data Preprocessing¶

Transform Product_Allocated_Area to adjust for skewness¶

Transform Product_Sugar_Content column¶

Address the high number of Product Types (16) to see if we can group them.¶

Data Preparation for Modeling¶

Drop Unnecessary Columns¶

Separating Features and Target¶

Train-Test Split (80:20)¶

Data Pre-processing Pipeline¶

Model Building¶

Define functions for Model Evaluation¶

Random Forest Regressor - Model Training Pipeline - Base Model¶

Create a Pipeline with Preprocessing and Model¶

Fit the Pipeline¶

Evaluate Model Performance on Training and Test¶

Random Forest Regressor - Hyperparameter Tuning¶

Evaluate Model Performance on Training and Test for the Hypertuned Random Forest Model¶

XGBoost Regressor - Model Training Pipeline¶

Create a Pipeline with Preprocessing and Model¶

Fit the Pipeline¶

Evaluate Model Performance on Training and Test¶

XGBoost Regressor - Hyperparameter Tuning¶

Evaluate Model Performance on Training and Test for the Hypertuned XGBoost Model¶

Model Performance Comparison and Final Model Selection¶

Model Serialization & Sanity Check¶

Deployment - Backend¶

Data Cleansing for `Product_Sugar_Content` Prior to EDA¶

- Display Whether `Product_Sugar_Content` Drives Sales Revenue?¶