Streamlit Demo of the Machine Learning Model

By Jose R. Zapata

Last updated: May 8, 2025

In the field of data science, effectively communicating insights and results is as crucial as the analysis itself. A well-crafted demo allows stakeholders, including non-technical users, to interact with machine learning models and data-driven insights in an intuitive manner.

There are several Python-based frameworks for this purpose, such as Streamlit, Taipy, and Gradio, among others. These tools provide a simple way to create interactive web applications that showcase the core functionalities of a data science project.

Creating a demo is a key step in bridging the gap between machine learning models and real-world usability. It fosters engagement, improves interpretability, and ensures that stakeholders can derive meaningful insights from data science projects. By leveraging interface tools, data scientists can transform complex analyses into user-friendly applications that drive impact and decision-making.

import os

import pandas as pd
import streamlit as st
from joblib import load
from sklearn.pipeline import Pipeline

# https://docs.streamlit.io/library/api-reference

# HOW TO RUN THE APP:
# streamlit run notebooks/7-deploy/titanic-streamlit-batch.py


def get_user_data() -> pd.DataFrame:
    """
    Collect passenger attributes from the Streamlit widgets and return them
    as a single-row DataFrame shaped like the Kaggle Titanic dataset.

    :return: one-row DataFrame with the preprocessed user inputs
    """
    raw: dict = {}

    left, right = st.columns(2)
    with left:
        raw["age"] = st.number_input(
            label="Age:", min_value=0, max_value=100, value=20, step=1
        )
        raw["sibsp"] = st.slider(
            label="Number of siblings and spouses aboard:",
            min_value=0,
            max_value=15,
            value=3,
            step=1,
        )
    with right:
        raw["fare"] = st.number_input(
            label="How much did your ticket cost you?:",
            min_value=0,
            max_value=300,
            value=80,
            step=1,
        )
        raw["parch"] = st.slider(
            label="Number of parents and children aboard:",
            min_value=0,
            max_value=15,
            value=3,
            step=1,
        )

    radio_col1, radio_col2, radio_col3 = st.columns(3)

    with radio_col1:
        raw["pclass"] = st.radio(
            label="Ticket class:", options=["1st", "2nd", "3rd"], horizontal=False
        )
    with radio_col2:
        raw["sex"] = st.radio(label="Sex:", options=["Woman", "Man"], horizontal=False)
    with radio_col3:
        raw["embarked"] = st.radio(
            label="Port of Embarkation:",
            options=["Cherbourg", "Queenstown", "Southampton"],
            index=1,
        )

    frame = pd.DataFrame.from_dict(raw, orient="index").T

    # Translate the human-friendly widget labels into the encoding used by
    # the Kaggle competition data (the format the model was trained on).
    translations = {
        "sex": {"Man": "male", "Woman": "female"},
        "pclass": {"1st": 1, "2nd": 2, "3rd": 3},
        "embarked": {"Cherbourg": "C", "Queenstown": "Q", "Southampton": "S"},
    }
    for column, mapping in translations.items():
        frame[column] = frame[column].map(mapping)

    return frame


def preprocess_batch_data(df: pd.DataFrame) -> pd.DataFrame:
    """
    Preprocess batch data from a CSV file to match the format expected by the model.

    Accepts common spellings for the categorical columns ("Man"/"male"/"m",
    "1st"/"first"/"1", "Cherbourg"/"c", ...) and coerces the numeric columns.
    Unrecognized categorical values are passed through unchanged; unparsable
    numeric values become NaN (``errors="coerce"``).

    Args:
        df (pd.DataFrame): The original dataframe from CSV

    Returns:
        pd.DataFrame: Preprocessed dataframe ready for prediction
    """
    # Create a copy to avoid modifying the original
    processed_df = df.copy()

    # Convert sex values if needed.
    # BUG FIX: wrap in str() like the other branches so non-string cells
    # (e.g. NaN from empty CSV fields) do not crash on .lower().
    if "sex" in processed_df.columns:
        processed_df["sex"] = processed_df["sex"].map(
            lambda x: "male"
            if str(x).lower() in ["man", "male", "m"]
            else "female"
            if str(x).lower() in ["woman", "female", "f"]
            else x
        )

    # Convert pclass values if needed (only when the column is still text)
    if "pclass" in processed_df.columns and processed_df["pclass"].dtype == "object":
        pclass_mapping = {
            "1st": 1,
            "2nd": 2,
            "3rd": 3,
            "first": 1,
            "second": 2,
            "third": 3,
            "1": 1,
            "2": 2,
            "3": 3,
        }
        processed_df["pclass"] = processed_df["pclass"].map(
            lambda x: pclass_mapping.get(str(x).lower(), x)
        )
        # Force a numeric dtype; anything unmapped becomes NaN
        processed_df["pclass"] = pd.to_numeric(processed_df["pclass"], errors="coerce")

    # Convert embarked values if needed (full port name or single-letter code)
    if "embarked" in processed_df.columns:
        embarked_mapping = {
            "cherbourg": "C",
            "queenstown": "Q",
            "southampton": "S",
            "c": "C",
            "q": "Q",
            "s": "S",
        }
        processed_df["embarked"] = processed_df["embarked"].map(
            lambda x: embarked_mapping.get(str(x).lower(), x)
        )

    # Ensure required columns exist and have numeric types
    numeric_columns = ["age", "sibsp", "parch", "fare"]
    for col in numeric_columns:
        if col in processed_df.columns:
            processed_df[col] = pd.to_numeric(processed_df[col], errors="coerce")

    return processed_df


@st.cache_resource
def load_model(model_file_path: str) -> Pipeline:
    """
    Load a serialized scikit-learn pipeline (.joblib file) from disk.

    Cached with ``st.cache_resource`` so the model is deserialized only once
    per Streamlit session.

    Args:
        model_file_path (str): Path to the trained model saved with joblib.

    Returns:
        Pipeline: The deserialized scikit-learn Pipeline object.
    """
    with st.spinner("Loading model..."):
        pipeline = load(model_file_path)
    return pipeline


def individual_prediction_tab(model: Pipeline) -> None:
    """
    Render the single-passenger form and display the survival prediction.

    Args:
        model (Pipeline): The trained model
    """
    # collect the passenger attributes from the form widgets
    features = get_user_data()

    # predict the outcome (0 = did not survive, 1 = survived)
    outcome = model.predict(features)[0]

    faces = ["😕", "😀"]

    st.write("")
    st.title(f"Chance to survive! {faces[outcome]}")
    if outcome:
        st.success("Congratulations! You can rest assured, you will be fine! 🤩")
    else:
        st.error("Bad news my friend, you will be food for sharks! 🦈")


def batch_prediction_tab(model: Pipeline) -> None:
    """
    Display the batch prediction interface and make predictions for a CSV file.

    The uploaded CSV is normalized with ``preprocess_batch_data`` before being
    fed to the model, so user-friendly spellings ("Man", "1st", "Cherbourg")
    are accepted.

    Args:
        model (Pipeline): The trained model
    """
    st.subheader("Upload your CSV file with passenger data")

    # File uploader
    uploaded_file = st.file_uploader("Choose a CSV file", type="csv")

    if uploaded_file is not None:
        # Load and display the data
        try:
            df = pd.read_csv(uploaded_file)
            st.write("Preview of uploaded data:")
            st.dataframe(df.head())

            # Check if required columns exist
            required_cols = [
                "pclass",
                "sex",
                "age",
                "sibsp",
                "parch",
                "fare",
                "embarked",
            ]
            missing_cols = [col for col in required_cols if col not in df.columns]

            if missing_cols:
                st.warning(
                    f"Warning: Your data is missing these columns: {', '.join(missing_cols)}"
                )
                st.info("Required columns: pclass, sex, age, sibsp, parch, fare, embarked")
            # Add a button to make predictions
            elif st.button("Predict Survival"):
                with st.spinner("Processing data and making predictions..."):
                    # BUG FIX: normalize the raw CSV values first — previously
                    # the raw dataframe went straight to the model, so columns
                    # like pclass="1st" or sex="Man" failed or mispredicted.
                    processed_df = preprocess_batch_data(df)

                    # Make predictions on the normalized data
                    predictions = model.predict(processed_df)

                    # Add predictions to a copy of the ORIGINAL dataframe so
                    # the user sees their own values alongside the results
                    result_df = df.copy()
                    result_df["Predicted_Survival"] = predictions
                    result_df["Survival_Status"] = result_df["Predicted_Survival"].map(
                        {
                            0: "Did not survive",
                            1: "Survived",
                        }
                    )

                    # Display results
                    st.success("Predictions completed!")
                    st.subheader("Prediction Results")
                    st.dataframe(result_df)

                    # Calculate survival rate
                    survival_rate = predictions.mean() * 100
                    st.metric("Overall Survival Rate", f"{survival_rate:.2f}%")

                    # Option to download results
                    csv = result_df.to_csv(index=False)
                    st.download_button(
                        label="Download results as CSV",
                        data=csv,
                        file_name="titanic_predictions.csv",
                        mime="text/csv",
                    )
        except Exception as e:
            # Top-level UI boundary: surface the error to the user rather
            # than crashing the app
            st.error(f"Error processing the file: {e}")
            st.info("Please make sure your CSV file is properly formatted.")
    else:
        st.info("Please upload a CSV file with passenger information.")

        # Show sample format
        st.subheader("Sample CSV format:")
        sample_data = pd.DataFrame(
            {
                "pclass": [1, 2, 3],
                "sex": ["female", "male", "female"],
                "age": [29, 35, 15],
                "sibsp": [0, 1, 0],
                "parch": [0, 0, 1],
                "fare": [211.3, 26.0, 7.75],
                "embarked": ["S", "C", "Q"],
            }
        )
        st.dataframe(sample_data)


def main() -> None:
    """
    Entry point of the Streamlit app: resolve the project paths, load the
    trained pipeline, and render the individual and batch prediction tabs.
    """
    # choose the trained model you want to use to make predictions
    model_name = "titanic_classification-random_forest-v1.joblib"

    # Project root is three levels above this file
    # (<project>/notebooks/7-deploy/<this file>).
    # BUG FIX: use os.path instead of splitting on "/" so the path also
    # resolves correctly on Windows.
    this_file_path = os.path.abspath(__file__)
    project_path = os.path.dirname(os.path.dirname(os.path.dirname(this_file_path)))

    # display an image of the Titanic
    # NOTE(review): this path is relative to the working directory, so the app
    # should be launched from the project root (as the run command above does)
    st.image("notebooks/7-deploy/images/RMS_Titanic.jpg")

    # title
    st.header(body="Would you have survived the Titanic?🚢")

    # load the model (cached across reruns by @st.cache_resource)
    model = load_model(
        model_file_path=os.path.join(project_path, "models", model_name)
    )

    # Create tabs for individual and batch prediction
    tab1, tab2 = st.tabs(["Individual Prediction", "Batch Prediction"])

    with tab1:
        individual_prediction_tab(model)

    with tab2:
        batch_prediction_tab(model)

# Run the app only when this file is executed directly
# (e.g. `streamlit run notebooks/7-deploy/titanic-streamlit-batch.py`)
if __name__ == "__main__":
    main()

📖 References