Sentiment_analysis / visualization /utils /learning_paths_utils.py
Danialebrat's picture
Adding Learning path page and improving HelpScout dashboard
599973c
"""
Learning Paths utility helpers — pure functions, no Streamlit dependency.
"""
import json
from pathlib import Path
from typing import Dict, List, Optional, Tuple
import pandas as pd
# ── Config helpers ─────────────────────────────────────────────────────────────
def load_lp_config(config_path: Optional[str] = None) -> dict:
    """
    Load the ``learning_paths`` section of the visualization config.

    Args:
        config_path: Path to a JSON config file. When ``None``, the file
            ``config/viz_config.json`` located one directory above this
            module's directory is used.

    Returns:
        The value stored under the top-level ``"learning_paths"`` key, or an
        empty dict when that key is absent.

    Raises:
        OSError: If the config file cannot be opened.
        json.JSONDecodeError: If the file does not contain valid JSON.
    """
    if config_path is None:
        config_path = Path(__file__).resolve().parent.parent / "config" / "viz_config.json"
    # Decode explicitly as UTF-8 instead of relying on the platform's
    # default encoding, which differs between operating systems.
    with open(config_path, encoding="utf-8") as f:
        return json.load(f).get("learning_paths", {})
def get_brands(config: dict) -> List[str]:
    """Return the value stored under ``"brands"`` in config, or ``[]`` when the key is absent."""
    if "brands" in config:
        return config["brands"]
    return []
def get_brand_color(brand: str, config: dict) -> str:
    """Look up the colour configured for brand under ``"brand_colors"``.

    Falls back to ``"#607D8B"`` when either the mapping or the brand
    entry is missing.
    """
    palette = config.get("brand_colors", {})
    return palette.get(brand, "#607D8B")
def label_for_path(path_id, config: dict) -> str:
    """Return the display label for a learning path.

    The id is converted with ``str()`` before it is looked up under
    ``"path_labels"``; when no label is configured the result is
    ``"Path <path_id>"``.
    """
    fallback = f"Path {path_id}"
    labels = config.get("path_labels", {})
    lookup_key = str(path_id)
    return labels.get(lookup_key, fallback)
# ── DataFrame merge helpers ────────────────────────────────────────────────────
def merge_lesson_metrics(
    lesson_map: pd.DataFrame,
    per_path_df: pd.DataFrame,
    video_df: pd.DataFrame,
    sentiment_df: pd.DataFrame,
) -> pd.DataFrame:
    """
    Join all lesson-level metric DataFrames into one tidy frame.

    ``lesson_map`` supplies the base rows and every metric frame is
    left-joined onto it, so lessons without metrics are kept. A metric frame
    is skipped when it is empty or does not carry its join-key columns;
    metric columns that are absent from a frame are simply not added.

    Args:
        lesson_map: Must contain ``brand``, ``learning_path_id``,
            ``first_lesson_content_id``, ``lesson_order``,
            ``lesson_content_id`` and ``content_title``.
        per_path_df: Per-path completion metrics, keyed by
            ``(learning_path_id, content_id)``.
        video_df: Video metrics, keyed by ``(learning_path_id, content_id)``.
        sentiment_df: Comment sentiment metrics, keyed by
            ``(learning_path_id, lesson_order)``.

    Returns:
        The merged frame sorted by ``(learning_path_id, lesson_order)`` with
        a fresh 0..n-1 index, or an empty frame if ``lesson_map`` is empty.
        Count columns have nulls replaced by 0 and are cast to ``int``;
        rate and score columns keep NaN where no metric row matched.

    Raises:
        KeyError: If ``lesson_map`` lacks one of its required columns.
    """
    if lesson_map.empty:
        return pd.DataFrame()

    base = lesson_map[["brand", "learning_path_id", "first_lesson_content_id",
                       "lesson_order", "lesson_content_id", "content_title"]].copy()

    content_key = ["learning_path_id", "lesson_content_id"]
    order_key = ["learning_path_id", "lesson_order"]

    # Each entry: (frame, join keys, optional metric columns,
    #              whether the frame identifies lessons by a raw "content_id").
    sources = [
        (per_path_df, content_key,
         ["lesson_number", "students_completed", "denominator_students",
          "completion_rate"],
         True),
        (video_df, content_key,
         ["total_starts", "total_completions", "video_completion_rate"],
         True),
        (sentiment_df, order_key,
         ["total_comments", "very_positive", "positive", "neutral",
          "negative", "very_negative", "avg_sentiment_score"],
         False),
    ]

    for frame, keys, metrics, uses_content_id in sources:
        if frame.empty:
            continue
        if uses_content_id:
            if "content_id" not in frame.columns:
                continue
            frame = frame.rename(columns={"content_id": "lesson_content_id"})
        # Without the full join key the merge would raise KeyError; treat the
        # frame like any other unavailable metric source and skip it.
        if any(key not in frame.columns for key in keys):
            continue
        keep = keys + [c for c in metrics if c in frame.columns]
        base = base.merge(frame[keep], on=keys, how="left")

    # Counts default to 0 for lessons with no matching metric row. Rates and
    # scores are deliberately left as NaN so "no data" is not read as zero.
    for col in ["students_completed", "denominator_students", "total_starts",
                "total_completions", "total_comments",
                "very_positive", "positive", "neutral", "negative", "very_negative"]:
        if col in base.columns:
            base[col] = base[col].fillna(0).astype(int)

    base = base.sort_values(["learning_path_id", "lesson_order"])
    return base.reset_index(drop=True)
def merge_method_wide(
    method_df: pd.DataFrame,
    video_df: pd.DataFrame,
    sentiment_df: pd.DataFrame,
    config: dict,
) -> pd.DataFrame:
    """
    Build the method-wide lesson frame from method-wide completion data.

    ``method_df`` supplies the base rows (its ``content_id`` column is
    renamed to ``lesson_content_id``); video and sentiment metrics are
    left-joined onto it when their join-key columns are available on both
    sides. A ``path_label`` column is added from ``config``.

    Args:
        method_df: Method-wide completion rows. Must contain
            ``learning_path_id`` and ``method_lesson_number``; the latter is
            used as the sort key of the result.
        video_df: Video metrics, keyed by ``(learning_path_id, content_id)``.
        sentiment_df: Comment sentiment metrics, keyed by
            ``(learning_path_id, lesson_order)``.
        config: Learning-paths config passed to ``label_for_path``.

    Returns:
        The merged frame sorted by ``method_lesson_number`` with a fresh
        0..n-1 index, or an empty frame if ``method_df`` is empty. Count
        columns have nulls replaced by 0 and are cast to ``int``.

    Raises:
        KeyError: If ``method_df`` lacks ``learning_path_id`` or
            ``method_lesson_number``.
    """
    if method_df.empty:
        return pd.DataFrame()

    base = method_df.rename(columns={"content_id": "lesson_content_id"}).copy()

    content_key = ["learning_path_id", "lesson_content_id"]
    if not video_df.empty and "content_id" in video_df.columns:
        vd = video_df.rename(columns={"content_id": "lesson_content_id"})
        # Join only when both sides carry the full key; otherwise the merge
        # would raise KeyError instead of just omitting the video metrics.
        if all(k in vd.columns and k in base.columns for k in content_key):
            keep = content_key + [
                c for c in ["total_starts", "total_completions",
                            "video_completion_rate"]
                if c in vd.columns
            ]
            base = base.merge(vd[keep], on=content_key, how="left")

    order_key = ["learning_path_id", "lesson_order"]
    if not sentiment_df.empty and all(
        k in sentiment_df.columns and k in base.columns for k in order_key
    ):
        keep = order_key + [
            c for c in ["total_comments", "very_positive", "positive",
                        "neutral", "negative", "very_negative",
                        "avg_sentiment_score"]
            if c in sentiment_df.columns
        ]
        base = base.merge(sentiment_df[keep], on=order_key, how="left")

    # Add path label
    base["path_label"] = base["learning_path_id"].apply(
        lambda pid: label_for_path(pid, config)
    )

    # Counts default to 0 where no metric row matched; rates and scores are
    # left as NaN.
    for col in ["students_completed", "total_starts", "total_completions",
                "total_comments", "very_positive", "positive", "neutral",
                "negative", "very_negative"]:
        if col in base.columns:
            base[col] = base[col].fillna(0).astype(int)

    base = base.sort_values("method_lesson_number")
    return base.reset_index(drop=True)
# ── Analysis helpers ───────────────────────────────────────────────────────────
def find_top_dropoffs(df: pd.DataFrame, n: int = 5,
                      rate_col: str = "completion_rate",
                      order_col: str = "lesson_order") -> pd.DataFrame:
    """
    Find the lessons where the completion rate falls the most.

    Rows are ordered by ``order_col`` within each ``learning_path_id`` and
    each row's rate is compared with the row before it in the same path.
    Only strictly positive drops are kept, and the ``n`` largest are
    returned with a fresh index. An empty frame is returned when ``df`` is
    empty or has no ``rate_col`` column.
    """
    if df.empty or rate_col not in df.columns:
        return pd.DataFrame()

    ordered = df.copy().sort_values(["learning_path_id", order_col])
    per_path_rates = ordered.groupby("learning_path_id")[rate_col]
    ordered["prev_rate"] = per_path_rates.shift(1)
    ordered["dropoff"] = ordered["prev_rate"] - ordered[rate_col]

    # The first lesson of each path has no predecessor, so its drop is null.
    is_drop = ordered["dropoff"].notna() & (ordered["dropoff"] > 0)
    drops = ordered[is_drop]

    wanted = ["learning_path_id", order_col, "content_title",
              "prev_rate", rate_col, "dropoff"]
    visible = [name for name in wanted if name in drops.columns]

    top = drops.nlargest(n, "dropoff")
    return top[visible].reset_index(drop=True)
def get_overview_kpis(merged: pd.DataFrame) -> dict:
    """
    Return a dict of high-level KPI values from the merged metrics frame.

    Args:
        merged: Merged lesson metrics. Every column is optional.

    Returns:
        ``{}`` when ``merged`` is empty, otherwise a dict with:

        - ``total_students``: max of ``denominator_students``
        - ``avg_completion_pct``: mean of ``completion_rate`` times 100
        - ``avg_sentiment_score``: mean of ``avg_sentiment_score``
        - ``total_comments``: sum of ``total_comments``
        - ``n_paths``: number of distinct ``learning_path_id`` values
        - ``n_lessons``: number of rows

        A KPI whose source column is missing or entirely null falls back
        to 0 (or 0.0).
    """
    if merged.empty:
        return {}

    def _aggregate(column, func):
        """Return func(merged[column]), or None if the column is absent or all-null."""
        if column not in merged.columns:
            return None
        value = func(merged[column])
        # max()/mean() of an all-null column is NaN: int(NaN) raises and a
        # NaN KPI would be displayed as "nan".
        return None if pd.isna(value) else value

    max_students = _aggregate("denominator_students", pd.Series.max)
    mean_completion = _aggregate("completion_rate", pd.Series.mean)
    mean_sentiment = _aggregate("avg_sentiment_score", pd.Series.mean)
    comment_sum = _aggregate("total_comments", pd.Series.sum)

    total_students = int(max_students) if max_students is not None else 0
    avg_completion = float(mean_completion) if mean_completion is not None else 0.0
    avg_sentiment = float(mean_sentiment) if mean_sentiment is not None else 0.0
    total_comments = int(comment_sum) if comment_sum is not None else 0
    n_paths = merged["learning_path_id"].nunique() if "learning_path_id" in merged.columns else 0

    return {
        "total_students": total_students,
        "avg_completion_pct": avg_completion * 100,
        "avg_sentiment_score": avg_sentiment,
        "total_comments": total_comments,
        "n_paths": n_paths,
        "n_lessons": len(merged),
    }
def filter_by_paths(df: pd.DataFrame,
                    path_ids: Optional[List[int]]) -> pd.DataFrame:
    """Keep only the rows whose ``learning_path_id`` is in path_ids.

    ``df`` itself is returned unchanged when path_ids is ``None`` or empty,
    when ``df`` is empty, or when it has no ``learning_path_id`` column.
    Otherwise a filtered frame with a fresh index is returned.
    """
    # No selection means "all paths".
    if not path_ids:
        return df
    if df.empty or "learning_path_id" not in df.columns:
        return df

    selected = df["learning_path_id"].isin(path_ids)
    return df.loc[selected].reset_index(drop=True)
def short_title(title: Optional[str], max_len: int = 35) -> str:
    """Truncate a content title for display in labels.

    Args:
        title: The title to shorten. ``None``, NaN, ``pd.NA`` and the empty
            string are treated as missing.
        max_len: Maximum number of title characters to keep.

    Returns:
        An em dash for a missing title. Otherwise the stripped title, cut to
        ``max_len`` characters and followed by an ellipsis when it was
        longer than ``max_len``.
    """
    # pd.NA is checked by identity first: evaluating ``not pd.NA`` raises
    # TypeError, so it must never reach the truthiness test.
    if title is None or title is pd.NA or not title or pd.isna(title):
        return "—"
    t = str(title).strip()
    return t if len(t) <= max_len else t[:max_len] + "…"