Spaces:

MusoraProductDepartment
/

Sentiment_analysis

Sleeping

App Files Files Community

Sentiment_analysis / visualization /utils /learning_paths_utils.py

Danialebrat

Adding Learning path page and improving HelpScout dashboard

599973c 9 days ago

raw

history blame contribute delete

8.1 kB

	"""
	Learning Paths utility helpers — pure functions, no Streamlit dependency.
	"""
	import json
	from pathlib import Path
	from typing import Dict, List, Optional, Tuple

	import pandas as pd


	# ── Config helpers ─────────────────────────────────────────────────────────────

	def load_lp_config(config_path: Optional[str] = None) -> dict:
	    """
	    Load the ``learning_paths`` section of the visualization config file.

	    Args:
	        config_path: Path to the JSON config file. When ``None``, the file
	            ``config/viz_config.json`` located one directory above this
	            module's own directory is used.

	    Returns:
	        The value stored under the top-level ``"learning_paths"`` key, or an
	        empty dict when that key is absent.

	    Raises:
	        OSError: If the file cannot be opened.
	        json.JSONDecodeError: If the file is not valid JSON.
	    """
	    if config_path is None:
	        config_path = Path(__file__).resolve().parent.parent / "config" / "viz_config.json"
	    # Read as UTF-8 explicitly instead of relying on the platform's default
	    # encoding, which differs between operating systems.
	    with open(config_path, encoding="utf-8") as f:
	        return json.load(f).get("learning_paths", {})


	def get_brands(config: dict) -> List[str]:
	    """Return the list stored under ``"brands"``, or an empty list if absent."""
	    brands = config.get("brands", [])
	    return brands


	def get_brand_color(brand: str, config: dict) -> str:
	    """Look up ``brand`` in ``config["brand_colors"]``; fall back to ``#607D8B``."""
	    palette = config.get("brand_colors", {})
	    return palette.get(brand, "#607D8B")


	def label_for_path(path_id, config: dict) -> str:
	    """
	    Return the display label for a learning path.

	    The id is converted to ``str`` before the lookup in
	    ``config["path_labels"]``; unknown ids yield ``"Path <path_id>"``.
	    """
	    labels = config.get("path_labels", {})
	    fallback = f"Path {path_id}"
	    return labels.get(str(path_id), fallback)


	# ── DataFrame merge helpers ────────────────────────────────────────────────────

	def merge_lesson_metrics(
	    lesson_map: pd.DataFrame,
	    per_path_df: pd.DataFrame,
	    video_df: pd.DataFrame,
	    sentiment_df: pd.DataFrame,
	) -> pd.DataFrame:
	    """
	    Left-join the lesson-level metric frames onto ``lesson_map``.

	    ``per_path_df`` and ``video_df`` are joined on
	    (learning_path_id, lesson_content_id) after their ``content_id`` column is
	    renamed to ``lesson_content_id``; ``sentiment_df`` is joined on
	    (learning_path_id, lesson_order). A metric frame that is empty, lacks
	    ``content_id`` (for the first two), or lacks one of its join-key columns
	    is skipped instead of raising ``KeyError``.

	    Args:
	        lesson_map: Must contain the columns brand, learning_path_id,
	            first_lesson_content_id, lesson_order, lesson_content_id and
	            content_title.
	        per_path_df: Per-path completion metrics.
	        video_df: Video start/completion metrics.
	        sentiment_df: Comment sentiment metrics.

	    Returns:
	        One row per ``lesson_map`` row (assuming the metric frames hold at
	        most one row per join key), sorted by (learning_path_id,
	        lesson_order) with a fresh 0..n-1 index. Count columns have nulls
	        replaced by 0 and are cast to int; rate/score columns keep NaN.
	        Returns an empty frame if ``lesson_map`` is empty.
	    """
	    if lesson_map.empty:
	        return pd.DataFrame()

	    base = lesson_map[["brand", "learning_path_id", "first_lesson_content_id",
	                       "lesson_order", "lesson_content_id", "content_title"]].copy()

	    content_key = ["learning_path_id", "lesson_content_id"]
	    order_key = ["learning_path_id", "lesson_order"]
	    content_rename = {"content_id": "lesson_content_id"}

	    # (frame, columns to rename first, join keys, optional metric columns)
	    sources = (
	        (per_path_df, content_rename, content_key,
	         ["lesson_number", "students_completed",
	          "denominator_students", "completion_rate"]),
	        (video_df, content_rename, content_key,
	         ["total_starts", "total_completions", "video_completion_rate"]),
	        (sentiment_df, {}, order_key,
	         ["total_comments", "very_positive", "positive", "neutral",
	          "negative", "very_negative", "avg_sentiment_score"]),
	    )
	    for frame, renames, keys, metric_cols in sources:
	        # The columns to be renamed must exist, otherwise the source is unusable.
	        if frame.empty or any(src not in frame.columns for src in renames):
	            continue
	        frame = frame.rename(columns=renames)
	        # Merging on a key the frame does not have would raise KeyError.
	        if any(key not in frame.columns for key in keys):
	            continue
	        wanted = keys + [c for c in metric_cols if c in frame.columns]
	        base = base.merge(frame[wanted], on=keys, how="left")

	    # Counts: a lesson with no matching metric row means zero, not unknown.
	    count_cols = ["students_completed", "denominator_students", "total_starts",
	                  "total_completions", "total_comments",
	                  "very_positive", "positive", "neutral", "negative", "very_negative"]
	    for col in count_cols:
	        if col in base.columns:
	            base[col] = base[col].fillna(0).astype(int)

	    base = base.sort_values(["learning_path_id", "lesson_order"])
	    return base.reset_index(drop=True)


	def merge_method_wide(
	    method_df: pd.DataFrame,
	    video_df: pd.DataFrame,
	    sentiment_df: pd.DataFrame,
	    config: dict,
	) -> pd.DataFrame:
	    """
	    Left-join video and sentiment metrics onto the method-wide completion frame.

	    ``method_df`` has its ``content_id`` column renamed to
	    ``lesson_content_id`` and serves as the base. ``video_df`` is joined on
	    (learning_path_id, lesson_content_id) and ``sentiment_df`` on
	    (learning_path_id, lesson_order). A metric frame is skipped when it is
	    empty or when either side lacks one of the join-key columns, instead of
	    raising ``KeyError``.

	    Args:
	        method_df: Base frame; must contain ``learning_path_id``.
	        video_df: Video start/completion metrics.
	        sentiment_df: Comment sentiment metrics.
	        config: Learning-paths config passed to ``label_for_path``.

	    Returns:
	        The merged frame with an added ``path_label`` column, count columns
	        null-filled with 0 and cast to int, and a fresh 0..n-1 index. Rows are
	        sorted by ``method_lesson_number`` when that column is present
	        (the column is expected to come from ``method_df``; it is not created
	        here). Returns an empty frame if ``method_df`` is empty.
	    """
	    if method_df.empty:
	        return pd.DataFrame()

	    base = method_df.rename(columns={"content_id": "lesson_content_id"}).copy()

	    # (frame, columns to rename first, join keys, optional metric columns)
	    sources = (
	        (video_df, {"content_id": "lesson_content_id"},
	         ["learning_path_id", "lesson_content_id"],
	         ["total_starts", "total_completions", "video_completion_rate"]),
	        (sentiment_df, {},
	         ["learning_path_id", "lesson_order"],
	         ["total_comments", "very_positive", "positive", "neutral",
	          "negative", "very_negative", "avg_sentiment_score"]),
	    )
	    for frame, renames, keys, metric_cols in sources:
	        if frame.empty or any(src not in frame.columns for src in renames):
	            continue
	        frame = frame.rename(columns=renames)
	        # Both sides need every join key, otherwise merge raises KeyError.
	        if any(k not in frame.columns or k not in base.columns for k in keys):
	            continue
	        wanted = keys + [c for c in metric_cols if c in frame.columns]
	        base = base.merge(frame[wanted], on=keys, how="left")

	    # Add path label
	    base["path_label"] = base["learning_path_id"].apply(
	        lambda pid: label_for_path(pid, config)
	    )

	    # Counts: no matching metric row means zero, not unknown.
	    for col in ["students_completed", "total_starts", "total_completions",
	                "total_comments", "very_positive", "positive", "neutral",
	                "negative", "very_negative"]:
	        if col in base.columns:
	            base[col] = base[col].fillna(0).astype(int)

	    if "method_lesson_number" in base.columns:
	        base = base.sort_values("method_lesson_number")
	    return base.reset_index(drop=True)


	# ── Analysis helpers ───────────────────────────────────────────────────────────

	def find_top_dropoffs(df: pd.DataFrame, n: int = 5,
	                      rate_col: str = "completion_rate",
	                      order_col: str = "lesson_order") -> pd.DataFrame:
	    """
	    Find the lessons where the rate fell the most versus the preceding lesson.

	    Rows are ordered by (learning_path_id, ``order_col``) and each row's
	    ``rate_col`` is compared with the previous row of the same
	    learning_path_id. Only strictly positive drops are kept, and the ``n``
	    largest are returned with a fresh index.

	    Returns an empty frame when ``df`` is empty or has no ``rate_col`` column.
	    """
	    if df.empty or rate_col not in df.columns:
	        return pd.DataFrame()

	    ranked = df.sort_values(["learning_path_id", order_col])
	    previous = ranked.groupby("learning_path_id")[rate_col].shift(1)
	    ranked = ranked.assign(prev_rate=previous)
	    ranked = ranked.assign(dropoff=lambda frame: frame["prev_rate"] - frame[rate_col])

	    # The first lesson of each path has no predecessor, hence a NaN drop.
	    is_drop = ranked["dropoff"].notna() & ranked["dropoff"].gt(0)
	    ranked = ranked[is_drop]

	    report_cols = []
	    for name in ("learning_path_id", order_col, "content_title",
	                 "prev_rate", rate_col, "dropoff"):
	        if name in ranked.columns:
	            report_cols.append(name)

	    worst = ranked.nlargest(n, "dropoff")
	    return worst[report_cols].reset_index(drop=True)


	def get_overview_kpis(merged: pd.DataFrame) -> dict:
	    """
	    Return a dict of high-level KPI values from the merged metrics frame.

	    Keys and how they are derived:
	        total_students: max of ``denominator_students`` (0 if the column is
	            missing or holds no non-null value).
	        avg_completion_pct: mean of ``completion_rate`` multiplied by 100.
	        avg_sentiment_score: mean of ``avg_sentiment_score``.
	        total_comments: sum of ``total_comments``.
	        n_paths: number of distinct ``learning_path_id`` values.
	        n_lessons: number of rows in ``merged``.

	    A KPI whose source column is absent falls back to 0 / 0.0. An empty
	    ``merged`` frame yields an empty dict.
	    """
	    if merged.empty:
	        return {}

	    columns = merged.columns

	    total_students = 0
	    if "denominator_students" in columns:
	        max_students = merged["denominator_students"].max()
	        # max() of an all-null column is NaN, and int(NaN) raises ValueError.
	        if not pd.isna(max_students):
	            total_students = int(max_students)

	    avg_completion = float(merged["completion_rate"].mean()) if "completion_rate" in columns else 0.0
	    avg_sentiment = float(merged["avg_sentiment_score"].mean()) if "avg_sentiment_score" in columns else 0.0
	    total_comments = int(merged["total_comments"].sum()) if "total_comments" in columns else 0
	    n_paths = merged["learning_path_id"].nunique() if "learning_path_id" in columns else 0

	    return {
	        "total_students": total_students,
	        "avg_completion_pct": avg_completion * 100,
	        "avg_sentiment_score": avg_sentiment,
	        "total_comments": total_comments,
	        "n_paths": n_paths,
	        "n_lessons": len(merged),
	    }


	def filter_by_paths(df: pd.DataFrame,
	                    path_ids: Optional[List[int]]) -> pd.DataFrame:
	    """
	    Keep only the rows whose ``learning_path_id`` is in ``path_ids``.

	    ``df`` is returned as-is (same object) when ``path_ids`` is None or empty,
	    when ``df`` is empty, or when it has no ``learning_path_id`` column.
	    Otherwise the filtered rows are returned with a fresh 0..n-1 index.
	    """
	    nothing_to_filter = (
	        not path_ids
	        or df.empty
	        or "learning_path_id" not in df.columns
	    )
	    if nothing_to_filter:
	        return df

	    selected = df["learning_path_id"].isin(path_ids)
	    return df[selected].reset_index(drop=True)


	def short_title(title: Optional[str], max_len: int = 35) -> str:
	    """
	    Truncate a content title for display in labels.

	    Args:
	        title: The raw title; may be None, empty, or a pandas missing value.
	        max_len: Maximum number of title characters kept before the ellipsis.

	    Returns:
	        ``"—"`` when the title is missing, empty, or whitespace-only.
	        Otherwise the stripped title, cut to ``max_len`` characters and
	        followed by ``"…"`` if it was longer than ``max_len``.
	    """
	    # pd.NA is checked by identity first because its truth value is
	    # ambiguous: `not pd.NA` raises TypeError.
	    if title is None or title is pd.NA or not title or pd.isna(title):
	        return "—"
	    t = str(title).strip()
	    if not t:
	        # Whitespace-only titles would otherwise render as an empty label.
	        return "—"
	    return t if len(t) <= max_len else t[:max_len] + "…"