Dimensionality reduction transforms high-dimensional cluster embeddings (typically 1536 dimensions) into 2D coordinates for visualization. This allows users to explore clusters spatially, where proximity indicates semantic similarity. Kura uses UMAP (Uniform Manifold Approximation and Projection), which preserves both local and global structure better than alternatives like t-SNE or PCA.
The output extends the base Cluster with 2D coordinates:
class ProjectedCluster(Cluster):
    # A Cluster augmented with the 2D position assigned by the
    # dimensionality-reduction step; proximity in this plane reflects
    # semantic similarity between clusters.
    x_coord: float  # X position in 2D space
    y_coord: float  # Y position in 2D space
    level: int  # Hierarchy depth (0 = root)
# Convert cluster name + description to texttexts_to_embed = [str(c) for c in clusters]# Embed with the specified modelcluster_embeddings = await self.embedding_model.embed(texts_to_embed)
This creates a high-dimensional representation of each cluster (typically 1536 dimensions for OpenAI).
# Split the projected clusters by hierarchy depth and report the counts.
root_clusters = [cluster for cluster in projected_clusters if cluster.level == 0]
child_clusters = [cluster for cluster in projected_clusters if cluster.level == 1]
print(f"Root clusters: {len(root_clusters)}")
print(f"Child clusters: {len(child_clusters)}")
import matplotlib.pyplot as plt

# Extract coordinates and hierarchy levels
x = [c.x_coord for c in projected_clusters]
y = [c.y_coord for c in projected_clusters]
levels = [c.level for c in projected_clusters]

# Plot. Pass the raw levels plus a colormap instead of pre-computed RGBA
# values: that way the scatter carries a ScalarMappable (which
# plt.colorbar() requires), and we avoid dividing by max(levels), which
# is zero when every cluster is a root.
fig, ax = plt.subplots(figsize=(12, 8))
scatter = ax.scatter(x, y, c=levels, cmap='viridis', s=100, alpha=0.6)

# Label root clusters only, to keep the figure readable
root_clusters = [c for c in projected_clusters if c.level == 0]
for cluster in root_clusters:
    ax.annotate(
        cluster.name[:30],  # Truncate long names
        (cluster.x_coord, cluster.y_coord),
        fontsize=8,
        alpha=0.7
    )

ax.set_xlabel("UMAP Dimension 1")
ax.set_ylabel("UMAP Dimension 2")
ax.set_title("Cluster Visualization")
plt.colorbar(scatter, label="Hierarchy Level")
plt.tight_layout()
plt.savefig("cluster_map.png", dpi=300)
import plotly.graph_objects as go

# Prepare data
x = [c.x_coord for c in projected_clusters]
y = [c.y_coord for c in projected_clusters]
names = [c.name for c in projected_clusters]
levels = [c.level for c in projected_clusters]
counts = [c.count for c in projected_clusters]

# Create hover text
hover_text = [
    f"<b>{c.name}</b><br>"
    + f"Conversations: {c.count}<br>"
    + f"Level: {c.level}<br>"
    + f"Description: {c.description[:100]}..."
    for c in projected_clusters
]

# Create plot. The original called np.sqrt without importing numpy;
# ** 0.5 is numerically identical and keeps this snippet self-contained.
fig = go.Figure(data=[
    go.Scatter(
        x=x,
        y=y,
        mode='markers',
        marker=dict(
            size=[count ** 0.5 * 2 for count in counts],  # Size by conversation count
            color=levels,  # Color by hierarchy level
            colorscale='Viridis',
            showscale=True,
            colorbar=dict(title="Hierarchy Level"),
            line=dict(width=1, color='white')
        ),
        text=names,
        hovertext=hover_text,
        hoverinfo='text'
    )
])
fig.update_layout(
    title="Interactive Cluster Map",
    xaxis_title="UMAP Dimension 1",
    yaxis_title="UMAP Dimension 2",
    hovermode='closest',
    width=1200,
    height=800
)
fig.write_html("cluster_map.html")
fig.show()
# Tighter clusters (emphasizes local structure)dim_model = HDBUMAP(min_dist=0.0)# More spread out (emphasizes global structure)dim_model = HDBUMAP(min_dist=0.5)
Visual comparison of min_dist values
min_dist=0.0: Clusters are very tight, points within clusters overlap
Project the final hierarchical clusters, not base clusters:
# Base clusters (too many to visualize clearly)base_clusters = await generate_base_clusters_from_conversation_summaries(...)# Meta-clusters (reduced to manageable number)meta_clusters = await reduce_clusters_from_base_clusters(...)# Project meta-clusters for visualizationprojected = await reduce_dimensionality_from_clusters( clusters=meta_clusters, # Use meta-clusters, not base model=dim_model)
# Start the view with only the top-level clusters; descend into a
# cluster's children when the user clicks on it.
root_clusters = [cluster for cluster in projected_clusters if cluster.level == 0]


def get_children(cluster_id: str):
    """Return the projected clusters whose parent_id matches cluster_id."""
    return [
        candidate
        for candidate in projected_clusters
        if candidate.parent_id == cluster_id
    ]
import matplotlib.pyplot as plt
import numpy as np

# Scale marker area with the square root of the conversation count so
# very large clusters don't visually drown out small ones.
marker_sizes = [np.sqrt(cluster.count) * 10 for cluster in projected_clusters]
xs = [cluster.x_coord for cluster in projected_clusters]
ys = [cluster.y_coord for cluster in projected_clusters]
plt.scatter(xs, ys, s=marker_sizes, alpha=0.6)
import numpy as np
import matplotlib.pyplot as plt

# Color each cluster by the average concerning_score of its conversations.
# Guard against clusters with no matching summaries: np.mean([]) returns
# NaN (with a RuntimeWarning), which would break the colormap.
concern_scores = []
for cluster in projected_clusters:
    scores = [s.concerning_score for s in summaries if s.chat_id in cluster.chat_ids]
    concern_scores.append(float(np.mean(scores)) if scores else 0.0)

plt.scatter(
    [c.x_coord for c in projected_clusters],
    [c.y_coord for c in projected_clusters],
    c=concern_scores,
    cmap='RdYlGn_r',  # Red = high concern, Green = low
    s=100
)
plt.colorbar(label="Average Concerning Score")