root_clusters = clusters.copy()while len(root_clusters) > max_clusters: # Embed root clusters # Cluster them # Generate parent clusters for each group # Update root_clusters to be the new parents new_current_level = await model.reduce_clusters(root_clusters) root_clusters = [c for c in new_current_level if c.parent_id is None] all_clusters.extend(new_current_level)
This continues until the desired number of root clusters is reached.
The LLM’s response is validated with fuzzy matching (90% similarity threshold):
from thefuzz import fuzz


def find_fuzzy_match(llm_response, candidate_clusters, threshold=90):
    """Return the first candidate that is a close match for the LLM response.

    Illustrative wrapper around the validation loop so the snippet is valid
    on its own; the function name is not part of the library API.

    Args:
        llm_response: The cluster name returned by the LLM.
        candidate_clusters: The candidate cluster names to compare against.
        threshold: Minimum ``fuzz.ratio`` score (0-100) that counts as a match.

    Returns:
        The first candidate scoring at least ``threshold``, or ``None`` when
        no candidate is close enough.
    """
    # If LLM returns a close but not exact match, accept it
    for candidate in candidate_clusters:
        similarity = fuzz.ratio(llm_response, candidate)
        if similarity >= threshold:
            return candidate  # Accept the match
    # NOTE(review): the original excerpt does not show what happens when
    # nothing matches; returning None here is an assumption.
    return None
This handles small variations like:
“Troubleshoot programming errors” vs “Troubleshoot Programming Errors”
“Debug frontend frameworks” vs “Debug front-end frameworks”
From kura/meta_cluster.py:386-444, parent clusters are named:
async def rename_cluster_group(self, clusters: list[Cluster]) -> list[Cluster]:
    """Create a named parent cluster for a group of clusters.

    Abridged excerpt: the construction of ``prompt`` and the fields copied
    onto each child (shown as ``...``) are elided.

    Args:
        clusters: The group of clusters to place under one new parent.

    Returns:
        The new parent cluster followed by one child cluster per input
        cluster, each with ``parent_id`` set to the parent's id.
    """
    # Prompt: "Summarize this group of cluster names into a short,
    # precise description and name"
    resp = await self.client.chat.completions.create(
        messages=[{"role": "user", "content": prompt}],
        response_model=GeneratedCluster,
        context={"clusters": clusters},
    )

    # Create parent cluster; it owns the chat ids of every cluster in the group.
    parent = Cluster(
        name=resp.name,
        description=resp.summary,
        slug=resp.slug,
        chat_ids=[chat_id for c in clusters for chat_id in c.chat_ids],
        parent_id=None,
    )

    # Update children to point to parent
    children = [Cluster(..., parent_id=parent.id) for cluster in clusters]

    return [parent, *children]
from kura.meta_cluster import reduce_clusters_from_base_clusters, MetaClusterModelfrom kura.checkpoints import JSONLCheckpointManagerfrom rich.console import Console# Configure meta-clusteringconsole = Console()meta_model = MetaClusterModel( model="openai/gpt-4o-mini", max_concurrent_requests=50, max_clusters=8, # Reduce to 8 root clusters console=console)checkpoint_mgr = JSONLCheckpointManager("./checkpoints")# Reduce base clusters to hierarchymeta_clusters = await reduce_clusters_from_base_clusters( clusters=base_clusters, # e.g., 100 base clusters model=meta_model, checkpoint_manager=checkpoint_mgr)print(f"Total clusters: {len(meta_clusters)}")print(f"Root clusters: {len([c for c in meta_clusters if c.parent_id is None])}")# Print hierarchyroot_clusters = [c for c in meta_clusters if c.parent_id is None]for root in root_clusters: print(f"\n{root.name} ({root.count} conversations)") children = [c for c in meta_clusters if c.parent_id == root.id] for child in children: print(f" └─ {child.name} ({child.count})")
def get_children(cluster_id: str, all_clusters: list[Cluster]) -> list[Cluster]:
    """Return the clusters whose ``parent_id`` equals ``cluster_id``."""
    return [c for c in all_clusters if c.parent_id == cluster_id]


def get_root_clusters(all_clusters: list[Cluster]) -> list[Cluster]:
    """Return the clusters that have no parent (``parent_id is None``)."""
    return [c for c in all_clusters if c.parent_id is None]


def get_leaf_clusters(all_clusters: list[Cluster]) -> list[Cluster]:
    """Return the clusters that no other cluster names as its parent."""
    # Use the same "has a parent" test as get_root_clusters (`is not None`)
    # so the two helpers agree on what counts as parentless.
    parent_ids = {c.parent_id for c in all_clusters if c.parent_id is not None}
    return [c for c in all_clusters if c.id not in parent_ids]


# Get all conversations under a cluster (including descendants)
def get_all_conversations(cluster: Cluster, all_clusters: list[Cluster]) -> set[str]:
    """Collect the chat ids of ``cluster`` and of every cluster beneath it.

    Args:
        cluster: The cluster at the top of the subtree to collect from.
        all_clusters: Every cluster in the hierarchy.

    Returns:
        The union of ``chat_ids`` over ``cluster`` and all its descendants.
    """
    # Index children by parent once instead of rescanning the whole list
    # for every node visited.
    children_by_parent: dict[str, list[Cluster]] = {}
    for c in all_clusters:
        if c.parent_id is not None:
            children_by_parent.setdefault(c.parent_id, []).append(c)

    chat_ids: set[str] = set()
    visited: set[str] = set()  # guards against malformed (cyclic) parent links
    pending = [cluster]
    while pending:
        node = pending.pop()
        if node.id in visited:
            continue
        visited.add(node.id)
        chat_ids.update(node.chat_ids)
        pending.extend(children_by_parent.get(node.id, []))
    return chat_ids
Meta-clustering quality depends on base cluster quality:
# Use descriptive base clustering
base_clustering_model = ClusterDescriptionModel(
    model="openai/gpt-4o",  # Higher quality than gpt-4o-mini
    temperature=0.2,
)
# Test with different max_clusters valuesfor max_c in [5, 10, 15, 20]: meta_model = MetaClusterModel(max_clusters=max_c) meta_clusters = await reduce_clusters_from_base_clusters( clusters=base_clusters[:50], # Subset for testing model=meta_model ) print(f"max_clusters={max_c}: {len([c for c in meta_clusters if c.parent_id is None])} roots")
def get_depth(cluster: Cluster, all_clusters: list[Cluster]) -> int:
    """Return how many parent links separate ``cluster`` from its root.

    Args:
        cluster: The cluster whose depth is wanted.
        all_clusters: Every cluster in the hierarchy, used to resolve parents.

    Returns:
        0 for a root cluster (``parent_id is None``), otherwise the number
        of ancestors above ``cluster``.

    Raises:
        ValueError: If a ``parent_id`` does not match any cluster, or the
            parent links form a cycle.
    """
    by_id = {c.id: c for c in all_clusters}
    depth = 0
    current = cluster
    while current.parent_id is not None:
        parent = by_id.get(current.parent_id)
        if parent is None:
            # A bare next() here would raise StopIteration, which turns into
            # an opaque RuntimeError when called from a generator expression.
            raise ValueError(
                f"Cluster {current.id!r} references unknown parent "
                f"{current.parent_id!r}"
            )
        depth += 1
        if depth > len(by_id):
            # More hops than clusters means the parent links loop.
            raise ValueError(f"Cycle detected in parents of cluster {cluster.id!r}")
        current = parent
    return depth


# `meta_clusters` is the hierarchy produced by the meta-clustering step.
# default=0 keeps an empty hierarchy from raising.
max_depth = max((get_depth(c, meta_clusters) for c in meta_clusters), default=0)
print(f"Hierarchy depth: {max_depth}")