# Install required packages (uncomment if needed)
# !pip install pandas numpy matplotlib seaborn scikit-learn umap-learn statsmodels scipy

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.cm as cm
import seaborn as sns
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import NMF
from umap import UMAP
from statsmodels.nonparametric.smoothers_lowess import lowess
from scipy.spatial.distance import cosine
import warnings
# Silence every warning for the rest of the session to keep notebook output clean.
# NOTE(review): this also hides deprecation warnings raised by the libraries used below.
warnings.filterwarnings('ignore')

# Global plotting defaults: figure resolution, default figure size, seaborn grid theme.
plt.rcParams['figure.dpi'] = 120
plt.rcParams['figure.figsize'] = (14, 6)
sns.set_style('whitegrid')

# Load the dataset
# Adjust the file path and format as needed (CSV, JSON, etc.)
# Columns read by this notebook: 'Year', 'Title', 'Abstract' (capitalised, as used below)

DATA_PATH = "IEEE VIS papers 1990-2024 - Main dataset.csv"  # <-- ADJUST THIS PATH

try:
    df = pd.read_csv(DATA_PATH)
except FileNotFoundError:
    # Fallback: build a small synthetic corpus with the same three columns so the
    # rest of the workflow can still be exercised end to end.
    print(f"File not found at {DATA_PATH}. Please update DATA_PATH.")
    print("Creating synthetic demo data for workflow testing...")
    # Generate synthetic data for testing the workflow
    np.random.seed(42)  # fixed seed so the demo years are reproducible
    n_papers = 500
    years = np.random.randint(1990, 2025, n_papers)  # upper bound is exclusive: 1990..2024
    sample_topics = [
        "We present a novel graph visualization technique for exploring network structures using node-link diagrams and adjacency matrices.",
        "This paper describes an interactive volume rendering method for medical imaging data using ray casting and transfer functions.",
        "We propose a machine learning approach for automatic classification of visualization types in scientific publications.",
        "A new method for visual analytics of temporal event sequences in electronic health records is presented.",
        "We introduce a collaborative immersive analytics system using virtual reality for data exploration and sensemaking.",
        "This work presents a technique for uncertainty visualization in ensemble weather forecast data using probabilistic models.",
        "We describe a text visualization system for topic modeling and document corpus exploration using natural language processing.",
        "A scalable parallel rendering algorithm for large-scale scientific simulation data is proposed for high performance computing.",
        "We present a user study evaluating the effectiveness of color encoding in multivariate data visualization dashboards.",
        "This paper introduces a deep learning method for automatic view recommendation in exploratory data analysis workflows."
    ]
    # Cycle through the ten template abstracts, appending a per-paper suffix so texts differ.
    abstracts = [sample_topics[i % len(sample_topics)] + f" Extended discussion variant {i}." for i in range(n_papers)]
    # Title = paper number plus the first 60 characters of the abstract's first sentence.
    titles = [f"Paper {i}: " + a.split('.')[0][:60] for i, a in enumerate(abstracts)]
    df = pd.DataFrame({'Year': years, 'Title': titles, 'Abstract': abstracts})

print(f"Dataset: {len(df)} papers, years {df['Year'].min()}–{df['Year'].max()}")
df.head()

Dataset: 3877 papers, years 1990–2024

# Prepare text: combine title and abstract for richer representation
# Missing titles/abstracts are replaced by '' before concatenation.
df['text'] = df['Title'].fillna('') + '. ' + df['Abstract'].fillna('')
# NOTE(review): after the fillna calls above, 'text' should never be NaN, so this dropna
# is not expected to remove rows (a paper with neither title nor abstract keeps '. ').
df = df.dropna(subset=['text']).reset_index(drop=True)
documents = df['text'].tolist()
print(f"Prepared {len(documents)} documents for analysis.")

Prepared 3877 documents for analysis.

# ═══════════════════════════════════════════════════════════
# SPECIFICATION: spec_model (adjust in Loop L1 if needed)
# Single place for the topic-model settings; the TF-IDF vectoriser (T1), the NMF
# model (T2) and the top-term extraction (T3) all read their parameters from here.
# ═══════════════════════════════════════════════════════════
spec_model = {
    'n_topics': 12,              # number of topics for NMF
    'max_df': 0.85,              # ignore terms appearing in >85% of docs
    'min_df': 5,                 # ignore terms appearing in <5 docs
    'ngram_range': (1, 2),       # unigrams and bigrams
    'max_features': 5000,        # vocabulary size limit
    'top_n_words': 10,           # words per topic representation
    'nmf_init': 'nndsvda',       # NMF initialisation method
    'nmf_max_iter': 500,         # NMF max iterations
}
# ═══════════════════════════════════════════════════════════

# T1: characterise — turn the prepared documents into a TF-IDF matrix

print("Computing TF-IDF vectors...")

# All vocabulary limits come from spec_model; English stop words are removed.
tfidf_vectorizer = TfidfVectorizer(
    stop_words='english',
    ngram_range=spec_model['ngram_range'],
    max_features=spec_model['max_features'],
    min_df=spec_model['min_df'],
    max_df=spec_model['max_df'],
)

# Learn the vocabulary and weight the corpus in one pass, then keep the
# term strings so topic vectors can be mapped back to words later.
tfidf_matrix = tfidf_vectorizer.fit_transform(documents)
feature_names = tfidf_vectorizer.get_feature_names_out()

print(f"TF-IDF matrix shape: {tfidf_matrix.shape} (documents × terms)")
print(f"Vocabulary size: {len(feature_names)} terms")

Computing TF-IDF vectors...
TF-IDF matrix shape: (3877, 5000) (documents × terms)
Vocabulary size: 5000 terms

# T2: build-model — factorise the TF-IDF matrix with NMF

print(f"Fitting NMF with {spec_model['n_topics']} topics...")

# Fixed random_state keeps the factorisation reproducible between runs.
nmf_model = NMF(
    random_state=42,
    n_components=spec_model['n_topics'],
    max_iter=spec_model['nmf_max_iter'],
    init=spec_model['nmf_init'],
)

# Fitting returns W (documents × topics); H (topics × terms) is stored on the model.
doc_topic_matrix = nmf_model.fit_transform(tfidf_matrix)
topic_term_matrix = nmf_model.components_

# Reconstruction error reported by the fitted model.
reconstruction_error = nmf_model.reconstruction_err_
print(f"NMF reconstruction error: {reconstruction_error:.4f}")
print(f"Document-topic matrix shape: {doc_topic_matrix.shape}")

Fitting NMF with 12 topics...
NMF reconstruction error: 59.7422
Document-topic matrix shape: (3877, 12)

# T3: characterise — extract top terms per topic and assign papers

def get_top_terms(topic_term_matrix, feature_names, n_top=10):
    """Return, for every topic row, its ``n_top`` heaviest terms.

    The result maps the row index of ``topic_term_matrix`` to a list of
    ``(term, weight)`` pairs ordered from highest to lowest weight.
    """
    def _ranked_terms(weights):
        # Ascending argsort, reversed, then truncated to the strongest n_top.
        strongest = weights.argsort()[::-1][:n_top]
        return [(feature_names[pos], weights[pos]) for pos in strongest]

    return {
        row_number: _ranked_terms(row)
        for row_number, row in enumerate(topic_term_matrix)
    }

# Rank the vocabulary inside every topic.
topics_terms = get_top_terms(
    topic_term_matrix,
    feature_names,
    n_top=spec_model['top_n_words'],
)

# Hard assignment: a paper belongs to the topic with its largest NMF weight,
# and that weight is stored next to it.
df['topic'] = doc_topic_matrix.argmax(axis=1)
df['topic_weight'] = doc_topic_matrix.max(axis=1)

# Short display labels built from the leading terms; later plots and reports reuse them.
topic_labels_map = {}
print("\nDiscovered Topics:")
print("=" * 70)
for topic_id, terms in topics_terms.items():
    words = [word for word, _weight in terms]
    # Label uses the first five terms, the printed overview the first eight.
    topic_labels_map[topic_id] = f"T{topic_id}: {', '.join(words[:5])}"
    full_terms = ', '.join(words[:8])
    count = (df['topic'] == topic_id).sum()
    print(f"  Topic {topic_id:2d} ({count:4d} papers): {full_terms}")

print(f"\nTotal topics: {spec_model['n_topics']}")

Discovered Topics:
======================================================================
  Topic  0 ( 320 papers): visual, analytics, visual analytics, analysis, users, user, support, analytic
  Topic  1 ( 349 papers): volume, rendering, volume rendering, transfer, ray, image, volumetric, hardware
  Topic  2 ( 179 papers): flow, flow visualization, fluid, vortex, visualization, unsteady, flows, particle
  Topic  3 ( 791 papers): visualization, design, visualizations, information, information visualization, study, research, user
  Topic  4 ( 708 papers): data, sets, dimensional, analysis, data sets, multivariate, exploration, high dimensional
  Topic  5 ( 227 papers): graph, layout, graphs, node, tree, nodes, layouts, trees
  Topic  6 ( 134 papers): network, networks, social, traffic, challenge, social networks, social network, neural
  Topic  7 ( 249 papers): time, time series, series, time varying, temporal, varying, real time, real
  Topic  8 ( 259 papers): model, models, learning, machine, deep, machine learning, simulation, deep learning
  Topic  9 (  85 papers): uncertainty, uncertainty visualization, ensemble, uncertain, plots, uncertainty visualizations, visualizations, statistical
  Topic 10 ( 180 papers): vector, fields, tensor, field, vector fields, vector field, scalar, 3d
  Topic 11 ( 396 papers): surface, surfaces, mesh, algorithm, meshes, method, shape, simplification

Total topics: 12

# Topic size overview: papers per dominant topic, largest first
topic_sizes = df.groupby('topic').size().sort_values(ascending=False)

# Assemble the report first, then emit it in one go.
summary_lines = [
    "\nTopic Size Distribution:",
    topic_sizes.to_string(),
    f"\nSmallest topic: {topic_sizes.min()} papers",
    f"Largest topic: {topic_sizes.max()} papers",
    f"Median topic size: {topic_sizes.median():.0f} papers",
]
print("\n".join(summary_lines))

Topic Size Distribution:
topic
3     791
4     708
11    396
1     349
0     320
8     259
7     249
5     227
10    180
2     179
6     134
9      85

Smallest topic: 85 papers
Largest topic: 791 papers
Median topic size: 254 papers

# T4: contextualise — project TF-IDF vectors to 2D using UMAP
# The 2D coordinates are only used for the document map in T5; the topic model
# itself is fitted on the full TF-IDF matrix.

print("Computing UMAP 2D projection (this may take a moment)...")
umap_2d = UMAP(
    n_neighbors=15,
    n_components=2,
    min_dist=0.1,
    metric='cosine',
    random_state=42  # fixed seed for a repeatable layout
)
coords_2d = umap_2d.fit_transform(tfidf_matrix)
# Store the two embedding axes per paper so they can be plotted from df directly.
df['x'] = coords_2d[:, 0]
df['y'] = coords_2d[:, 1]
print("Done.")

Computing UMAP 2D projection (this may take a moment)...
Done.

# T5: visualise — document scatterplot coloured by topic + topic size bar chart

fig, axes = plt.subplots(1, 2, figsize=(18, 7))

n_topics = spec_model['n_topics']
# plt.get_cmap is used instead of matplotlib.cm.get_cmap: the latter was deprecated
# in Matplotlib 3.7 and removed in 3.9. Both take the same (name, lut) arguments, so
# the colormap is still 'tab20' resampled to n_topics entries.
colors_cmap = plt.get_cmap('tab20', n_topics)

# Left: Document map — one scatter call per topic so each gets its own legend entry
ax = axes[0]
for t in range(n_topics):
    mask = df['topic'] == t
    ax.scatter(df.loc[mask, 'x'], df.loc[mask, 'y'],
               c=[colors_cmap(t)], s=12, alpha=0.6, label=f'T{t}')

ax.set_title('Document Map (UMAP projection, coloured by topic)', fontsize=12)
ax.set_xlabel('UMAP-1')
ax.set_ylabel('UMAP-2')
ax.legend(bbox_to_anchor=(1.02, 1), loc='upper left', fontsize=7, ncol=1)

# Right: Topic sizes with labels (ascending sort puts the largest bar at the top)
ax2 = axes[1]
topic_counts_sorted = df.groupby('topic').size().sort_values(ascending=True)
bars = ax2.barh(range(len(topic_counts_sorted)), topic_counts_sorted.values,
                color=[colors_cmap(t) for t in topic_counts_sorted.index])
ax2.set_yticks(range(len(topic_counts_sorted)))
# Labels are truncated to 45 characters to keep the axis readable.
ylabels = [topic_labels_map.get(t, f'T{t}')[:45] for t in topic_counts_sorted.index]
ax2.set_yticklabels(ylabels, fontsize=8)
ax2.set_xlabel('Number of papers')
ax2.set_title('Topic Sizes', fontsize=12)

plt.tight_layout()
plt.show()

# Additional: Top terms bar chart per topic
# One small horizontal bar chart per topic, laid out on a grid of n_cols columns.

n_cols = 4
n_rows = int(np.ceil(n_topics / n_cols))  # enough rows to hold every topic
fig, axes = plt.subplots(n_rows, n_cols, figsize=(16, 2.5 * n_rows))
axes_flat = axes.flatten()

for topic_id in range(n_topics):
    ax = axes_flat[topic_id]
    terms = topics_terms[topic_id][:8]
    # Reverse so the heaviest term ends up at the top of the horizontal bar chart.
    words = [t[0] for t in terms][::-1]
    weights = [t[1] for t in terms][::-1]
    ax.barh(words, weights, color=colors_cmap(topic_id), alpha=0.8)
    ax.set_title(f'Topic {topic_id}', fontsize=9, fontweight='bold')
    ax.tick_params(axis='y', labelsize=7)
    ax.tick_params(axis='x', labelsize=7)

# Hide empty subplots (grid cells beyond the last topic)
for j in range(n_topics, len(axes_flat)):
    axes_flat[j].set_visible(False)

fig.suptitle('Top Terms per Topic (NMF weights)', fontsize=13, y=1.01)
plt.tight_layout()
plt.show()

# ═══════════════════════════════════════════════════════════
# T6/T7: ASSESSMENT & LOOP L1 (Topic Refinement)
# ═══════════════════════════════════════════════════════════
# Manual gate: nothing is re-run automatically. The flag only selects which
# message is printed; refinement means editing spec_model and re-running cells.
# Set this flag based on your assessment:
topics_satisfactory = True  # Change to False to trigger refinement

if not topics_satisfactory:
    # ─── Suggestions for refinement ───
    # 1. Change n_topics: increase for finer granularity, decrease for coarser
    #    spec_model['n_topics'] = 15  # or 8, 10, 20...
    #
    # 2. Adjust vocabulary: change max_df, min_df, max_features
    #    spec_model['max_df'] = 0.8
    #    spec_model['min_df'] = 3
    #
    # 3. Add domain-specific stop words:
    #    Add terms like 'paper', 'method', 'approach', 'propose' to stop words
    #
    # After adjusting, re-run from the spec_model cell and T1.
    print("Please adjust spec_model parameters and re-run from T1.")
else:
    print("✓ Topics accepted. Proceeding to Phase 2.")

✓ Topics accepted. Proceeding to Phase 2.

# T8: characterise — aggregate topic membership per year
# Using soft assignments (NMF weights) for more nuanced temporal profiles
# NOTE: relies on n_topics, which is assigned in the T5 visualisation cell.

# Option A: Hard assignment (dominant topic per paper)
# Only consumed downstream when spec_smoothing['use_proportions'] is False.
topic_year_counts = df.groupby(['Year', 'topic']).size().unstack(fill_value=0)

# Option B: Soft assignment (sum of NMF weights per year)
df_weights = pd.DataFrame(doc_topic_matrix, columns=range(n_topics))
df_weights['Year'] = df['Year'].values  # rows of doc_topic_matrix line up with rows of df
topic_year_soft = df_weights.groupby('Year').sum()

# Use soft assignments for smoother, more representative profiles
topic_year_data = topic_year_soft.copy()

# Compute proportions (share of each topic per year): each row is divided by its own sum
topic_year_props = topic_year_data.div(topic_year_data.sum(axis=1), axis=0)

# Ensure all years in range are present (years without papers become all-zero rows)
all_years = range(df['Year'].min(), df['Year'].max() + 1)
topic_year_props = topic_year_props.reindex(all_years, fill_value=0)
topic_year_counts = topic_year_counts.reindex(all_years, fill_value=0)

print(f"Temporal matrix: {topic_year_props.shape[0]} years × {topic_year_props.shape[1]} topics")
topic_year_props.head()

Temporal matrix: 35 years × 12 topics

# ═══════════════════════════════════════════════════════════
# SPECIFICATION: spec_smoothing (adjust in Loop L2 if needed)
# Read by smooth_series (T9) and by the choice between proportions and counts.
# ═══════════════════════════════════════════════════════════
spec_smoothing = {
    'method': 'loess',          # 'moving_avg', 'loess', or 'gaussian'
    'window_size': 5,           # for moving_avg / gaussian: window in years
    'loess_frac': 0.15,         # for loess: fraction of data for local fit
    'use_proportions': True,    # True = show share; False = show absolute counts
}
# ═══════════════════════════════════════════════════════════

# T9: characterise — apply smoothing to get trends

# Pick the yearly table to smooth: soft-assignment shares, or hard-assignment paper counts.
data_for_smoothing = topic_year_props if spec_smoothing['use_proportions'] else topic_year_counts

def smooth_series(years, values, spec):
    """Smooth one yearly series with the method named in ``spec['method']``.

    Parameters
    ----------
    years : array-like
        Position of each observation on the time axis.
    values : array-like
        Observations, one per entry of ``years``.
    spec : dict
        ``'method'`` selects the smoother: ``'moving_avg'`` (centred rolling
        mean over ``spec['window_size']`` points), ``'loess'`` (LOWESS with
        ``spec['loess_frac']``) or ``'gaussian'`` (Gaussian filter with
        ``sigma = spec['window_size'] / 2``).

    Returns
    -------
    The smoothed values, one per input observation and in input order. Any
    other method name returns ``values`` unchanged.
    """
    method = spec['method']

    if method == 'moving_avg':
        s = pd.Series(values, index=years)
        # min_periods=1 keeps the edges defined by averaging over the partial window.
        return s.rolling(window=spec['window_size'], center=True, min_periods=1).mean().values

    if method == 'loess':
        # return_sorted=False yields fitted values aligned with the input order and
        # length. The default (True) sorts by year and drops NaN rows, which would
        # silently misalign the result with the caller's index.
        return lowess(values, years, frac=spec['loess_frac'], return_sorted=False)

    if method == 'gaussian':
        from scipy.ndimage import gaussian_filter1d
        sigma = spec['window_size'] / 2.0
        # np.asarray also accepts plain lists, which have no .astype method.
        return gaussian_filter1d(np.asarray(values, dtype=float), sigma=sigma)

    # Unknown method: pass the data through untouched.
    return values

# Smooth every topic column independently along the year axis
years_array = np.array(data_for_smoothing.index)
topic_trends = pd.DataFrame(index=data_for_smoothing.index)

for topic_id, yearly_values in data_for_smoothing.items():
    topic_trends[topic_id] = smooth_series(
        years_array,
        yearly_values.values,
        spec_smoothing,
    )

# Some smoothers can dip below zero; shares and counts cannot be negative.
topic_trends = topic_trends.clip(lower=0)

print(f"Smoothed trends computed using '{spec_smoothing['method']}' method.")
topic_trends.head()

Smoothed trends computed using 'loess' method.

# T10/T11: Stacked area chart + individual trend lines
# Both panels draw the smoothed topic_trends table; colours are taken from
# colors_cmap by column position so they match across the two panels.

fig, axes = plt.subplots(2, 1, figsize=(16, 12))

# Stacked area chart (streamgraph-like)
ax = axes[0]
ax.stackplot(topic_trends.index,
             [topic_trends[t].values for t in topic_trends.columns],
             labels=[topic_labels_map.get(t, f'T{t}')[:35] for t in topic_trends.columns],
             colors=[colors_cmap(i) for i in range(n_topics)],
             alpha=0.8)
ax.set_title('Topic Evolution — Stacked Area Chart (smoothed proportions)', fontsize=13)
ax.set_xlabel('Year')
# NOTE(review): axis text says "Proportion" even if use_proportions is set to False.
ax.set_ylabel('Proportion')
ax.legend(bbox_to_anchor=(1.02, 1), loc='upper left', fontsize=7)
ax.set_xlim(years_array.min(), years_array.max())

# Individual topic trend lines
ax2 = axes[1]
for i, t in enumerate(topic_trends.columns):
    ax2.plot(topic_trends.index, topic_trends[t].values,
             color=colors_cmap(i), linewidth=1.8, alpha=0.8,
             label=topic_labels_map.get(t, f'T{t}')[:35])
ax2.set_title('Individual Topic Trends (smoothed)', fontsize=13)
ax2.set_xlabel('Year')
ax2.set_ylabel('Proportion')
ax2.legend(bbox_to_anchor=(1.02, 1), loc='upper left', fontsize=7)
ax2.set_xlim(years_array.min(), years_array.max())

plt.tight_layout()
plt.show()

# Small multiples: one subplot per topic for detailed inspection
# Axes are shared (sharex/sharey) so magnitudes are directly comparable across topics.

n_cols = 4
n_rows = int(np.ceil(n_topics / n_cols))

fig, axes = plt.subplots(n_rows, n_cols, figsize=(16, 3 * n_rows), sharex=True, sharey=True)
axes_flat = axes.flatten()

for i, t in enumerate(topic_trends.columns):
    ax = axes_flat[i]
    # Light fill under the curve plus a solid outline in the topic's colour.
    ax.fill_between(topic_trends.index, topic_trends[t].values,
                    alpha=0.3, color=colors_cmap(i))
    ax.plot(topic_trends.index, topic_trends[t].values,
            color=colors_cmap(i), linewidth=1.5)
    ax.set_title(topic_labels_map.get(t, f'T{t}')[:40], fontsize=8)
    ax.set_xlim(years_array.min(), years_array.max())

# Hide empty subplots (grid cells beyond the last topic)
for j in range(len(topic_trends.columns), len(axes_flat)):
    axes_flat[j].set_visible(False)

fig.suptitle('Topic Trends — Small Multiples', fontsize=13, y=1.01)
plt.tight_layout()
plt.show()

# ═══════════════════════════════════════════════════════════
# T12/T13: ASSESSMENT & LOOP L2 (Smoothing Refinement)
# ═══════════════════════════════════════════════════════════
# Manual gate: the flag only selects which message is printed; nothing is re-run
# automatically.
smoothing_satisfactory = True  # Change to False to try different smoothing

if not smoothing_satisfactory:
    # Adjust spec_smoothing above and re-run from T9 cell
    # Suggestions:
    #   - Increase loess_frac (e.g., 0.2, 0.25) for smoother trends
    #   - Decrease loess_frac (e.g., 0.1) for more detail
    #   - Try method='moving_avg' with window_size=3 or 7
    #   - Try method='gaussian' with window_size=5
    print("Please adjust spec_smoothing and re-run from T9.")
else:
    print("✓ Smoothing accepted. Proceeding to Phase 3.")

✓ Smoothing accepted. Proceeding to Phase 3.

# T14: abstract — identify rising, declining, and stable topics

def classify_trend(series, early_years=5, late_years=5, *,
                   rising_threshold=0.5, declining_threshold=-0.4,
                   peak_window=(0.25, 0.75), peak_ratio=1.8,
                   min_baseline=0.001):
    """Classify a topic trend as 'rising', 'declining', 'peaked' or 'stable'.

    The mean of the first ``early_years`` points is compared with the mean of
    the last ``late_years`` points. Rules are checked in this order:

    1. relative change above ``rising_threshold``     -> 'rising'
    2. relative change below ``declining_threshold``  -> 'declining'
    3. maximum lies strictly inside ``peak_window`` (as a fraction of the
       series length) and exceeds ``peak_ratio`` times the larger of the two
       end means                                      -> 'peaked'
    4. anything else                                  -> 'stable'

    Parameters
    ----------
    series : pandas.Series or array-like
        Trend values in chronological order.
    early_years, late_years : int
        Number of leading / trailing points averaged for the comparison.
    rising_threshold, declining_threshold, peak_window, peak_ratio : optional
        Decision thresholds; the defaults are the values this function has
        always used.
    min_baseline : float
        Floor for the early mean in the denominator, so a topic that starts
        at (almost) zero does not cause a division by zero.

    Returns
    -------
    tuple
        ``(label, change_ratio)`` where ``change_ratio`` is
        ``(late_mean - early_mean) / max(early_mean, min_baseline)``.

    Raises
    ------
    ValueError
        If ``series`` is empty.
    """
    values = np.asarray(series, dtype=float)
    if values.size == 0:
        raise ValueError("cannot classify an empty trend series")

    early_mean = values[:early_years].mean()
    late_mean = values[-late_years:].mean()

    # Index of the maximum as a fraction of the series length: 0 for the first
    # point, (n - 1) / n for the last one.
    peak_position = values.argmax() / len(values)

    # Relative change between the two ends of the series.
    change_ratio = (late_mean - early_mean) / max(early_mean, min_baseline)

    if change_ratio > rising_threshold:
        return 'rising', change_ratio
    if change_ratio < declining_threshold:
        return 'declining', change_ratio

    peak_start, peak_end = peak_window
    has_interior_peak = peak_start < peak_position < peak_end
    if has_interior_peak and values.max() > peak_ratio * max(early_mean, late_mean):
        return 'peaked', change_ratio
    return 'stable', change_ratio

print("Topic Trend Classification:")
print("=" * 70)

# Classify every smoothed topic curve and remember the verdict per topic id.
trend_classifications = {}
for t in topic_trends.columns:
    trend_type, ratio = classify_trend(topic_trends[t])
    trend_classifications[t] = trend_type
    label = topic_labels_map.get(t, f'T{t}')
    print(f"  {label:<45} → {trend_type.upper():<10} (Δ={ratio:+.2f})")

# Per-category tally; categories without any topic are left out.
print("\n--- Summary ---")
for category in ('rising', 'declining', 'stable', 'peaked'):
    n_in_category = sum(
        1 for verdict in trend_classifications.values() if verdict == category
    )
    if n_in_category:
        print(f"  {category.upper()}: {n_in_category} topics")

Topic Trend Classification:
======================================================================
  T0: visual, analytics, visual analytics, analysis, users → RISING     (Δ=+6.15)
  T1: volume, rendering, volume rendering, transfer, ray → DECLINING  (Δ=-0.82)
  T2: flow, flow visualization, fluid, vortex, visualization → DECLINING  (Δ=-0.79)
  T3: visualization, design, visualizations, information, information visualization → STABLE     (Δ=+0.13)
  T4: data, sets, dimensional, analysis, data sets → STABLE     (Δ=+0.02)
  T5: graph, layout, graphs, node, tree         → RISING     (Δ=+3.15)
  T6: network, networks, social, traffic, challenge → RISING     (Δ=+0.63)
  T7: time, time series, series, time varying, temporal → RISING     (Δ=+0.64)
  T8: model, models, learning, machine, deep    → RISING     (Δ=+1.43)
  T9: uncertainty, uncertainty visualization, ensemble, uncertain, plots → RISING     (Δ=+7.92)
  T10: vector, fields, tensor, field, vector fields → DECLINING  (Δ=-0.63)
  T11: surface, surfaces, mesh, algorithm, meshes → DECLINING  (Δ=-0.73)

--- Summary ---
  RISING: 6 topics
  DECLINING: 4 topics
  STABLE: 2 topics

# T15: define-unit — segment timeline into research eras
# Using cosine dissimilarity on topic composition between consecutive years

def detect_era_boundaries(topic_props, min_era_length=5):
    """Split the timeline into eras where the topic mix shifts noticeably.

    For each pair of consecutive rows the cosine distance between their topic
    vectors is computed (0 when either row sums to zero). A year starts a new
    era when its distance to the previous year is above the 75th percentile of
    all distances and at least ``min_era_length`` years have passed since the
    previous boundary.

    Parameters
    ----------
    topic_props : pandas.DataFrame
        Rows indexed by year in chronological order, one column per topic.
    min_era_length : int
        Minimum number of years between two consecutive boundaries.

    Returns
    -------
    tuple
        ``(boundaries, dissimilarities)``. ``boundaries`` is a list of plain
        Python values starting with the first year and ending with
        ``last year + 1`` (an exclusive end). ``dissimilarities`` is a NumPy
        array with one entry per consecutive year pair. An empty frame yields
        ``([], empty array)``.
    """
    # tolist() converts NumPy scalars to native Python numbers, so the
    # boundaries print as 1990 rather than np.int64(1990).
    years = topic_props.index.tolist()
    if not years:
        return [], np.array([])

    rows = topic_props.to_numpy(dtype=float)
    dissimilarities = np.array([
        # Cosine distance is undefined for an all-zero vector; treat it as "no change".
        cosine(previous, current) if previous.sum() > 0 and current.sum() > 0 else 0.0
        for previous, current in zip(rows[:-1], rows[1:])
    ])

    boundaries = [years[0]]
    # With a single year there are no year pairs and therefore no percentile to take.
    if dissimilarities.size:
        # Only jumps in the top quarter of year-to-year changes qualify as boundaries.
        threshold = np.percentile(dissimilarities, 75)
        for year, distance in zip(years[1:], dissimilarities):
            if distance > threshold and (year - boundaries[-1]) >= min_era_length:
                boundaries.append(year)

    # Closing sentinel: one past the last year, so eras are [start, next_start - 1].
    boundaries.append(years[-1] + 1)
    return boundaries, dissimilarities

era_boundaries, yearly_dissimilarity = detect_era_boundaries(topic_trends, min_era_length=5)
print(f"Detected era boundaries: {era_boundaries}")

# Turn consecutive boundary pairs into labelled, inclusive [start, end] eras.
eras = []
for number, (start, next_start) in enumerate(zip(era_boundaries, era_boundaries[1:]), start=1):
    end = next_start - 1
    eras.append({'start': start, 'end': end, 'label': f'Era {number} ({start}–{end})'})

print("\nIdentified Research Eras:")
for era in eras:
    # The three topics with the highest mean smoothed value inside the era.
    in_era = (topic_trends.index >= era['start']) & (topic_trends.index <= era['end'])
    leading = topic_trends[in_era].mean().nlargest(3).index.tolist()
    print(f"  {era['label']}")
    for topic in leading:
        print(f"    • {topic_labels_map.get(topic, f'T{topic}')[:40]}")

Detected era boundaries: [np.int64(1990), np.int64(1995), np.int64(2001), np.int64(2006), np.int64(2017), np.int64(2025)]

Identified Research Eras:
  Era 1 (1990–1994)
    • T3: visualization, design, visualization
    • T4: data, sets, dimensional, analysis, d
    • T1: volume, rendering, volume rendering,
  Era 2 (1995–2000)
    • T11: surface, surfaces, mesh, algorithm,
    • T3: visualization, design, visualization
    • T4: data, sets, dimensional, analysis, d
  Era 3 (2001–2005)
    • T11: surface, surfaces, mesh, algorithm,
    • T1: volume, rendering, volume rendering,
    • T4: data, sets, dimensional, analysis, d
  Era 4 (2006–2016)
    • T4: data, sets, dimensional, analysis, d
    • T3: visualization, design, visualization
    • T0: visual, analytics, visual analytics,
  Era 5 (2017–2024)
    • T3: visualization, design, visualization
    • T4: data, sets, dimensional, analysis, d
    • T8: model, models, learning, machine, de

# T16: visualise — annotated streamgraph with era boundaries

fig, ax = plt.subplots(figsize=(18, 7))

# Stacked area of the smoothed topic trends
ax.stackplot(topic_trends.index,
             [topic_trends[t].values for t in topic_trends.columns],
             labels=[topic_labels_map.get(t, f'T{t}')[:35] for t in topic_trends.columns],
             colors=[colors_cmap(i) for i in range(n_topics)],
             alpha=0.75)

# Era boundaries: skip the first entry (start of data) and the last (end sentinel)
for boundary in era_boundaries[1:-1]:
    ax.axvline(x=boundary, color='black', linestyle='--', linewidth=1.5, alpha=0.7)

# Era labels, centred horizontally on each era and placed near the top of the axes
for era in eras:
    mid = (era['start'] + era['end']) / 2
    # y position: 95% of the current upper y-limit, or 0.95 if that limit is not positive
    ax.text(mid, ax.get_ylim()[1] * 0.95 if ax.get_ylim()[1] > 0 else 0.95,
            era['label'],
            ha='center', va='top', fontsize=9, fontweight='bold',
            bbox=dict(boxstyle='round,pad=0.3', facecolor='white', alpha=0.7))

ax.set_title('IEEE VIS Research Topic Evolution (1990–2024) — Annotated with Research Eras',
             fontsize=13, fontweight='bold')
ax.set_xlabel('Year', fontsize=11)
ax.set_ylabel('Topic Proportion (smoothed)', fontsize=11)
ax.set_xlim(years_array.min(), years_array.max())
ax.legend(bbox_to_anchor=(1.02, 1), loc='upper left', fontsize=7)

plt.tight_layout()
plt.show()

# Heatmap view: topic intensity over time

fig, ax = plt.subplots(figsize=(16, max(4, n_topics * 0.5)))

# Transpose so topics are rows and years are columns; rows get readable labels.
heatmap_data = topic_trends.T.copy()
heatmap_data.index = [topic_labels_map.get(t, f'T{t}')[:40] for t in heatmap_data.index]

# xticklabels=5 labels every fifth year column.
sns.heatmap(heatmap_data, cmap='YlOrRd', ax=ax, xticklabels=5,
            linewidths=0.1, cbar_kws={'label': 'Proportion'})
ax.set_title('Topic Intensity Heatmap (Year × Topic)', fontsize=13)
ax.set_xlabel('Year')
ax.set_ylabel('')

# Add era boundaries
# The heatmap's x axis is in column positions, not years, so each boundary year is
# translated to its position in the index before drawing the line.
for boundary in era_boundaries[1:-1]:
    if boundary in topic_trends.index:
        idx = list(topic_trends.index).index(boundary)
        ax.axvline(x=idx, color='blue', linestyle='--', linewidth=1.5, alpha=0.6)

plt.tight_layout()
plt.show()

# T17: generate-knowledge — structured summary of findings

print("=" * 80)
print("FINDINGS: Research Topic Evolution in IEEE VIS (1990–2024)")
print("=" * 80)

print(f"\n📊 Dataset: {len(df)} papers analysed")
print(f"📅 Time span: {df['Year'].min()}–{df['Year'].max()}")
print(f"🏷️  Topics discovered: {n_topics} (via NMF)")

# Report the parameter that the chosen smoother actually uses. Previously the line
# always printed "frac=<loess_frac>", even for 'moving_avg' or 'gaussian', because
# 'loess_frac' is always present in spec_smoothing.
if spec_smoothing['method'] == 'loess':
    smoothing_param = f"frac={spec_smoothing.get('loess_frac')}"
else:
    smoothing_param = f"window={spec_smoothing.get('window_size')}"
print(f"📐 Smoothing: {spec_smoothing['method']} ({smoothing_param})")

# Each section below lists the topics whose classification (from T14) matches the
# section's category, or "(none detected)" when the category is empty.
print("\n" + "-" * 80)
print("📈 RISING TOPICS (gaining prominence):")
print("-" * 80)
rising = [t for t, c in trend_classifications.items() if c == 'rising']
if rising:
    for t in rising:
        print(f"    • {topic_labels_map.get(t, f'T{t}')}")
else:
    print("    (none detected)")

print("\n" + "-" * 80)
print("📉 DECLINING TOPICS (reduced prominence):")
print("-" * 80)
declining = [t for t, c in trend_classifications.items() if c == 'declining']
if declining:
    for t in declining:
        print(f"    • {topic_labels_map.get(t, f'T{t}')}")
else:
    print("    (none detected)")

print("\n" + "-" * 80)
print("➡️  STABLE TOPICS (consistent presence):")
print("-" * 80)
stable = [t for t, c in trend_classifications.items() if c == 'stable']
if stable:
    for t in stable:
        print(f"    • {topic_labels_map.get(t, f'T{t}')}")
else:
    print("    (none detected)")

print("\n" + "-" * 80)
print("⛰️  PEAKED TOPICS (rose then declined):")
print("-" * 80)
peaked = [t for t, c in trend_classifications.items() if c == 'peaked']
if peaked:
    for t in peaked:
        # Year (index label) at which the smoothed trend reaches its maximum.
        peak_year = topic_trends[t].idxmax()
        print(f"    • {topic_labels_map.get(t, f'T{t}')} (peak ~{peak_year})")
else:
    print("    (none detected)")

print("\n" + "-" * 80)
print("🕐 RESEARCH ERAS:")
print("-" * 80)
for era in eras:
    # Top three topics by mean smoothed value within the era's inclusive year range.
    era_mask = (topic_trends.index >= era['start']) & (topic_trends.index <= era['end'])
    era_means = topic_trends[era_mask].mean()
    top3 = era_means.nlargest(3).index.tolist()
    print(f"\n  {era['label']}")
    print(f"  Dominant topics:")
    for t in top3:
        print(f"    • {topic_labels_map.get(t, f'T{t}')}")

# Closing caveat printed with the report.
print("\n" + "=" * 80)
print("Note: Interpret these findings considering venue composition changes")
print("(InfoVis/SciVis/VAST merger into VIS in 2021) and overall publication")
print("volume growth over the 35-year span.")
print("=" * 80)

================================================================================
FINDINGS: Research Topic Evolution in IEEE VIS (1990–2024)
================================================================================

📊 Dataset: 3877 papers analysed
📅 Time span: 1990–2024
🏷️  Topics discovered: 12 (via NMF)
📐 Smoothing: loess (frac=0.15)

--------------------------------------------------------------------------------
📈 RISING TOPICS (gaining prominence):
--------------------------------------------------------------------------------
    • T0: visual, analytics, visual analytics, analysis, users
    • T5: graph, layout, graphs, node, tree
    • T6: network, networks, social, traffic, challenge
    • T7: time, time series, series, time varying, temporal
    • T8: model, models, learning, machine, deep
    • T9: uncertainty, uncertainty visualization, ensemble, uncertain, plots

--------------------------------------------------------------------------------
📉 DECLINING TOPICS (reduced prominence):
--------------------------------------------------------------------------------
    • T1: volume, rendering, volume rendering, transfer, ray
    • T2: flow, flow visualization, fluid, vortex, visualization
    • T10: vector, fields, tensor, field, vector fields
    • T11: surface, surfaces, mesh, algorithm, meshes

--------------------------------------------------------------------------------
➡️  STABLE TOPICS (consistent presence):
--------------------------------------------------------------------------------
    • T3: visualization, design, visualizations, information, information visualization
    • T4: data, sets, dimensional, analysis, data sets

--------------------------------------------------------------------------------
⛰️  PEAKED TOPICS (rose then declined):
--------------------------------------------------------------------------------
    (none detected)

--------------------------------------------------------------------------------
🕐 RESEARCH ERAS:
--------------------------------------------------------------------------------

  Era 1 (1990–1994)
  Dominant topics:
    • T3: visualization, design, visualizations, information, information visualization
    • T4: data, sets, dimensional, analysis, data sets
    • T1: volume, rendering, volume rendering, transfer, ray

  Era 2 (1995–2000)
  Dominant topics:
    • T11: surface, surfaces, mesh, algorithm, meshes
    • T3: visualization, design, visualizations, information, information visualization
    • T4: data, sets, dimensional, analysis, data sets

  Era 3 (2001–2005)
  Dominant topics:
    • T11: surface, surfaces, mesh, algorithm, meshes
    • T1: volume, rendering, volume rendering, transfer, ray
    • T4: data, sets, dimensional, analysis, data sets

  Era 4 (2006–2016)
  Dominant topics:
    • T4: data, sets, dimensional, analysis, data sets
    • T3: visualization, design, visualizations, information, information visualization
    • T0: visual, analytics, visual analytics, analysis, users

  Era 5 (2017–2024)
  Dominant topics:
    • T3: visualization, design, visualizations, information, information visualization
    • T4: data, sets, dimensional, analysis, data sets
    • T8: model, models, learning, machine, deep

================================================================================
Note: Interpret these findings considering venue composition changes
(InfoVis/SciVis/VAST merger into VIS in 2021) and overall publication
volume growth over the 35-year span.
================================================================================

	Conference	Year	Title	DOI	Link	FirstPage	LastPage	PaperType	Abstract	AuthorNames-Deduped	AuthorNames	AuthorAffiliation	InternalReferences	AuthorKeywords	AminerCitationCount	CitationCount_CrossRef	PubsCited_CrossRef	Downloads_Xplore	Award	GraphicsReplicabilityStamp
0	Vis	2024	Interactive Design-of-Experiments: Optimizing ...	10.1109/tvcg.2024.3456356	http://dx.doi.org/10.1109/TVCG.2024.3456356	44.0	53.0	J	The optimization of cooling systems is importa...	Rainer Splechtna;Majid Behravan;Mario Jelovic;...	Rainer Splechtna;Majid Behravan;Mario Jelović;...	VRVis Research Center in Vienna, Austria;Virgi...	10.1109/tvcg.2013.124;10.1109/tvcg.2008.145;10...	Parameter space exploration	NaN	2.0	29.0	234.0	NaN	NaN
1	Vis	2024	Towards Dataset-Scale and Feature-Oriented Eva...	10.1109/tvcg.2024.3456398	http://dx.doi.org/10.1109/TVCG.2024.3456398	481.0	491.0	J	Recent advancements in Large Language Models (...	Sam Yu-Te Lee;Aryaman Bahukhandi;Dongyu Liu;Kw...	Sam Yu-Te Lee;Aryaman Bahukhandi;Dongyu Liu;Kw...	University of California, USA;University of Ca...	10.1109/tvcg.2017.2743858;10.1109/tvcg.2017.27...	Visual analytics,prompt engineering,,,text sum...	NaN	1.0	65.0	386.0	NaN	NaN
2	Vis	2024	KNowNEt:Guided Health Information Seeking from...	10.1109/tvcg.2024.3456364	http://dx.doi.org/10.1109/TVCG.2024.3456364	547.0	557.0	J	The increasing reliance on Large Language Mode...	Youfu Yan;Yu Hou;Yongkang Xiao;Rui Zhang;Qianw...	Youfu Yan;Yu Hou;Yongkang Xiao;Rui Zhang;Qianw...	Department of Computer Science and Engineering...	10.1109/tvcg.2022.3209408;10.1109/tvcg.2023.33...	Human-AI interactions,knowledge graph,,,conver...	NaN	1.0	60.0	632.0	HM	NaN
3	Vis	2024	VisEval: A Benchmark for Data Visualization in...	10.1109/tvcg.2024.3456320	http://dx.doi.org/10.1109/TVCG.2024.3456320	1301.0	1311.0	J	Translating natural language to visualization ...	Nan Chen;Yuge Zhang;Jiahang Xu;Kan Ren;Yuqing ...	Nan Chen;Yuge Zhang;Jiahang Xu;Kan Ren;Yuqing ...	Microsoft Research, USA;Microsoft Research, US...	10.1109/infvis.2005.1532136;10.1109/tvcg.2015....	Visualization evaluation,automatic visualizati...	NaN	1.0	75.0	625.0	BP	NaN
4	Vis	2024	PUREsuggest: Citation-Based Literature Search ...	10.1109/tvcg.2024.3456199	http://dx.doi.org/10.1109/TVCG.2024.3456199	316.0	326.0	J	Citations allow quickly identifying related re...	Fabian Beck 0001	Fabian Beck	University of Bamberg, Germany	10.1109/tvcg.2015.2467757;10.1109/tvcg.2016.25...	Scientific literature search,citation network ...	NaN	1.0	62.0	165.0	NaN	NaN

	0	1	2	3	4	5	6	7	8	9	10	11
Year
1990	0.017499	0.150885	0.070490	0.142270	0.206133	0.019892	0.031317	0.026676	0.066336	0.007726	0.106225	0.154550
1991	0.013029	0.194580	0.081116	0.202376	0.134858	0.021010	0.034037	0.050158	0.028064	0.011915	0.128562	0.100296
1992	0.014379	0.157816	0.097976	0.197617	0.138313	0.015212	0.022475	0.055724	0.061626	0.005949	0.088905	0.144008
1993	0.019684	0.149600	0.162941	0.133211	0.179983	0.014388	0.025197	0.040833	0.074366	0.004040	0.074227	0.121529
1994	0.010579	0.161102	0.157580	0.157355	0.135910	0.019649	0.009351	0.045824	0.049829	0.008651	0.105807	0.138364

	0	1	2	3	4	5	6	7	8	9	10	11
Year
1990	0.015946	0.151513	0.069619	0.142857	0.206133	0.020950	0.033077	0.032384	0.064389	0.009559	0.110394	0.155209
1991	0.015403	0.153255	0.083054	0.170645	0.172223	0.018809	0.029715	0.042194	0.064210	0.008453	0.100744	0.148085
1992	0.015468	0.157816	0.097976	0.197617	0.138313	0.016635	0.026530	0.049901	0.060260	0.007104	0.093278	0.144008
1993	0.016157	0.154914	0.127778	0.177486	0.137111	0.016117	0.020127	0.046557	0.063083	0.005911	0.078240	0.135413
1994	0.013336	0.150125	0.157580	0.157355	0.135910	0.026411	0.019216	0.047782	0.056907	0.009264	0.068520	0.135491

Research Topic Evolution in IEEE VIS (1990–2024)¶

Setup & Dependencies¶

Phase 1: Data Preparation & Topic Discovery¶

Load Data¶

T1: Characterise (Vectorise Text using TF-IDF)¶

T2: Build Model (NMF Topic Model)¶

T3: Characterise (Topic Assignments & Top Terms)¶

T4: Contextualise (Dimensionality Reduction for Visualisation)¶

T5: Visualise (Topic Overview)¶

T6: Assess (Human Evaluation)¶

Phase 2: Temporal Topic Profiling¶

T8: Characterise (Temporal Aggregation)¶

T9: Characterise (Temporal Smoothing)¶

T10 & T11: Contextualise & Visualise (Temporal Evolution)¶

T12: Assess (Smoothing Adequacy)¶

Phase 3: Trend Interpretation & Knowledge Generation¶

T14: Abstract (Identify Patterns)¶

T15: Define-Unit (Era Segmentation)¶

T16: Visualise (Annotated Timeline)¶

T17: Generate Knowledge (Summary Report)¶

Workflow Complete¶

Workflow Summary (ATWL Template):¶

Dependencies:¶

Next Steps:¶