Tutorial 5: Group Analysis¶
Overview¶
Learn to aggregate biomechanical data across multiple subjects, compute group statistics, and perform population-level analyses.
Learning Objectives¶
- Aggregate data across subjects
- Compute ensemble averages with confidence intervals
- Handle missing data appropriately
- Perform statistical comparisons between groups
- Create normative reference data
Setup¶
% Add library to path
addpath('user_libs/matlab');
% Load data
loco = LocomotionData('converted_datasets/umich_2021_phase.parquet');
% Get all subjects
allSubjects = loco.getSubjects();
fprintf('Total subjects: %d\n', length(allSubjects));
% Add helper functions to path
addpath('user_libs/matlab');
% Load data
data = parquetread('converted_datasets/umich_2021_phase.parquet');
% Get all subjects
allSubjects = unique(data.subject);
fprintf('Total subjects: %d\n', length(allSubjects));
Multi-Subject Aggregation¶
Computing Group Means¶
% Aggregate data across all subjects for level walking
task = 'level_walking';
features = {'knee_flexion_angle_ipsi_rad', 'hip_flexion_angle_ipsi_rad'};
% Initialize storage for all subjects
allSubjectMeans = [];
allSubjectStds = [];
subjectCount = 0;
for i = 1:length(allSubjects)
subject = allSubjects{i};
% Filter data for this subject and task
subjectData = loco.filterSubject(subject).filterTask(task);
if subjectData.length() > 0
% Get mean patterns for this subject
meanPatterns = subjectData.getMeanPatterns(subject, task);
stdPatterns = subjectData.getStdPatterns(subject, task);
% Store subject data
if subjectCount == 0
% Initialize arrays
nFeatures = length(features);
nPoints = 150;
allSubjectMeans = NaN(length(allSubjects), nPoints, nFeatures);
allSubjectStds = NaN(length(allSubjects), nPoints, nFeatures);
end
subjectCount = subjectCount + 1;
for f = 1:length(features)
feature = features{f};
if isfield(meanPatterns, feature)
allSubjectMeans(subjectCount, :, f) = meanPatterns.(feature);
allSubjectStds(subjectCount, :, f) = stdPatterns.(feature);
end
end
end
end
fprintf('Successfully processed %d subjects\n', subjectCount);
% Using aggregateSubjects helper function
task = 'level_walking';
features = {'knee_flexion_angle_ipsi_rad', 'hip_flexion_angle_ipsi_rad'};
% Aggregate knee flexion data
kneeAgg = aggregateSubjects(data, allSubjects, task, features{1});
% Or manually aggregate
allSubjectMeans = [];
allSubjectStds = [];
subjectCount = 0;
for i = 1:length(allSubjects)
subject = allSubjects{i};
% Filter data for this subject and task
subjectData = data(strcmp(data.subject, subject) & ...
strcmp(data.task, task), :);
if height(subjectData) > 0
% Store subject data
if subjectCount == 0
% Initialize arrays
nFeatures = length(features);
nPoints = 150;
allSubjectMeans = NaN(length(allSubjects), nPoints, nFeatures);
allSubjectStds = NaN(length(allSubjects), nPoints, nFeatures);
end
subjectCount = subjectCount + 1;
for f = 1:length(features)
feature = features{f};
if ismember(feature, subjectData.Properties.VariableNames)
[meanCurve, stdCurve] = computePhaseAverage(subjectData, feature);
allSubjectMeans(subjectCount, :, f) = meanCurve;
allSubjectStds(subjectCount, :, f) = stdCurve;
end
end
end
end
fprintf('Successfully processed %d subjects\n', subjectCount);
Ensemble Averages¶
% Compute ensemble averages across subjects
phase = 0:100/149:100;
figure('Position', [100 100 1200 600]);
for f = 1:length(features)
subplot(1, length(features), f);
% Get data for this feature
featureData = squeeze(allSubjectMeans(1:subjectCount, :, f));
% Compute ensemble statistics
ensembleMean = mean(featureData, 1, 'omitnan');
ensembleStd = std(featureData, 0, 1, 'omitnan');
ensembleSEM = ensembleStd / sqrt(subjectCount);
% Convert to degrees if angle
if contains(features{f}, 'angle')
ensembleMean = rad2deg(ensembleMean);
ensembleStd = rad2deg(ensembleStd);
ensembleSEM = rad2deg(ensembleSEM);
unitLabel = '(deg)';
else
unitLabel = '';
end
% Plot ensemble mean ± SEM
hold on;
fill([phase, fliplr(phase)], ...
[ensembleMean + ensembleSEM, fliplr(ensembleMean - ensembleSEM)], ...
[0.7 0.7 1], 'FaceAlpha', 0.3, 'EdgeColor', 'none');
plot(phase, ensembleMean, 'b-', 'LineWidth', 2);
xlabel('Gait Cycle (%)');
ylabel(sprintf('%s %s', strrep(features{f}, '_', ' '), unitLabel));
title(sprintf('Group Mean (n=%d)', subjectCount));
grid on;
xlim([0 100]);
end
sgtitle(sprintf('Ensemble Averages - %s', strrep(task, '_', ' ')));
Statistical Analysis¶
Group Comparisons¶
% Compare two groups (example: young vs old, or healthy vs pathological)
% For demonstration, split subjects into two groups randomly
group1Subjects = allSubjects(1:floor(length(allSubjects)/2));
group2Subjects = allSubjects(floor(length(allSubjects)/2)+1:end);
fprintf('Group 1: %d subjects\n', length(group1Subjects));
fprintf('Group 2: %d subjects\n', length(group2Subjects));
% Function to get group data
function groupData = getGroupData(loco, subjects, task, features)
groupData = [];
validCount = 0;
for i = 1:length(subjects)
subject = subjects{i};
subjectData = loco.filterSubject(subject).filterTask(task);
if subjectData.length() > 0
meanPatterns = subjectData.getMeanPatterns(subject, task);
validCount = validCount + 1;
if validCount == 1
nFeatures = length(features);
nPoints = 150;
groupData = NaN(length(subjects), nPoints, nFeatures);
end
for f = 1:length(features)
feature = features{f};
if isfield(meanPatterns, feature)
groupData(validCount, :, f) = meanPatterns.(feature);
end
end
end
end
% Trim to valid data
groupData = groupData(1:validCount, :, :);
end
% Get data for both groups
group1Data = getGroupData(loco, group1Subjects, task, features);
group2Data = getGroupData(loco, group2Subjects, task, features);
% Statistical comparison at each phase point
feature = features{1}; % Knee flexion
f = 1;
group1Feature = squeeze(group1Data(:, :, f));
group2Feature = squeeze(group2Data(:, :, f));
% Perform t-tests at each phase point
pValues = zeros(1, 150);
for p = 1:150
g1_vals = group1Feature(:, p);
g2_vals = group2Feature(:, p);
% Remove NaN values
g1_clean = g1_vals(~isnan(g1_vals));
g2_clean = g2_vals(~isnan(g2_vals));
if length(g1_clean) > 1 && length(g2_clean) > 1
[~, pValues(p)] = ttest2(g1_clean, g2_clean);
else
pValues(p) = NaN;
end
end
% Plot comparison
figure();
subplot(2, 1, 1);
hold on;
% Group 1
g1Mean = mean(group1Feature, 1, 'omitnan');
g1SEM = std(group1Feature, 0, 1, 'omitnan') / sqrt(size(group1Feature, 1));
fill([phase, fliplr(phase)], ...
rad2deg([g1Mean + g1SEM, fliplr(g1Mean - g1SEM)]), ...
[1 0.7 0.7], 'FaceAlpha', 0.3, 'EdgeColor', 'none');
plot(phase, rad2deg(g1Mean), 'r-', 'LineWidth', 2, 'DisplayName', 'Group 1');
% Group 2
g2Mean = mean(group2Feature, 1, 'omitnan');
g2SEM = std(group2Feature, 0, 1, 'omitnan') / sqrt(size(group2Feature, 1));
fill([phase, fliplr(phase)], ...
rad2deg([g2Mean + g2SEM, fliplr(g2Mean - g2SEM)]), ...
[0.7 0.7 1], 'FaceAlpha', 0.3, 'EdgeColor', 'none');
plot(phase, rad2deg(g2Mean), 'b-', 'LineWidth', 2, 'DisplayName', 'Group 2');
xlabel('Gait Cycle (%)');
ylabel('Knee Flexion (deg)');
title('Group Comparison');
legend('show');
grid on;
% P-values plot
subplot(2, 1, 2);
plot(phase, pValues, 'k-', 'LineWidth', 1.5);
yline(0.05, 'r--', 'α = 0.05');
xlabel('Gait Cycle (%)');
ylabel('p-value');
title('Statistical Significance');
ylim([0 1]);
grid on;
Normative Data Creation¶
Reference Ranges¶
% Create normative reference ranges (mean ± 2SD)
feature = features{1};
f = 1;
% Use all subjects as normative population
normativeData = squeeze(allSubjectMeans(1:subjectCount, :, f));
% Compute reference ranges
normMean = mean(normativeData, 1, 'omitnan');
normStd = std(normativeData, 0, 1, 'omitnan');
% Reference ranges (mean ± 2SD)
upperBound = normMean + 2 * normStd;
lowerBound = normMean - 2 * normStd;
% Convert to degrees
normMeanDeg = rad2deg(normMean);
upperBoundDeg = rad2deg(upperBound);
lowerBoundDeg = rad2deg(lowerBound);
% Plot normative ranges
figure();
hold on;
% Reference band
fill([phase, fliplr(phase)], [upperBoundDeg, fliplr(lowerBoundDeg)], ...
[0.9 0.9 0.9], 'EdgeColor', 'none', 'DisplayName', 'Normal Range (±2SD)');
% Mean line
plot(phase, normMeanDeg, 'k-', 'LineWidth', 2, 'DisplayName', 'Normal Mean');
% Boundary lines
plot(phase, upperBoundDeg, 'k--', 'LineWidth', 1, 'DisplayName', 'Upper/Lower Bounds');
plot(phase, lowerBoundDeg, 'k--', 'LineWidth', 1);
xlabel('Gait Cycle (%)');
ylabel('Knee Flexion (deg)');
title(sprintf('Normative Reference Data (n=%d)', subjectCount));
legend('show');
grid on;
xlim([0 100]);
% Save normative data
normativeTable = table(phase', normMeanDeg', upperBoundDeg', lowerBoundDeg', ...
'VariableNames', {'phase_percent', 'mean', 'upper_bound', 'lower_bound'});
% Display summary
fprintf('\nNormative Data Summary:\n');
fprintf('======================\n');
fprintf('Feature: %s\n', feature);
fprintf('Population size: %d subjects\n', subjectCount);
fprintf('Peak flexion: %.1f ± %.1f deg\n', max(normMeanDeg), std(max(normativeData, [], 2), 'omitnan') * 180/pi);
fprintf('Range of motion: %.1f ± %.1f deg\n', ...
mean(max(normativeData, [], 2) - min(normativeData, [], 2)) * 180/pi, ...
std(max(normativeData, [], 2) - min(normativeData, [], 2)) * 180/pi);
Best Practices¶
Handling Missing Data¶
% Check data completeness across subjects
fprintf('Data Completeness Report:\n');
fprintf('========================\n');
for i = 1:length(allSubjects)
subject = allSubjects{i};
subjectData = loco.filterSubject(subject).filterTask(task);
if subjectData.length() > 0
% Check for missing features
meanPatterns = subjectData.getMeanPatterns(subject, task);
availableFeatures = fieldnames(meanPatterns);
missingFeatures = setdiff(features, availableFeatures);
if ~isempty(missingFeatures)
fprintf('%s: Missing %s\n', subject, strjoin(missingFeatures, ', '));
end
else
fprintf('%s: No %s data\n', subject, task);
end
end
Statistical Power Analysis¶
% Estimate required sample size for group comparisons
effect_size = 0.5; % Cohen's d
alpha = 0.05;
power = 0.8;
% Approximate sample size calculation (per group)
% This is a simplified calculation - use proper power analysis tools for research
z_alpha = norminv(1 - alpha/2);
z_beta = norminv(power);
n_approx = 2 * ((z_alpha + z_beta) / effect_size)^2;
fprintf('\nPower Analysis (Approximate):\n');
fprintf('============================\n');
fprintf('Effect size (Cohen''s d): %.1f\n', effect_size);
fprintf('Alpha level: %.2f\n', alpha);
fprintf('Power: %.1f%%\n', power * 100);
fprintf('Estimated sample size per group: %.0f\n', ceil(n_approx));
fprintf('Current group sizes: %d, %d\n', size(group1Data, 1), size(group2Data, 1));
Summary¶
You've learned to perform group-level biomechanical analysis in MATLAB:
- Multi-subject aggregation with proper data handling
- Ensemble averaging with confidence intervals
- Statistical comparisons between groups
- Normative reference data creation
- Quality assessment and missing data handling
Next Steps¶
Continue to Tutorial 6: Publication Outputs →
Learn to create publication-ready figures, tables, and ensure reproducibility of your analyses.