User Study
import random
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import harmonicsonification as hs
# Fix the seed up front; later cells draw from np.random and call random.shuffle.
# NOTE(review): presumably this seeds both Python's `random` and NumPy's global RNG —
# confirm in harmonicsonification before relying on reproducibility.
hs.seed_everything(42)
Data Set Creation
We want to test whether our harmonic sonification approach allows users to distinguish typical data points from outliers just based on the sound. For this, we first create an artificial data set with the characteristics typically obtained from applying PCA (i.e. dimensions with decreasing variance). We then create an outlier data set with uniformly distributed points (some of these points will be like typical data points, but most will fall outside the distribution).
# Parameters of the synthetic data set.
dim = 16 # number of dimensions
n_data = 100 # number of points drawn for each of the two sets
std = np.exp(-np.linspace(0, 4, dim)) # per-dimension standard deviations, decaying exponentially from 1 to exp(-4)
uniform = 3 # half-width of the uniform outlier distribution (samples lie between -uniform and uniform)
columns = [f"dim {i + 1}" for i in range(dim)]
# Typical points: standard-normal samples scaled per dimension by `std`.
# The normal draw happens before the uniform draw; with a fixed seed this order determines the data.
data = pd.DataFrame(np.random.normal(size=(n_data, dim)) * std, columns=columns)
# Outliers: uniform samples over the same range in every dimension.
outlier = pd.DataFrame(np.random.uniform(-uniform, uniform, size=(n_data, dim)), columns=columns)
# Tag each set; 'type' is appended as the last column of both frames.
data['type'] = 'data'
outlier['type'] = 'outlier'
# Summary of the generated set-up.
print(f"{dim} dimensions")
print(f"{n_data} data points")
print(f"std: {std.round(2)}")
print(f"outliers in [-{uniform}, {uniform}]")
# Plot the standard deviation of each dimension (the trailing ';' suppresses the notebook's text output).
plt.plot(std, '-o');
16 dimensions 100 data points std: [1. 0.77 0.59 0.45 0.34 0.26 0.2 0.15 0.12 0.09 0.07 0.05 0.04 0.03 0.02 0.02] outliers in [-3, 3]
We can create scatter plots for all pairs of dimensions, which gives a rough idea of the two distributions.
# Stack both sets (outliers first) and scatter every pair of dimensions,
# coloured by the 'type' column.
combined = pd.concat([outlier, data], ignore_index=True)
p = sns.pairplot(combined, hue='type', palette=['blue', 'red'], diag_kind=None)
# Give every panel the same limits, matching the range of the outlier distribution.
for ax in p.axes.flat:
    ax.set(xlim=(-uniform, uniform), ylim=(-uniform, uniform))
Sonification
Lowest and highest frequency that may appear in the sonification, just for reference.
base_freq = 110
# Amplitude vector that is zero everywhere except for a 1 in the last of the `dim` slots.
amps = np.zeros(dim)
amps[dim - 1] = 1
hs.sonify_am(f0=base_freq, x=amps).display()
Helper functions to extract data points, randomise their order (for the experiment), and sonify them.
def get_points(data, outlier, shuffle):
    """Collect typical points and outliers into one array with matching labels.

    The last column of each input array is dropped before its rows are
    collected (the callers in this file pass ``DataFrame.values`` whose last
    column is the ``'type'`` tag). Every row is labelled ``'data <k>'`` or
    ``'outlier <k>'``, where ``k`` is the 1-based row position within the
    array that was passed in.

    Args:
        data: 2-D array of typical points, or ``None`` to skip them.
        outlier: 2-D array of outliers, or ``None`` to skip them.
        shuffle: If true, shuffle the points (together with their labels)
            using the global ``random`` module state.

    Returns:
        A tuple ``(points, labels)``. ``points`` is an array with one row per
        point; ``labels`` is a tuple of strings in the same order. When there
        are no rows at all, an empty array and an empty tuple are returned.
    """
    points = []
    for source, kind in ((data, 'data'), (outlier, 'outlier')):
        if source is None:
            continue
        features = source[:, :-1]  # drop the trailing label column
        points.extend(
            (row, f'{kind} {idx}') for idx, row in enumerate(features, start=1)
        )
    if not points:
        # zip(*[]) yields nothing to unpack, so "no points" needs its own exit.
        return np.array([]), ()
    if shuffle:
        # Shuffle (row, label) pairs so labels stay attached to their rows.
        random.shuffle(points)
    rows, labels = zip(*points)
    return np.array(rows), labels
def sonify(points, labels, std, base_freq, add_fundamental=True, label=False, print_amps=False):
    """Play one sound per point, using its scaled coordinates as amplitudes.

    The absolute coordinate values are divided by the per-dimension ``std``
    and then by the largest resulting value in the whole batch, so the
    amplitudes of a batch lie in [0, 1]. Each row is then passed to
    ``hs.sonify_am`` and displayed.

    Args:
        points: 2-D array-like with one row per point and one column per
            entry of ``std``. It is not modified.
        labels: Sequence of label strings, one per row of ``points``.
        std: Per-dimension standard deviations used for scaling.
        base_freq: Passed to ``hs.sonify_am`` as ``f0``.
        add_fundamental: If true, prepend an amplitude of 1 to every row.
        label: If true, print the point's label; otherwise print the
            anonymous ``point <i>`` index (1-based).
        print_amps: If true, also print the amplitudes rounded to 2 decimals.

    Returns:
        None. An empty batch produces no output.
    """
    # Float working copy: leaves the caller's array untouched and keeps the
    # divisions valid for integer or object-dtype input.
    amplitudes = np.abs(np.asarray(points, dtype=float))
    if amplitudes.size == 0:
        # Nothing to play; max() of an empty array would raise.
        return
    amplitudes = amplitudes / np.asarray(std, dtype=float)[None, :]
    peak = amplitudes.max()
    if peak > 0:
        # Normalise by the batch-wide maximum; skipped for an all-zero batch
        # to avoid dividing by zero and producing NaN amplitudes.
        amplitudes = amplitudes / peak
    for i, (row, name) in enumerate(zip(amplitudes, labels)):
        if add_fundamental:
            amps = np.concatenate(([1.0], row))
        else:
            amps = np.array(row, dtype=float)
        if label:
            print(name)
        else:
            print(f"point {i + 1}")
        if print_amps:
            print(amps.round(2))
        hs.sonify_am(x=amps, f0=base_freq).display()
Example Data
Here are some typical data points as well as some outliers.
n_examples = 5
# First n_examples rows of each set, kept in their original order so the
# listener hears the typical points first and the outliers afterwards.
points, labels = get_points(
    data=data.values[:n_examples],
    outlier=outlier.values[:n_examples],
    shuffle=False,
)
sonify(points, labels, std, base_freq, label=True)
data 1
data 2
data 3
data 4
data 5
outlier 1
outlier 2
outlier 3
outlier 4
outlier 5
Trials
Now we take some data points and outliers, shuffle them randomly, and let participants guess which is which.
n_test = 10
# Take the n_test rows that follow the example rows from each set and mix
# them into a random order.
points, labels = get_points(
    data.values[n_examples:n_examples + n_test],
    outlier.values[n_examples:n_examples + n_test],
    True,
)
For the evaluation, we print the correct labels once.
# Answer key: list the true labels in trial order, then replay the trials
# with each label printed next to its sound.
for name in labels:
    print(name)
print("--------------------")
sonify(points, labels, std, base_freq, label=True)
outlier 10 data 6 outlier 5 data 5 data 10 outlier 4 outlier 6 outlier 9 data 7 outlier 3 outlier 8 outlier 1 data 2 outlier 2 data 3 outlier 7 data 8 data 9 data 1 data 4 -------------------- outlier 10
data 6
outlier 5
data 5
data 10
outlier 4
outlier 6
outlier 9
data 7
outlier 3
outlier 8
outlier 1
data 2
outlier 2
data 3
outlier 7
data 8
data 9
data 1
data 4
Now we print only a point index (this is what is shown to the participants).
# Participant view: with label left at its default (False), sonify prints
# only "point <i>" for each sound, so the true type is not revealed.
sonify(points, labels, std, base_freq)
point 1
point 2
point 3
point 4
point 5
point 6
point 7
point 8
point 9
point 10
point 11
point 12
point 13
point 14
point 15
point 16
point 17
point 18
point 19
point 20