📌 Research Themes: Does deep learning have speech processing mechanisms similar to those of the human brain?
Neural responses to natural speech along the human auditory pathway are hierarchically organized: the processing and representation of the speech signal become more complex and abstract as one moves up the hierarchy. Guided by hypotheses and theories from linguistics and phonology, neuroscientists extract features at different levels and analyze the information encoded at each stage of the pathway. Figure 1 shows several key processing regions of the human auditory pathway and their functions.
In artificial intelligence, speech recognition models trained on large amounts of natural speech have reached near-human performance on many automatic speech recognition (ASR) tasks. The complex internal dynamics of these data-driven models have caught the attention of neuroscientists. These AI models and the brain's auditory circuits receive the same speech input and perform similar cognitive functions, so are there computational and representational similarities between the two?
📌 A novel research methodology: Align deep learning representations with auditory neural activity.
Extract feature representations from deep neural networks pretrained on speech, use these data-driven features to build linear encoding models, and correlate their predictions with recorded auditory responses in the brain. This lets us study the similarity between the internal representations of deep neural networks and the activity of different neural populations along the auditory pathway.
To get a deeper feel for this methodology, this page implements two research projects based on it. The first uses behavioral data to quantitatively compare automatic speech models; the second uses neural recordings to align the hierarchical representations of deep neural networks with the activity of the auditory system. To reproduce this locally you will need to download some datasets and code repositories, and a GPU may also be needed. On top of the original papers, I have added some more recent work, such as the Whisper model.
The page is mainly based on the following two publications, both of which provide friendly access to data and code. Note that, on top of the publications and the available data, I modify the code and add some explanations (for example, why these linear models are used, along with the mathematical details). Because of data-access restrictions, the original studies cannot be reproduced in full (in other words, if you find that something you consider important from the papers is missing here, please do not be too hard on me; to some extent I simply enjoyed interpreting this material). Overall, this page focuses on implementing the concrete studies, to better illustrate the research method. Within this framework there will be many interesting research topics in the future.
Juliette Millet and Ewan Dunbar. Do self-supervised speech models develop human-like perception biases? AAAI 2022 Workshop, Self-Supervised Learning for Audio and Speech Processing, 2022.
Li, Y., Anumanchipalli, G., Mohamed, A., Chen, P., Carney, L. H., Lu, J., Wu, J., Chang, E.F. (2023) Dissecting neural computations of the human auditory pathway using deep neural networks for speech. Nature Neuroscience, 26, 1-30.
Part One: Do self-supervised speech models develop human-like perception biases?¶
Background¶
Under the influence of their native language, human listeners develop perceptual biases toward speech. For example,
- Native Japanese speakers tend to confuse the English /r/ and /l/ sounds (Yamada and Tohkura, 1990); English right and light are perceived as identical or very similar.
- Native English speakers struggle with the French /y/-/u/ contrast (Levy, 2009), finding it hard to hear the difference between rue (/y/: street) and roue (/u/: wheel).
These misperceptions emerge very early during native language acquisition: infants older than 6 months show a facilitation effect when discriminating native sounds, but a decline for some non-native sounds (Kuhl et al., 2006). Since this improvement for native sounds and decline for non-native sounds seems to have a positive effect on infants' later language abilities (Tsao et al., 2004; Kuhl et al., 2005), a perceptual space biased toward the native language may be essential for correctly perceiving and understanding native speech in all kinds of conditions (environmental noise, speaker variation, and so on).
Self-supervised models are currently the state of the art in speech processing: they build representation spaces from large amounts of unlabeled data and reach near-human performance on tasks such as speech recognition. The representation spaces obtained by self-supervised pretraining show a human-like plasticity and adaptability: fine-tuning on English data further improves performance on English, but degrades the ability to discriminate sounds of other languages. Although this observation is easy to explain in terms of data distribution, what interests us here is how it relates to the perceptual biases that humans develop.
It is therefore interesting to explore whether self-supervised models develop native language biases similar to those of humans.
The question to answer is: does the language of the training data affect a self-supervised speech model in the same way a native language affects human listeners?
Method¶
I abstract the experimental approach into the following steps; the details will become clear as we go.
- Run a speech perception test on humans to obtain participants' behavioral data.
- Run the same speech perception test on the self-supervised models, simulating human phone discrimination behavior.
- Compare the two.
Preparation¶
Human speech perception¶
Human speech perception is probed with an ABX phone discrimination test. Participants hear three speech segments: A, B and X (an A/B/X triplet). A and B always differ by exactly one phone, and X always has the same phone sequence as either A or B (for example, A: /pap/, B: /pip/, X: /pap/). Participants are asked to indicate which of the first two sounds (A or B) is most similar to the last one (X). A pair p1:p2 is called a contrast.
The stimuli come from the Perceptimatic Dataset, a speech perception collection made up of five sub-datasets. Perceptimatic contains behavioral data from phone discrimination tasks in six different languages, covering 662 phone contrasts, together with the sound stimuli used in the experiments. The code below gives a first look at the data. Here we consider the results of monolingual participants whose native language is French or English.
import tqdm
import os
import torch
import numpy as np
#
filename_triplet_list = "triplet_data.csv" # opensource
dataset_path = "dataset"
get_func = None
distance = None
def read_csv(filename):
lines = open(filename, 'r').readlines()
ind = lines[0].strip().split(',')
data = []
for line in lines[1:]:
data.append({name:value for name, value in zip(ind, line.strip().split(',')) })
return data
lines = read_csv(filename_triplet_list)
print(f"Number of Triplet Data:{len(lines)}")
print("Triplet Data Sample:")
for name, value in lines[0].items():
print(f"\t{name}:\t\t{value}")
Number of Triplet Data:8461 Triplet Data Sample: : 0 triplet_id: triplet_83_B TGT_item: amelia_consonants_10.wav OTH_item: amelia_consonants_28.wav X_item: ewan_58.wav speaker_TGT: amelia speaker_OTH: amelia speaker_X: ewan language_TGT: EN language_OTH: EN language_X: EN TGT_first: False dataset: pilot-aug-2018
We can see that the data contains 8461 triplet rows.
According to the paper, the dataset contains 4231 distinct triplets. This is half of 8461, because each triplet is presented to participants sometimes in target/other/X order and sometimes in other/target/X order.
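As a quick check (my own addition, not part of the original code), we can count the triplets once the target/other presentation order is ignored; if every triplet indeed appears in both orders, this should give 4231:
unordered = {(tuple(sorted((l["TGT_item"], l["OTH_item"]))), l["X_item"]) for l in lines}
print(f"Number of unique unordered triplets: {len(unordered)}")  # expected: 4231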
TGT_item is the audio file A with the target phone sequence, OTH_item is the distractor audio file B, and X_item has the same phone sequence as file A. The data also includes speaker and language information. Let's listen to them to get a feel for the task.
from IPython.display import Audio, display
sample_data = lines[0]
TGT_func = lambda x : os.path.join(dataset_path, x["dataset"], "wavs_extracted", x["TGT_item"])
OTH_func = lambda x : os.path.join(dataset_path, x["dataset"], "wavs_extracted", x["OTH_item"])
X_func = lambda x : os.path.join(dataset_path, x["dataset"], "wavs_extracted", x["X_item"])
TGT_audio_sample = TGT_func(sample_data)
OTH_audio_sample = OTH_func(sample_data)
X_audio_sample = X_func(sample_data)
print("Sample A:")
display(Audio(filename=TGT_audio_sample,rate=16000))
print("Sample B:")
display(Audio(filename=OTH_audio_sample,rate=16000))
print("Sample X:")
display(Audio(filename=X_audio_sample,rate=16000))
Sample A:
Sample B:
Sample X:
A and X sound the same, /atɑ/; the middle one sounds like /afɑ/. Now let's look at the experimental results corresponding to these stimuli.
human_and_models_filename = "humans_and_models/file_data.csv" # opensource
human_and_models_data = read_csv(human_and_models_filename)
print(f"Number of Human Behavioural Experiments:{len(human_and_models_data)}")
for i in human_and_models_data:
if 'triplet_83_B' == i["triplet_id"]:
print("Data Sample (triplet_83_B):")
for name, value in i.items():
print(f"\t{name}:\t\t\t\t{value}")
break
Number of Human Behavioural Experiments:87631 Data Sample (triplet_83_B): : 0 subject_id: X_IHTYDKWOUB subject_language: EN triplet_id: triplet_83_B TGT_item: amelia_consonants_10.wav OTH_item: amelia_consonants_28.wav X_item: ewan_58.wav speaker_TGT: amelia speaker_OTH: amelia speaker_X: ewan language_TGT: EN language_OTH: EN language_X: EN TGT_first: False user_ans: 1 bin_user_ans: 1 phone_TGT: t phone_OTH: f phone_X: t prev_phone: a next_phone: ɑ context: a_ɑ nb_stimuli: 0 dataset: pilot-aug-2018 deepspeech_englishtxt_rnn4: 0.15936934043274648 wav2vec_english_transf4: 0.12251163941376364 deepspeech_frenchtxt_rnn4: 0.1629706981896903 cpc_french_AR: 0.05872012213169375 wav2vec_french_transf4: 0.04093575305274538 hubert_french_transf_5: 0.04855605986577621 cpc_audioset_AR: -0.020788736911799943 mfccs_la: -0.0025078723330559174 cpc_english_AR: 0.050190139536198775 hubert_english_transf_5: 0.08956584890180974 wav2vec_audioset_transf4: 0.0319468904371876 hubert_audioset_transf_5: 0.04221348163325256 deepspeech_french_rnn4: 0.18394535542149 deepspeech_english_rnn4: 0.18102908468174178
These stimuli produced 87631 behavioral trials. We also looked up the first behavioral result for the triplet we just examined. Compared to before, we get more information:
- prev_phone and next_phone pin down what we just heard.
- prev_phone and next_phone together form the context.
- user_ans is 1, meaning the listener chose the first sound.
In addition, the test covers 662 phone contrasts, with 259 French-speaking and 280 English-speaking participants. Note that the output above also contains many long floating-point numbers; you can ignore them for now.
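As another quick sanity check (my own addition, using the subject_id and subject_language fields shown in the sample above), the participant counts can be recovered from the behavioral data:
fr_subjects = {d["subject_id"] for d in human_and_models_data if d["subject_language"] == "FR"}
en_subjects = {d["subject_id"] for d in human_and_models_data if d["subject_language"] == "EN"}
print(f"French participants: {len(fr_subjects)}, English participants: {len(en_subjects)}")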
Self-supervised model speech perception¶
Now let's have the self-supervised models perceive these stimuli. Here is a convenient way to use these powerful speech models, taking wav2vec2 as an example. As you can see, loading the model takes only two lines of code. This model was pretrained and fine-tuned on 960 hours of Librispeech sampled at 16 kHz, so we can treat it as representing a native English listener.
from transformers import Wav2Vec2Processor, Wav2Vec2ForCTC
import soundfile as sf
# load model and tokenizer
processor = Wav2Vec2Processor.from_pretrained("facebook/wav2vec2-base-960h")
model = Wav2Vec2ForCTC.from_pretrained("facebook/wav2vec2-base-960h").eval()
# read target audio with soundfiles
TGT_audio_data, sr = sf.read(TGT_audio_sample)
# tokenize
input_values = processor(TGT_audio_data, return_tensors="pt", padding="longest").input_values # Batch size 1
# retrieve model output including logits and hidden representation
output = model(input_values, output_hidden_states=True)
Ignored unknown kwarg option normalize Ignored unknown kwarg option normalize Ignored unknown kwarg option normalize Ignored unknown kwarg option normalize
Some weights of Wav2Vec2ForCTC were not initialized from the model checkpoint at facebook/wav2vec2-base-960h and are newly initialized: ['wav2vec2.masked_spec_embed'] You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference. It is strongly recommended to pass the ``sampling_rate`` argument to this function. Failing to do so can result in silent errors that might be hard to debug.
The model has perceived TGT_audio_sample, the /ata/ we listened to earlier. Let's see what wav2vec2 heard it as.
logits = output.logits
# take argmax and decode
predicted_ids = torch.argmax(logits, dim=-1)
transcription = processor.batch_decode(predicted_ids)
print(f"transcription:{transcription}")
transcription:['AHTA']
/AHTA/: that looks quite good, close to what we heard. In general, when studying a deep neural network's representation space, we probe a specific internal layer. wav2vec2 is an encoder-only model, and its layers can be inspected with the following command:
print(model.wav2vec2.encoder.layers)
# hidden state
print(f"# of Layers:{len(output.hidden_states)}")
print(f"Shape of each Layer:{output.hidden_states[0].shape}")
ModuleList( (0-11): 12 x Wav2Vec2EncoderLayer( (attention): Wav2Vec2Attention( (k_proj): Linear(in_features=768, out_features=768, bias=True) (v_proj): Linear(in_features=768, out_features=768, bias=True) (q_proj): Linear(in_features=768, out_features=768, bias=True) (out_proj): Linear(in_features=768, out_features=768, bias=True) ) (dropout): Dropout(p=0.1, inplace=False) (layer_norm): LayerNorm((768,), eps=1e-05, elementwise_affine=True) (feed_forward): Wav2Vec2FeedForward( (intermediate_dropout): Dropout(p=0.1, inplace=False) (intermediate_dense): Linear(in_features=768, out_features=3072, bias=True) (intermediate_act_fn): GELUActivation() (output_dense): Linear(in_features=3072, out_features=768, bias=True) (output_dropout): Dropout(p=0.1, inplace=False) ) (final_layer_norm): LayerNorm((768,), eps=1e-05, elementwise_affine=True) ) ) # of Layers:13 Shape of each Layer:torch.Size([1, 31, 768])
wav2vec2 stacks 12 Wav2Vec2EncoderLayer modules; here the output of one of the intermediate layers is selected for the analysis (hidden_states[6] in the code below).
For TGT_audio_sample, the self-supervised model wav2vec2 produces a representation with 31 time steps and 768 dimensions per step. To get perception results more quickly, we wrap these operations into a function; a function for the common MFCC features is also shown below.
def get_wav2vec_representation(audio_sample, layer=-1):
audio_data, sr = sf.read(audio_sample)
input_values = processor(audio_data, sampling_rate=16000, return_tensors="pt", padding="longest").input_values # Batch size 1
output = model(input_values, output_hidden_states=True)
return output.hidden_states[layer].squeeze().detach()
import librosa
def get_mfcc_representation(filename,layer=-1):
y, sr = librosa.load(filename)
spect = librosa.feature.mfcc(
y=y,
sr=16000,
n_mfcc=13,
win_length=int(0.025 * sr),
hop_length=int(0.010 * sr),
)
spect = spect.T
return spect
TGT = get_wav2vec_representation(TGT_audio_sample, layer=6)
OTH = get_wav2vec_representation(OTH_audio_sample, layer=6)
X = get_wav2vec_representation(X_audio_sample, layer=6)
TGT.shape, OTH.shape, X.shape
(torch.Size([31, 768]), torch.Size([34, 768]), torch.Size([25, 768]))
Recall how the human perception experiment works:
Given an A/B/X triplet, participants are asked to indicate which of A or B is most similar to X.
This is easy for humans, but a machine cannot directly tell us which one is similar to X. Given an ABX triplet, we now have three representations:
- A: [31, 768]
- B: [34, 768]
- X: [25, 768]
We need to design an objective measure over these three representations that reflects "which one is more similar to X". A direct metric such as squared error will not work, because the representations have different time lengths. One way to handle this is dynamic time warping (DTW).
Image from: Rakthanmanon et al. “Searching and Mining Trillions of Time Series Subsequences under Dynamic Time Warping”, Figure 3.
For intuition, think of the representations as one-dimensional features. Representation A corresponds to Q, the red waveform, and representation X to C, the blue curve. Since A and X have the same content, /ata/, they are very similar, but temporal shifts mean they may have different numbers of data points (31 vs. 25). Even with the same number of points, a point-by-point comparison, as in the upper-left panel, would give a large distance, making them look "very different".
DTW aligns them nicely, as shown in the lower-left panel, by finding the corresponding time points. The distance after this alignment is a much better measure of how different the two really are; the right panel shows the temporal correspondence. It is then natural to define:
$$ \Delta = \mathrm{DTW}(M_{other}, M_X) - \mathrm{DTW}(M_{target}, M_X)$$
where DTW is the distance obtained by dynamic time warping, aggregating frame-level cosine distances along the warping path. The larger (more positive) $\Delta$ is, the better the model distinguishes the target phone category from the other one.
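For reference, here is a minimal sketch of what a DTW-based distance of this kind involves, assuming frame-level cosine distances aggregated along the optimal warping path and, for norm_div=True, a simple normalization by the number of frames; the actual compute_dtw imported below from the local dtw_experiment module may differ in its details.
import numpy as np
def dtw_cosine_distance(a, b, norm_div=True):
    """Minimal DTW sketch: aggregate frame-level cosine distances between
    representations a (Ta, D) and b (Tb, D) along the optimal warping path."""
    a, b = np.asarray(a, dtype=float), np.asarray(b, dtype=float)
    an = a / (np.linalg.norm(a, axis=1, keepdims=True) + 1e-12)
    bn = b / (np.linalg.norm(b, axis=1, keepdims=True) + 1e-12)
    cost = 1.0 - an @ bn.T                              # pairwise cosine distances (Ta, Tb)
    Ta, Tb = cost.shape
    acc = np.full((Ta + 1, Tb + 1), np.inf)
    acc[0, 0] = 0.0
    for i in range(1, Ta + 1):
        for j in range(1, Tb + 1):
            acc[i, j] = cost[i - 1, j - 1] + min(acc[i - 1, j], acc[i, j - 1], acc[i - 1, j - 1])
    total = acc[Ta, Tb]
    return total / (Ta + Tb) if norm_div else total     # crude length normalization (assumption)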
from dtw_experiment import compute_dtw
TGT = get_wav2vec_representation(TGT_audio_sample, layer=6)
OTH = get_wav2vec_representation(OTH_audio_sample, layer=6)
X = get_wav2vec_representation(X_audio_sample, layer=6)
distance = "cosine"
TGTX = compute_dtw(TGT, X, distance, norm_div=True)
OTHX = compute_dtw(OTH, X, distance, norm_div=True)
wav2vec_delta = OTHX - TGTX
wav2vec_delta
0.07953007742150814
Using the formula above, we obtain the self-supervised model's perception result: at the level of $\Delta$, wav2vec2 judges A to be the one similar to X. Remember all those long numbers in the behavioral results for this stimulus? Those are exactly the speech models' scores.
hubert_english_transf_5: 0.08956584890180974
cpc_french_AR: 0.05872012213169375
Judging from this single case, wav2vec2's discrimination level lies between these two models (admittedly not a rigorous statement).
from dtw_experiment import compute_dtw
TGT = get_mfcc_representation(TGT_audio_sample)
OTH = get_mfcc_representation(OTH_audio_sample)
X = get_mfcc_representation(X_audio_sample)
distance = "cosine"
TGTX = compute_dtw(TGT, X, distance, norm_div=True)
OTHX = compute_dtw(OTH, X, distance, norm_div=True)
mfcc_delta = OTHX - TGTX
mfcc_delta
-0.0028166435984554386
As a side check, we also compute the MFCC result; this value is almost identical to the published one, which supports the reliability of this code.
Furthermore, Whisper, arguably the strongest speech recognition model available at the moment, is also brought into the comparison.
from transformers import WhisperProcessor, WhisperForConditionalGeneration
import soundfile as sf
processor = WhisperProcessor.from_pretrained("openai/whisper-large-v2")
model = WhisperForConditionalGeneration.from_pretrained("openai/whisper-large-v2")
model.config.forced_decoder_ids = None
sample = TGT_audio_sample
audio_data, sr = sf.read(sample)
input_features = processor(audio_data, sampling_rate=16000, return_tensors="pt").input_features
# generate token ids
predicted_ids = model.generate(inputs=input_features, output_hidden_states=True)
# decode token ids to text
transcription = processor.batch_decode(predicted_ids, skip_special_tokens=False)
['<|startoftranscript|><|en|><|transcribe|><|notimestamps|> .<|endoftext|>']
transcription = processor.batch_decode(predicted_ids, skip_special_tokens=True)
transcription
[' ATA']
def get_whiser_representation(audio_sample, layer=-1):
audio_data, sr = sf.read(audio_sample)
input_features_nopad = processor(audio_data, sampling_rate=16000, padding="do_not_pad", return_tensors="pt").input_features
input_features = processor(audio_data, sampling_rate=16000, return_tensors="pt").input_features
decoder_input_ids = torch.tensor([[1, 1]]) * model.config.decoder_start_token_id
output = model(input_features, output_hidden_states=True, decoder_input_ids=decoder_input_ids)
return output.encoder_last_hidden_state[:,:input_features_nopad[0].shape[1],:].squeeze().detach()
from dtw_experiment import compute_dtw
TGT = get_whiser_representation(TGT_audio_sample)
OTH = get_whiser_representation(OTH_audio_sample)
X = get_whiser_representation(X_audio_sample)
distance = "cosine"
TGTX = compute_dtw(TGT, X, distance, norm_div=True)
OTHX = compute_dtw(OTH, X, distance, norm_div=True)
whisper_delta = OTHX - TGTX
whisper_delta
-0.018564086530874824
This value does not look particularly impressive. Since the full computation would take more than 24 hours, I have set this part aside for now. Exploring how Whisper perceives speech would be very interesting, because it is not a self-supervised model: its superior performance suggests it may have developed its own processing strategy, or it may have arrived on its own at something resembling the self-supervised perceptual mode.
Let's get back to the main thread. Now that we know how to have a speech model perceive each stimulus, computing the results over the whole dataset is just a matter of looping.
from dtw_experiment import compute_dtw
def get_delta(
TGT_audio_sample,
OTH_audio_sample,
X_audio_sample,
func):
TGT = func(TGT_audio_sample)
OTH = func(OTH_audio_sample)
X = func(X_audio_sample)
distance = "cosine"
TGTX = compute_dtw(TGT, X, distance, norm_div=True)
OTHX = compute_dtw(OTH, X, distance, norm_div=True)
delta = OTHX - TGTX
return delta
triplets_data = read_csv(filename_triplet_list)
triplets_result = {}
func = get_mfcc_representation
for triplet_item in tqdm.tqdm(triplets_data, desc="Computing delta values...."):
TGT_audio_sample = TGT_func(triplet_item)
OTH_audio_sample = OTH_func(triplet_item)
X_audio_sample = X_func(triplet_item)
triplets_result[triplet_item["triplet_id"]] = get_delta(TGT_audio_sample,OTH_audio_sample,X_audio_sample,func)
for item in human_and_models_data:
#item[model_name] = triplets_result[item["triplet_id"]]
for k in item:
if k in ["TGT_first", "subject_id", "dataset"]: continue
try:
item[k] = float(item[k])
except:
pass
Comparing humans and self-supervised models on speech perception¶
Two metrics are used to evaluate how well a self-supervised model's representation space matches the human perceptual space for speech (see Paper Sec. 5.2):
the log-likelihood
the Spearman correlation (ρ)
The log-likelihood¶
For each self-supervised model we have computed $\Delta$ values. If these $\Delta$ values predict the choices made in the human behavioral experiment well, the model predicts human performance well, and its representation space is more similar to the participants' perceptual space.
Because the participants' answers are discrete (correct or incorrect), probit regression is used to fit the binary responses.
1. Explanation: why not ordinary linear regression? Suppose we try to use linear regression directly. Let the dependent variable encode whether the listener's answer was correct:
$$ \begin{equation} y = \left\{ \begin{array}{lr} 1 & \mathrm{Correct} \\ 0 & \mathrm{Incorrect} \end{array} \right. \end{equation} $$
Denote the factors that influence $y$ by $x=(x_1,x_2,...,x_n)$ (for simplicity, individual samples are not indexed). The multiple regression (linear probability model) is
$$ \begin{equation} y = \beta_0 + \beta_1 x_1 + \beta_2 x_2 + ... + \beta_n x_n + \epsilon \end{equation} $$
where $(\beta_1,\beta_2,...,\beta_n)^T=\boldsymbol \beta$ are the parameters and $\epsilon$ is the error term. In vector form,
$$ \begin{equation} y = \beta_0 + \boldsymbol \beta^T \mathbf x + \epsilon \end{equation} $$
By the zero conditional mean assumption on the error term, $\mathbb{E}(\epsilon|\mathbf x)=0$, we have
$$ \begin{equation} \mathbb{E}(y|\mathbf x) = \beta_0 + \beta_1 x_1 + \beta_2 x_2 + ... + \beta_n x_n \end{equation} $$
Writing $\hat{y}_i$ for the predicted value, the corresponding loss function is
$$ \begin{equation} Q = \sum_{i=1}^n(y_i - \hat{y}_i)^2 = \sum_{i=1}^n\big(y_i - (\beta_0 + \beta_1 x_{i1} + \beta_2 x_{i2} + ... + \beta_n x_{in})\big)^2 \end{equation} $$
Estimating the parameters that minimize this loss gives $(\hat{\beta_0},\hat{\beta_1},\hat{\beta_2},...,\hat{\beta_n})$.
Since $y$ only takes the values 0 or 1, write $p$ for the probability that $y=1$, so $1-p$ is the probability that $y=0$. Then
$$ \begin{align} \mathbb{E}(y) & = 0 \times (1-p) + 1 \times p \\ & = p \\ & = P(y=1|\mathbf x) \\ & = \hat{y} \\ & = \hat{\beta_0} + \hat{\beta_1} x_1 + \hat{\beta_2} x_2 + ... + \hat{\beta_n} x_n \end{align} $$
It appears that, holding the other factors fixed, $\beta_i$ measures how a change in $x_i$ affects the probability $P(y=1)$ that the participant answers correctly:
$$ \begin{equation} \Delta P(y=1|\mathbf x) = \beta_i \Delta x_i \end{equation} $$
That is, in this model $\beta_i$ means that when $x_i$ increases by one unit, the expected value of $y$ increases by $\beta_i$ units, independently of the other variables. The effect is constant and does not vary with $x$.
However, this assumption is unrealistic: $y$ is a binary variable whose expectation is a probability in $[0,1]$, not a value on the whole real line. If $x$ is very large or very small, the linear probability model can predict values of $y$ outside $[0,1]$, which is meaningless. In other words, modeling a binary dependent variable directly with ordinary linear regression is problematic (and there are other issues as well).
2. Explanation: what is binary probit regression?
Since the LPM cannot guarantee that the predicted $y$ lies in $[0,1]$, we can map the linear predictor through a transformation $\Phi$. A classic choice is the standard normal CDF:
$$P(y=1|\mathbf x) = \Phi (\mathbf x \beta) = \int_{-\infty}^{\mathbf x \beta} \frac{1}{\sqrt{2\pi}} \exp\left(-\frac{z^2}{2}\right)dz\tag{1}$$
This is the probit model. To estimate the parameters $\beta$, we use maximum likelihood and construct the likelihood function:
$$ L(\beta)=\prod_{i=1}^{n}P(y_i=1|\mathbf x_{i})^{y_i} [1-P(y_i=1|\mathbf x_{i})]^{1-y_i}\tag{2}$$
Substituting equation (1) into equation (2) gives:
$$L(\beta)=\prod_{i=1}^{n}[\Phi(\mathbf x_{i}\beta)]^{y_i} [1-\Phi(\mathbf x_{i}\beta)]^{1-y_i}\tag{3}$$
For convenience, we take the logarithm to obtain the log-likelihood:
$$ \ln L(\beta)=\sum_{i=1}^{n}[y_i\ln \Phi(\mathbf x_{i} \beta)+(1-y_i)\ln(1-\Phi(\mathbf x_{i}\beta))]\tag{4}$$
Our goal is to find the $\beta$ that maximizes the log-likelihood, i.e. the maximum likelihood estimate. Since the log-likelihood has no closed-form solution, it has to be maximized numerically, for example with Newton or quasi-Newton methods.
Once the parameters are estimated, the log-likelihood over the sample set measures how well the model predicts the experimental data: the better the parameters explain the variation in the data, the larger the log-likelihood.
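As a tiny numerical illustration of equation (4) (all numbers below are made up for the example), the probit log-likelihood can be evaluated directly with the standard normal CDF:
import numpy as np
from scipy.stats import norm
y = np.array([1, 0, 1, 1, 0])                                  # binary responses (toy data)
X = np.column_stack([np.ones(5),                               # global intercept
                     [0.12, -0.02, 0.08, 0.16, -0.05]])        # e.g. a model's delta values
beta = np.array([0.4, 3.0])                                    # candidate coefficients
p = norm.cdf(X @ beta)                                         # Phi(x_i beta), equation (1)
loglike = np.sum(y * np.log(p) + (1 - y) * np.log(1 - p))      # equation (4)
print(loglike)                                                 # larger (less negative) is better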
The $\Delta$ values computed from the model representations are used as the predictor of interest. Besides a global intercept, the regression also includes the following predictors:
- whether the correct answer was A (1) or B (0);
- the position of the trial in the experimental list;
- a categorical predictor for the participant;
- a categorical predictor for the Perceptimatic subset the result belongs to.
In practice, some variables have to be mapped into $[0,1]$ and standardized. The log-likelihood is obtained from the fitted regression model: the larger the log-likelihood (the less negative), the better the given model's $\Delta$ values predict the experimental data; in other words, the better the model predicts human performance, and the more similar its representation space is to the participants' perceptual space. It can also be used to compare different models and see which one fits the data better.
from multiprocessing import Pool
import pandas as pd
from statsmodels.formula.api import probit
from sampling import get_dico_corres_file, sample_lines
model_name = "wav2vec_audioset_transf4"
dico_lines = get_dico_corres_file(human_and_models_filename, french=False, english = True)
lines_sampled = sample_lines(dico_lines)
data_ = pd.DataFrame(human_and_models_data)
data_['bin_user_ans'] = (data_['bin_user_ans'] + 1.) / 2 # we transform -1 1 into 0 1
data_['TGT_first'] = data_['TGT_first'].astype(bool)
data_['TGT_first_code'] = data_['TGT_first'].astype(int)
data = data_.iloc[lines_sampled]
data_worker = data.copy()
# normalize data
for val in ['nb_stimuli', model_name]:
data_worker[val] = (data[val] -data[val].mean())/data[val].std()
Python's statsmodels library provides a convenient probit API. The code uses the $\Delta$ values of "wav2vec_audioset_transf4" together with three of the predictors mentioned above (participant, Perceptimatic subset, and trial order) as predictor variables.
# we create the probit model
model_probit = probit("bin_user_ans ~ C(subject_id) + C(dataset) + nb_stimuli + " + model_name, data_worker)
result_probit = model_probit.fit_regularized(max_iter=200, disp=True)
loglikehood = model_probit.loglike(result_probit.params)
Optimization terminated successfully (Exit mode 0) Current function value: 0.5095092161564122 Iterations: 644 Function evaluations: 644 Gradient evaluations: 644
result_probit.summary()
Dep. Variable: | bin_user_ans | No. Observations: | 42305 |
---|---|---|---|
Model: | Probit | Df Residuals: | 42019 |
Method: | MLE | Df Model: | 285 |
Date: | Tue, 07 Nov 2023 | Pseudo R-squ.: | 0.06204 |
Time: | 11:10:08 | Log-Likelihood: | -21555. |
converged: | True | LL-Null: | -22981. |
Covariance Type: | nonrobust | LLR p-value: | 0.000 |
In the results we can see that the Log-Likelihood is about -21,555. This number has no very intuitive interpretation on its own, so let's look at how the other models do and at the results reported in the paper. The precomputed results can be loaded directly.
# pre calculated result
result_filename = "results.csv"
with open(result_filename, "r") as f:
lines = f.readlines()
idxs = lines[0].strip().split(",")[1:]
result_data = []
for line in lines[1:]: result_data.append([float(_) for _ in line.strip().split(",")][1:])
mean = np.average(result_data, axis=0)
variance = np.var(result_data, axis=0)
x = np.arange(len(idxs))
rank_data = [[i,j] for i,j in zip(mean, idxs)]
rank_data = sorted(rank_data, key=lambda x : -x[0])
mean, idxs = [round(_[0]) for _ in rank_data], [_[1] for _ in rank_data]
from pyecharts.globals import CurrentConfig, NotebookType
CurrentConfig.NOTEBOOK_TYPE = NotebookType.JUPYTER_LAB
from pyecharts.charts import Bar
from pyecharts import options as opts
bar = (
Bar(init_opts=opts.InitOpts())
.add_xaxis(idxs)
.add_yaxis("Log-likelihood values (shorter bars are better)", mean)
.set_global_opts(title_opts=opts.TitleOpts(
title="Log-likelihood values", subtitle="For English participants"),
xaxis_opts=opts.AxisOpts(axislabel_opts=opts.LabelOpts(rotate=-90)),
yaxis_opts=opts.AxisOpts(min_=-21700, max_=-20000)
)
)
bar.load_javascript()
bar.render("bar_chart_1.html")
'/ssd9/exec/penglinkai/brain_sci/Sel_supervised_models_perception_biases/bar_chart_1.html'
from IPython.display import IFrame
IFrame(src='./bar_chart_1.html', width=1000, height=600)
We can see that, for English participants, models trained on English data show a better match. This supports the claim that "self-supervised models trained on English data develop English language biases similar to those of humans."
The Spearman correlation (ρ)¶
This part computes the Spearman correlation (ρ) between the models' ∆ values and the participants' accuracy, at the level of each phone contrast. Concretely:
import pandas as pd
from scipy.stats import spearmanr
data = pd.read_csv(human_and_models_filename)
dico_lines_french = get_dico_corres_file(human_and_models_filename, french=True, english=False)
dico_lines_english = get_dico_corres_file(human_and_models_filename, french=False, english=True)
list_sampled_english = sample_lines(dico_lines_english)
dff = data.iloc[list_sampled_english]
value_evaluated = "wav2vec_audioset_transf4"
# We get only what we need
dff = dff[[
'triplet_id',
'phone_TGT',
'phone_OTH',
'prev_phone',
'next_phone',
'language_OTH',
'language_TGT',
'dataset',
'user_ans',
value_evaluated]]
# We adapt to some dataset that have a -3 / 3 scale
dff.loc[dff['dataset'] == "WorldVowels", ['user_ans']] = dff.loc[dff['dataset'] == "WorldVowels", ['user_ans']] / 3.
dff.loc[dff['dataset'] == "zerospeech", ['user_ans']] = dff.loc[dff['dataset'] == "zerospeech", ['user_ans']] / 3.
This code selects triplet_id, phone_TGT, phone_OTH, prev_phone, next_phone, language_OTH, language_TGT, dataset and user_ans, which will serve as the grouping keys.
# We average over triplet first
gf = dff.groupby([
'triplet_id',
'phone_TGT',
'phone_OTH',
'prev_phone',
'next_phone',
'language_OTH',
'language_TGT',
'dataset'], as_index = False)
ans_fr = gf.user_ans.mean()
val_fr = gf[value_evaluated].mean()
ans_fr[value_evaluated] = val_fr[value_evaluated]
len(list(gf)),list(gf)[0]
(8461, (('BR_TRIP10222_0', 'e', 'ĩ', 's', 'k', 'BR', 'BR', 'WorldVowels'), triplet_id phone_TGT phone_OTH prev_phone next_phone language_OTH \ 26129 BR_TRIP10222_0 e ĩ s k BR 29265 BR_TRIP10222_0 e ĩ s k BR 29265 BR_TRIP10222_0 e ĩ s k BR 26129 BR_TRIP10222_0 e ĩ s k BR 28658 BR_TRIP10222_0 e ĩ s k BR language_TGT dataset user_ans wav2vec_audioset_transf4 26129 BR WorldVowels -0.333333 0.031396 29265 BR WorldVowels 0.666667 0.031396 29265 BR WorldVowels 0.666667 0.031396 26129 BR WorldVowels -0.333333 0.031396 28658 BR WorldVowels 1.000000 0.031396 ))
len(list(gf)) shows the grouping result: there are 8461 distinct groups. list(gf)[0] displays a few trials belonging to the first group. The data are averaged over user_ans and value_evaluated (wav2vec_audioset_transf4); the result is shown below.
ans_fr
triplet_id | phone_TGT | phone_OTH | prev_phone | next_phone | language_OTH | language_TGT | dataset | user_ans | wav2vec_audioset_transf4 | |
---|---|---|---|---|---|---|---|---|---|---|
0 | BR_TRIP10222_0 | e | ĩ | s | k | BR | BR | WorldVowels | 3.333333e-01 | 0.031396 |
1 | BR_TRIP10222_1 | e | ĩ | s | k | BR | BR | WorldVowels | 2.000000e-01 | 0.031396 |
2 | BR_TRIP10726_0 | i | ɛ | d | s | BR | BR | WorldVowels | 9.333333e-01 | 0.054557 |
3 | BR_TRIP10726_1 | i | ɛ | d | s | BR | BR | WorldVowels | 9.333333e-01 | 0.054557 |
4 | BR_TRIP10775_0 | i | e | d | s | BR | BR | WorldVowels | -6.666667e-02 | 0.019795 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
8456 | triplet_FR995 | y | l | t | e | FR | FR | zerospeech | 4.440892e-17 | 0.034653 |
8457 | triplet_FR996 | y | l | t | e | FR | FR | zerospeech | 4.440892e-17 | 0.034653 |
8458 | triplet_FR997 | y | l | p | d | FR | FR | zerospeech | -6.666667e-02 | -0.024223 |
8459 | triplet_FR998 | y | l | p | d | FR | FR | zerospeech | 2.666667e-01 | -0.024223 |
8460 | triplet_FR999 | y | l | s | i | FR | FR | zerospeech | 7.333333e-01 | 0.027158 |
8461 rows × 10 columns
At this step triplet_id is collapsed, and 'phone_TGT', 'phone_OTH', 'prev_phone', 'next_phone', 'language_OTH', 'language_TGT' and 'dataset' are used as the grouping keys.
# Then we average over context
gf = ans_fr.groupby([
'phone_TGT',
'phone_OTH',
'prev_phone',
'next_phone',
'language_OTH',
'language_TGT',
'dataset'], as_index = False)
ans_fr = gf.user_ans.mean()
val_fr = gf[value_evaluated].mean()
ans_fr[value_evaluated] = val_fr[value_evaluated]
len(list(gf)),list(gf)[0]
(3401, (('a', 'aː', 'f', 'f', 'GL', 'GL', 'WorldVowels'), triplet_id phone_TGT phone_OTH prev_phone next_phone language_OTH \ 1998 GL_TRIP1548_0 a aː f f GL 1999 GL_TRIP1548_1 a aː f f GL 2026 GL_TRIP2165_0 a aː f f GL 2027 GL_TRIP2165_1 a aː f f GL 2114 GL_TRIP945_0 a aː f f GL 2115 GL_TRIP945_1 a aː f f GL language_TGT dataset user_ans wav2vec_audioset_transf4 1998 GL WorldVowels 0.866667 -0.005935 1999 GL WorldVowels 0.733333 -0.005935 2026 GL WorldVowels 0.266667 0.062786 2027 GL WorldVowels 0.133333 0.062786 2114 GL WorldVowels 0.533333 0.020428 2115 GL WorldVowels 0.600000 0.020428 ))
len(list(gf)) shows that there are now 3401 distinct groups. list(gf)[0] displays a few results belonging to the first group. The data are again averaged over user_ans and value_evaluated (wav2vec_audioset_transf4); the result is shown below.
ans_fr
phone_TGT | phone_OTH | prev_phone | next_phone | language_OTH | language_TGT | dataset | user_ans | wav2vec_audioset_transf4 | |
---|---|---|---|---|---|---|---|---|---|
0 | a | aː | f | f | GL | GL | WorldVowels | 0.522222 | 0.025759 |
1 | a | aː | g | g | GL | GL | WorldVowels | 0.322222 | 0.044892 |
2 | a | aː | p | p | GL | GL | WorldVowels | 0.200000 | -0.005515 |
3 | a | e | d | k | FR | FR | zerospeech | -0.066667 | 0.011050 |
4 | a | e | i | d | FR | FR | zerospeech | 0.500000 | 0.054048 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
3396 | θ | s | a | ɑ | EN | EN | pilot-aug-2018 | -0.600000 | 0.009272 |
3397 | θ | s | i | i | EN | EN | pilot-aug-2018 | 1.000000 | 0.017195 |
3398 | θ | t | i | i | EN | EN | pilot-aug-2018 | 0.400000 | 0.055567 |
3399 | θ | ʃ | a | ɑ | EN | EN | pilot-aug-2018 | 1.000000 | 0.010789 |
3400 | θ | ʃ | i | i | EN | EN | pilot-aug-2018 | 0.200000 | 0.011312 |
3401 rows × 9 columns
Next, 'prev_phone' and 'next_phone' are collapsed as well, focusing on the phone contrast itself.
# then we average over phone contrast
gf = ans_fr.groupby([
'phone_TGT',
'phone_OTH',
'language_OTH',
'language_TGT',
'dataset'], as_index=False)
ans_fr = gf.user_ans.mean()
val_fr = gf[value_evaluated].mean()
ans_fr[value_evaluated] = val_fr[value_evaluated]
len(list(gf)),list(gf)[0]
(1285, (('a', 'aː', 'GL', 'GL', 'WorldVowels'), phone_TGT phone_OTH prev_phone next_phone language_OTH language_TGT \ 0 a aː f f GL GL 1 a aː g g GL GL 2 a aː p p GL GL dataset user_ans wav2vec_audioset_transf4 0 WorldVowels 0.522222 0.025759 1 WorldVowels 0.322222 0.044892 2 WorldVowels 0.200000 -0.005515 ))
ans_fr
phone_TGT | phone_OTH | language_OTH | language_TGT | dataset | user_ans | wav2vec_audioset_transf4 | |
---|---|---|---|---|---|---|---|
0 | a | aː | GL | GL | WorldVowels | 0.348148 | 0.021712 |
1 | a | e | FR | FR | zerospeech | 0.311111 | 0.036211 |
2 | a | i | EN_DR2 | EN_DR2 | pilot-july-2018 | 0.466667 | 0.033544 |
3 | a | i | EN_DR3 | EN_DR3 | pilot-july-2018 | 0.200000 | 0.073582 |
4 | a | i | EN_DR7 | EN_DR7 | pilot-july-2018 | 0.200000 | 0.090170 |
... | ... | ... | ... | ... | ... | ... | ... |
1280 | θ | k | EN | EN | pilot-aug-2018 | 0.600000 | 0.003462 |
1281 | θ | p | EN | EN | pilot-aug-2018 | 0.600000 | 0.023903 |
1282 | θ | s | EN | EN | pilot-aug-2018 | 0.200000 | 0.013234 |
1283 | θ | t | EN | EN | pilot-aug-2018 | 0.400000 | 0.055567 |
1284 | θ | ʃ | EN | EN | pilot-aug-2018 | 0.600000 | 0.011051 |
1285 rows × 7 columns
# then we average over order TGT-OTH or the other way around
res = ans_fr.copy()
res['phone_TGT'] = ans_fr['phone_OTH']
res['phone_OTH'] = ans_fr['phone_TGT']
res['language_OTH'] = ans_fr['language_TGT']
res['language_TGT'] = ans_fr['language_OTH']
total = pd.concat([ans_fr, res], axis=0)
gf = total.groupby(['phone_TGT', 'phone_OTH', 'language_OTH','language_TGT', 'dataset'], as_index=False)
ans_fr = gf.user_ans.mean()
val_fr = gf[value_evaluated].mean()
ans_fr[value_evaluated] = val_fr[value_evaluated]
ans_fr
phone_TGT | phone_OTH | language_OTH | language_TGT | dataset | user_ans | wav2vec_audioset_transf4 | |
---|---|---|---|---|---|---|---|
0 | a | aː | GL | GL | WorldVowels | 0.201852 | 0.003995 |
1 | a | e | FR | FR | zerospeech | 0.438889 | 0.009876 |
2 | a | i | EN_DR2 | EN_DR2 | pilot-july-2018 | 0.633333 | 0.053046 |
3 | a | i | EN_DR3 | EN_DR3 | pilot-july-2018 | 0.200000 | 0.073582 |
4 | a | i | EN_DR4 | EN_DR4 | pilot-july-2018 | 0.600000 | 0.068183 |
... | ... | ... | ... | ... | ... | ... | ... |
1319 | θ | k | EN | EN | pilot-aug-2018 | 0.700000 | 0.010990 |
1320 | θ | p | EN | EN | pilot-aug-2018 | 0.600000 | 0.018190 |
1321 | θ | s | EN | EN | pilot-aug-2018 | 0.600000 | 0.022778 |
1322 | θ | t | EN | EN | pilot-aug-2018 | 0.500000 | 0.038148 |
1323 | θ | ʃ | EN | EN | pilot-aug-2018 | 0.600000 | 0.038617 |
1324 rows × 7 columns
Finally, the TGT-OTH order is ignored, giving results for 1324 phone contrast pairs. We then compute the Spearman correlation between the model's $\Delta$ values and the participants' responses.
rho_fr, p_fr = spearmanr(ans_fr['user_ans'], ans_fr[value_evaluated])
print(value_evaluated, rho_fr, p_fr)
wav2vec_audioset_transf4 0.3842111077640824 8.016630361775731e-48
"0.227"这似乎不是很显著的结果,让我们将其他模型的结果放到一起看看情况
# pre calculated result
result_filename = "beginning_outfile_english.csv"
with open(result_filename, "r") as f:
lines = f.readlines()
idxs = lines[0].strip().split(",")[1:]
result_data = []
for line in lines[1:]: result_data.append([float(_) for _ in line.strip().split(",")][1:])
mean = [round(_, 3) for _ in np.average(result_data, axis=0)]
variance = np.var(result_data, axis=0)
x = np.arange(len(idxs))
rank_data = [[i,j] for i,j in zip(mean, idxs)]
rank_data = sorted(rank_data, key=lambda x : -x[0])
mean, idxs = [_[0] for _ in rank_data], [_[1] for _ in rank_data]
from pyecharts.globals import CurrentConfig, NotebookType
CurrentConfig.NOTEBOOK_TYPE = NotebookType.JUPYTER_LAB
from pyecharts.charts import Bar
from pyecharts import options as opts
bar = (
Bar(init_opts=opts.InitOpts())
.add_xaxis(idxs)
.add_yaxis("the Spearman correlation (ρ)", mean)
.set_global_opts(title_opts=opts.TitleOpts(
title="the Spearman correlation (ρ)"),
xaxis_opts=opts.AxisOpts(axislabel_opts=opts.LabelOpts(rotate=-90)),
yaxis_opts=opts.AxisOpts(min_=0.3, max_=0.6)
)
)
bar.load_javascript()
bar.render("bar_chart_2.html")
'/ssd9/exec/penglinkai/brain_sci/Sel_supervised_models_perception_biases/bar_chart_2.html'
from IPython.display import IFrame
IFrame(src='./bar_chart_2.html', width=1000, height=600)
The higher the Spearman correlation between a model's $\Delta$ values and the per-contrast human accuracy, the better the match. Here, the self-supervised models perform worse than the generic MFCC features, which suggests that at the level of phone contrasts, self-supervised models do not capture some component of human speech perception very well.
Native language effect¶
In the sections above we made a global measurement of how well a model's representation space matches human listeners' perceptual space. In this part we evaluate whether the models reproduce the native language biases shown by the human participants. Concretely, we ask whether models trained on French and English data show the same discrimination behavior as native French and English listeners, respectively. The overall procedure is:
- Each model's $\Delta$ values are normalized so that models can be compared with one another.
- For each phone contrast, the overall accuracy is computed.
- For each phone contrast, the mean $\Delta$ of the French-trained model minus the mean $\Delta$ of the English-trained model is computed; the same subtraction is applied to the accuracies of French and English participants. This difference can be seen as the native language effect for that contrast.
- For each model, we compute the Pearson correlation between its per-contrast native language effect and that of the human listeners.
The closer the correlation is to 1, the better the given model captures the phone-level native language effect.
data = pd.read_csv(human_and_models_filename, sep=',', encoding='utf-8')
dico_lines_french = get_dico_corres_file(human_and_models_filename, french=True, english=False)
dico_lines_english = get_dico_corres_file(human_and_models_filename, french=False, english=True)
list_sampled_french = sample_lines(dico_lines_french)
list_sampled_english = sample_lines(dico_lines_english)
lines_sampled = list_sampled_french + list_sampled_english
data = data.iloc[lines_sampled]
data.loc[data['dataset'] == "WorldVowels", ['user_ans']] = data.loc[data['dataset'] == "WorldVowels", ['user_ans']] / 3.
data.loc[data['dataset'] == "zerospeech", ['user_ans']] = data.loc[data['dataset'] == "zerospeech", ['user_ans']] / 3.
data_fr = data[data['subject_language'] == 'FR'].copy()
data_en = data[data['subject_language'] == 'EN'].copy()
dico_models = {'wav2vec_transf4':{'english':'wav2vec_english_transf4', 'french':'wav2vec_french_transf4'},
'hubert':{'english':'hubert_english_transf_5', 'french':'hubert_french_transf_5'},
'deepspeech_phon':{'english':'deepspeech_english_rnn4', 'french':'deepspeech_french_rnn4'},
'cpc':{'english':'cpc_english_AR', 'french':'cpc_french_AR'},
'deepspeech_txt': {'english': 'deepspeech_englishtxt_rnn4', 'french': 'deepspeech_frenchtxt_rnn4'},
}
values_comparison_english = [dico_models[modi]['english'] for modi in dico_models]
values_comparison_french = [dico_models[modi]['french'] for modi in dico_models]
# We normalize english and french side for the model so they are comparable
for i in range(len(values_comparison_english)):
data_fr[values_comparison_french[i]] = data_fr[values_comparison_french[i]] / data_fr[
values_comparison_french[i]].std()
data_en[values_comparison_english[i]] = data_en[values_comparison_english[i]] / data_en[
values_comparison_english[i]].std()
# Select data interested, same as above cells
data_en['contrast'] = data_en['phone_TGT'] + ';' + data_en['phone_OTH'] + ';' + data_en['language_OTH'] + ';' + data_en['language_TGT'] + ';' + data_en[ 'dataset']
data_fr['contrast'] = data_fr['phone_TGT'] + ';' + data_fr['phone_OTH'] + ';' + data_fr['language_OTH'] + ';' + data_fr['language_TGT'] + ';' + data_fr[ 'dataset']
# construct contrast data
data_fr_contrast = {}
data_en_contrast = {}
# assign a empty dict for later uses
for k in values_comparison_french + ['user_ans']:
data_fr_contrast[k] = {}
for k in values_comparison_english + ['user_ans']:
data_en_contrast[k] = {}
# loop each item, aggregate each contrast for each model
for idx in range(len(data_en)):
item = data_en.iloc[idx]
contrast = item['contrast'].replace('"', "")
for k in values_comparison_english + ['user_ans']:
delta = float(item[k])
data_en_contrast[k][contrast] = data_en_contrast[k].get(contrast, []) + [delta] # same as append()
for idx in range(len(data_fr)):
item = data_fr.iloc[idx]
contrast = item['contrast'].replace('"', "")
for k in values_comparison_french + ['user_ans']:
delta = float(item[k])
data_fr_contrast[k][contrast] = data_fr_contrast[k].get(contrast, []) + [delta] # same as append()
# sample
# {'wav2vec_english_transf4': {
# 'i;ʊ;EN_DR7;EN_DR7;pilot-july-2018': [0.7581879995638313,0.7581879995638313,0.7581879995638313],
# 'ʊ;ʌ;EN_DR7;EN_DR7;pilot-july-2018': [0.44245396646602203,0.32058457305888916,0.32058457305888916],
# 'a;u;EN_DR2;EN_DR2;pilot-july-2018': [0.3934868173149888,0.3934868173149888,0.3934868173149888],
# 'æ;ʊ;EN_DR5;EN_DR5;pilot-july-2018': [0.7740108224569093,0.7740108224569093,0.7740108224569093],
# ...
# we average the results
for k in values_comparison_english + ['user_ans']:
for cont in data_en_contrast[k]:
data_en_contrast[k][cont] = np.asarray(data_en_contrast[k][cont]).mean()
for k in values_comparison_french + ['user_ans']:
for cont in data_fr_contrast[k]:
data_fr_contrast[k][cont] = np.asarray(data_fr_contrast[k][cont]).mean()
# sample
# {'wav2vec_english_transf4': {
# 'i;ʊ;EN_DR7;EN_DR7;pilot-july-2018': 1.3776102894272364,
# 'ʊ;ʌ;EN_DR7;EN_DR7;pilot-july-2018': 0.44245396646602203,
# 'a;u;EN_DR2;EN_DR2;pilot-july-2018': 2.9639408282294744,
# 'æ;ʊ;EN_DR5;EN_DR5;pilot-july-2018': 0.7740108224569093,
At this point the per-contrast average $\Delta$ values have been computed; the native-language-effect quantities are computed below.
triplet_list = list(data_en_contrast[values_comparison_english[0]].keys())
diff_humans = []
diff_models = {}
for i in range(len(values_comparison_english)):
diff_models[values_comparison_english[i]] = []
triplet_done = []
diffs = []
for trip in triplet_list:
if trip in triplet_done:
continue
# we average on TGT-OTH OTH-TGT
other = trip.split(';')
other = ';'.join([other[1], other[0], other[3], other[2], other[4]])
triplet_done.append(other)
triplet_done.append(trip)
if trip in data_fr_contrast['user_ans'] and not trip in data_en_contrast['user_ans']:
print('ERROR triplet not test on eng', trip)
continue
elif trip not in data_fr_contrast['user_ans'] and trip in data_en_contrast[
'user_ans']:
print('ERROR triplet not test on fre', trip)
continue
elif trip not in data_fr_contrast['user_ans'] and trip not in data_en_contrast[
'user_ans']:
print('ERROR triplet not test on fre and on en', trip)
continue
val_fr_human = (data_fr_contrast['user_ans'][trip] + data_fr_contrast['user_ans'].get(other, data_fr_contrast['user_ans'][trip])) / 2.
val_en_human = (data_en_contrast['user_ans'][trip] + data_en_contrast['user_ans'].get(other,data_en_contrast['user_ans'][trip])) / 2.
diff_humans.append(val_fr_human - val_en_human)
for i in range(len(values_comparison_english)):
# average on TGT-OTH OTH-TGT
val_fr_model = (data_fr_contrast[values_comparison_french[i]][trip] +
data_fr_contrast[values_comparison_french[i]].get(other,data_fr_contrast[values_comparison_french[i]][trip])) / 2.
val_en_model = (data_en_contrast[values_comparison_english[i]][trip] +
data_en_contrast[values_comparison_english[i]].get(other,data_en_contrast[values_comparison_english[i]][trip])) / 2.
# core code
diff_models[values_comparison_english[i]].append(val_fr_model - val_en_model)
diffs += [np.asarray(diff_humans)]
for i in range(len(values_comparison_english)):
diffs += [diff_models[values_comparison_english[i]]]
We now compute the Pearson correlation between each model's per-contrast native language effect and that of the human listeners.
from scipy.stats import pearsonr
rs = []
diff_humans = diffs[0]
for i in range(len((dico_models.keys()))):
r, p = pearsonr(diffs[i+1], diff_humans)
rs.append(r)
print(r,p)
0.015723982716882613 0.6863389024697809 0.040789548947215024 0.29466542980729943 0.09660902151954646 0.01288942527914428 0.07633362519653619 0.049626580245979955 0.09889294029729122 0.010900056633808195
# pre calculated result
result_filename = "file_out.csv"
with open(result_filename, "r") as f:
lines = f.readlines()
idxs = lines[0].strip().split(",")[1:]
result_data = []
for line in lines[1:]: result_data.append([float(_) for _ in line.strip().split(",")][1:])
mean = np.mean(result_data, axis=0)
mean = [mean[_] for _ in [0,2,4,6,8]]
idxs = [idxs[_] for _ in [0,2,4,6,8]]
from pyecharts.globals import CurrentConfig, NotebookType
CurrentConfig.NOTEBOOK_TYPE = NotebookType.JUPYTER_LAB
from pyecharts.charts import Bar
from pyecharts import options as opts
bar = (
Bar(init_opts=opts.InitOpts())
.add_xaxis(idxs)
.add_yaxis("Native language effect", mean)
.set_global_opts(title_opts=opts.TitleOpts(
title="Native language effect"),
xaxis_opts=opts.AxisOpts(axislabel_opts=opts.LabelOpts(rotate=-90)),
yaxis_opts=opts.AxisOpts(min_=0.0, max_=0.6)
)
)
bar.load_javascript()
bar.render("bar_chart_4.html")
'/ssd9/exec/penglinkai/brain_sci/Sel_supervised_models_perception_biases/bar_chart_4.html'
from IPython.display import IFrame
IFrame(src='./bar_chart_4.html', width=1000, height=600)
The chart above shows the native language effect evaluated on the whole Perceptimatic dataset (see Figure 4 in the paper). cpc and deepspeech show a similar ability to capture the between-group differences, whereas hubert and wav2vec2 show no clear native language effect. This is roughly in line with expectations: speech models pretrained on large amounts of unlabeled data already acquire a strong, language-agnostic ability to discriminate speech sounds during pretraining. Compared with 60,000 hours of pretraining data, 600 hours of supervised fine-tuning probably has little influence on the first few transformer layers.
Results¶
Based on the experiments, the findings are as follows:
In discriminating speech sounds and predicting human discrimination behavior, self-supervised models trained on speech recordings outperform models trained on non-speech audio (acoustic scenes).
At the level of individual stimuli, self-supervised models predict human discrimination behavior well, but once human results are averaged per contrast, they do worse than neutral acoustic features.
Self-supervised models show very little effect of the training (native) language; that is, their predictions are largely unaffected by the language they were trained on.
These results suggest that self-supervised speech models can imitate human listeners' perceptual biases to some extent, but gaps remain. In particular, they are relatively weak at predicting the per-contrast trends of human discrimination behavior. Moreover, the training language has little influence on their predictive power, which suggests their speech representations are somewhat language-universal.
Part Two: Dissecting neural computations of the human auditory pathway using deep neural networks for speech¶
Part One examined the similarity between DNN representation spaces and human perception from the perspective of speech stimuli and behavioral experiments. In this part we work with recorded neural signals and analyze the relationship between the hierarchy of DNN layer representations and the hierarchical structure of the human auditory pathway.
Speech Stimuli¶
see Section: Experimental paradigm
In the experiment, participants were instructed to listen passively to continuous speech stimuli; no other task was performed during listening. The acoustic stimuli consisted of naturally spoken, continuous American English and Mandarin. The English stimuli came from the TIMIT corpus: 499 English sentences selected from TIMIT, read by 402 different speakers (286 male and 116 female), with 0.4 s of silence between sentences. The task was divided into five blocks, each lasting about 5 minutes.
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
from matplotlib.patches import Rectangle
%matplotlib inline
from scipy.stats import zscore, ttest_rel, f_oneway, ttest_ind
from scipy.signal import resample
from scipy.io import wavfile
from asccd import timit, preanalysis, erps, plotting, util
from asccd import temporal_receptive_field as trf
timit_annotation = timit.get_timit_annotations()
timit_annotation
time | pitch | intensity | log_hz | erb_rate | rel_pitch_global | rel_pitch_global_erb | abs_pitch | abs_pitch_erb | abs_pitch_change | ... | front | low | back | plosive | fricative | syllabic | nasal | voiced | obstruent | sonorant | ||
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
fadg0_si1279 | 0 | 0.01 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
1 | 0.02 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | |
2 | 0.03 | NaN | 32.27 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | |
3 | 0.04 | NaN | 32.68 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | |
4 | 0.05 | NaN | 30.99 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
mzmb0_si1796 | 186 | 1.87 | NaN | 33.69 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
187 | 1.88 | NaN | 32.45 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | |
188 | 1.89 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | |
189 | 1.90 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | |
190 | 1.91 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
101961 rows × 27 columns
timit_annotation.keys()
Index(['time', 'pitch', 'intensity', 'log_hz', 'erb_rate', 'rel_pitch_global', 'rel_pitch_global_erb', 'abs_pitch', 'abs_pitch_erb', 'abs_pitch_change', 'abs_pitch_erb_change', 'zscore_intensity', 'phn', 'dorsal', 'coronal', 'labial', 'high', 'front', 'low', 'back', 'plosive', 'fricative', 'syllabic', 'nasal', 'voiced', 'obstruent', 'sonorant'], dtype='object')
The data come pre-annotated. Besides the acoustic parameters pitch, intensity, log_hz, erb_rate, rel_pitch_global, rel_pitch_global_erb, abs_pitch, abs_pitch_erb, abs_pitch_change, abs_pitch_erb_change and zscore_intensity, phonetic (articulatory) features are annotated with 0/1 coding. Phonetic features are a useful concept: they describe properties of particular phonemes, syllables or other sound units, such as place and manner of articulation. When working with non-native speech production data, they can be used to diagnose second-language learners' pronunciation errors.
Note that each row corresponds to a 0.01 s time frame, and the index contains the TIMIT sentence identifier. We can therefore compute how many sentences were used and how long the speech stimuli last.
num_sentences = len(set([i[0] for i in timit_annotation.to_dict()["time"].keys()]))
print("# of Timit dataset used:", num_sentences)
print("Play Druation of Timit dataset used:", len(timit_annotation)/100 + (num_sentences - 1) * 0.04)
# of Timit dataset used: 499 Play Druation of Timit dataset used: 1039.53
499: this number matches what the paper reports. Beyond the frame level, we also collect phoneme- and syllable-level information, which provides the basis for the hierarchical representation analysis later on.
timit_syllables = timit.load_timit_syllables()
timit_syllables
start_times | end_times | syllable_phns | ||
---|---|---|---|---|
timit_name | syllable_index | |||
mmds0_si1973 | 0 | 0.132250 | 0.230187 | dh ax-h q |
1 | 0.230187 | 0.260813 | eh | |
2 | 0.260813 | 0.318500 | nx ih | |
3 | 0.318500 | 0.458813 | m iy | |
4 | 0.458813 | 0.672687 | dcl d ih dcl | |
... | ... | ... | ... | ... |
mbma1_si2207 | 0 | 0.143063 | 0.337375 | ae z |
1 | 0.337375 | 0.484500 | w iy | |
2 | 0.484500 | 0.720000 | ey tcl | |
3 | 0.720000 | 0.827000 | w iy | |
4 | 0.827000 | 1.317937 | tcl t ao kcl t |
4233 rows × 3 columns
timit_phonemes = timit.get_timit_phonemes()
timit_phonemes
start_time | end_time | phn | silence | ||
---|---|---|---|---|---|
timit_name | phoneme_index | ||||
makr0_si1352 | 0 | 0.000000 | 0.150000 | h# | True |
1 | 0.150000 | 0.250625 | ae | False | |
2 | 0.250625 | 0.375625 | s | False | |
3 | 0.375625 | 0.496750 | ah | False | |
4 | 0.496750 | 0.513000 | tcl | False | |
... | ... | ... | ... | ... | ... |
mgaw0_si535 | 23 | 1.612500 | 1.652500 | k | False |
24 | 1.652500 | 1.741500 | axr | False | |
25 | 1.741500 | 1.785875 | ix | False | |
26 | 1.785875 | 1.857500 | ng | False | |
27 | 1.857500 | 2.115000 | h# | True |
12716 rows × 4 columns
Brain Recordings¶
Neurosurgical patients at the UCSF Medical Center or Huashan Hospital participated in the neural recordings. High-density ECoG electrode grids of the same specification (manufactured by Integra or PMT) were placed on the lateral surface of the temporal lobe. Each grid has 128 (16×8) contact channels with 4 mm center-to-center spacing and 1.17 mm exposed contact diameter. During the tasks, neural signals were recorded from the ECoG grid with a multichannel amplifier optically connected to a digital signal processor (Tucker-Davis Technologies); recordings were made with TDT Synapse software. The local field potential at each electrode contact was amplified and sampled at 3,052 Hz. After denoising and re-referencing, the analytic amplitude of eight Gaussian filters (center frequencies 70-150 Hz) was computed with the Hilbert transform. The high-gamma signal (HG) was defined as the average analytic amplitude across these eight bands, and the signal was downsampled to 100 Hz. Processing the raw recordings is quite involved, so the authors provide a limited set of data for demonstration.
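As a rough illustration of this preprocessing (a sketch only: the authors use Gaussian filters and a more careful artifact-rejection pipeline, and the function below is my own), a high-gamma envelope can be extracted along these lines:
import numpy as np
from scipy.signal import butter, filtfilt, hilbert, resample
def high_gamma_envelope(lfp, fs=3052, out_fs=100, n_bands=8):
    """Band-pass in several bands between 70 and 150 Hz, take the analytic amplitude
    with the Hilbert transform, average across bands, then downsample to out_fs."""
    centers = np.linspace(70, 150, n_bands)
    envs = []
    for fc in centers:
        # simple fixed-ratio Butterworth bands as a stand-in for the paper's Gaussian filters
        b, a = butter(4, [fc * 0.9, fc * 1.1], btype="band", fs=fs)
        envs.append(np.abs(hilbert(filtfilt(b, a, lfp))))
    hg = np.mean(envs, axis=0)
    return resample(hg, int(len(hg) * out_fs / fs))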
subject = 'HS11'
all_hgs = [preanalysis.load_hg(subject, block) for block in range(9, 11)]
all_times = [preanalysis.load_times(subject, block) for block in range(9, 11)]
all_names = [timit.get_timit_names_for_timit_block(1), timit.get_timit_names_for_timit_block(5)]
len(all_hgs),all_hgs[0].shape,all_times[0].shape,len(all_names[0])
(2, (128, 32600), (1, 124), 124)
The high-gamma signal comes in two blocks, and each block's HG has shape (128, 32600): 128 is the number of electrodes and 32600 the number of time points. At a 100 Hz sampling rate, each block therefore lasted 326 seconds, roughly 5 minutes.
Find Speech Responsive Channels¶
The recordings come from a 16×8 electrode grid, and not every electrode sits over a region involved in speech. We want to know which channels show a significant change in neural signal after speech onset. Before testing the significance of the pre/post difference, we need to construct paired samples. Around each stimulus event, the HG from -0.2 s to 1.0 s (i.e. 120 samples) is used to build the pre/post-onset pair. Across the two experimental blocks we obtain 224 stimuli in total.
# get_timelocked_activity returns an array of shape (n_chans, n_timepoints, n_trials)
# which has hg activity that is timelocked to times (in seconds).
# Default time course returned is -0.2s to 1.0s.
Y_mats = []
for onsets_one_block, hg in zip(all_times, all_hgs):
Y_mats.append(erps.get_timelocked_activity(onsets_one_block, hg))
print(Y_mats[-1].shape)
Y_mats = np.concatenate(Y_mats, axis=2)
print(Y_mats.shape)
(128, 120, 124) (128, 120, 100) (128, 120, 224)
Y_mat_before_onset averages the neural signal between 0.2 s before and 0.1 s after speech onset; Y_mat_after_onset averages the signal between 0.15 s and 1.0 s after onset. Together they form a paired sample. A paired-samples t-test (ttest_rel) is run for each channel to test whether signal strength differs significantly before and after the stimulus. The threshold 0.01/len(results.pvalue) applies a Bonferroni correction, controlling the family-wise error rate across the multiple comparisons.
# find speech responsive channels
Y_mat_before_onset = np.nanmean(Y_mats[:, :30, :], axis=1)
Y_mat_after_onset = np.nanmean(Y_mats[:, 35:, :], axis=1)
results = ttest_rel(Y_mat_before_onset, Y_mat_after_onset, axis=1)
responsive_chans = np.arange((len(results.pvalue)))[results.pvalue < 0.01/(len(results.pvalue))]
print("speech responsive channels: #", len(responsive_chans))
print(responsive_chans)
speech responsive channels: # 54 [ 37 38 39 40 41 48 49 50 52 53 54 55 56 57 58 64 65 66 67 68 69 70 71 72 73 74 76 80 81 82 83 84 85 86 87 88 89 90 96 97 98 99 100 101 102 103 104 112 113 114 116 118 119 121]
The channels significant at the Bonferroni-corrected p < 0.01 level are printed above. In the figure below, you can see that the channels judged significantly different do show large amplitude changes around stimulus onset.
fig, axs = plt.subplots(8, 16, figsize=(25, 15))
axs = axs.flatten()
xvals = np.linspace(-0.2, 1, 120)
for i in range(128):
plotting.plot_filled_sem(Y_mats[i], xvals, ax= axs[i], xlabel="", ylabel="",ylim=(-1,2.5), color='b')
if i in responsive_chans:
axs[i].text(0.55,0.8, '*', color='r')
for i, ax in enumerate(axs):
ax.set(yticklabels=[], xticklabels=[], xticks=[0, 1])
_ = ax.text(0.55, 0.85, str(i+1), transform=ax.transAxes)
_ = axs[-16].set(yticks=[-1, 0, 1, 2], yticklabels=[-1, 0, 1, 2], xticklabels=[0, 1], xlabel="Time (s)", ylabel="High-gamma \n(z-score across block)")
The figure below shows the locations of the significantly responsive channels on the electrode grid, together with the corresponding brain model.
fig = plt.figure(figsize=(8, 4))
gs_brains = matplotlib.gridspec.GridSpec(1, 2, width_ratios=[0.7, 1], hspace=0.1)
ax_brain = plt.subplot(gs_brains[0])
ax_inset = plt.subplot(gs_brains[1])
plotting.brain_inset([ax_brain, ax_inset], subject='HS11', response_channels=responsive_chans)
Time-delayed linear encoding models (TRF models)¶
TRF models allow us to predict neural activity from the stimulus features over a preceding time window. Specifically, for each electrode we fit the linear model
$$ y(t) = \sum_{f=1}^{F} \sum_{\tau=1}^{T} \boldsymbol \beta_f(\tau)^\top \boldsymbol x_f(t - \tau) + \epsilon $$
where $y$ is the high-gamma activity recorded at the electrode, $\boldsymbol x_f(t - \tau)$ is the stimulus representation vector of feature set $f$ at time $t - \tau$, $\boldsymbol \beta_f(\tau)$ are the regression weights for feature set $f$ at delay $\tau$, and $\epsilon$ is Gaussian noise.
To prevent overfitting, L2 regularization and cross-validation are used: the data are split into three mutually exclusive sets in an 80%/10%/10% ratio. The first set (80% of the samples) is the training set, the second is used to tune the L2 regularization hyperparameter, and the last is the test set. The model is evaluated by the correlation between the actual and predicted values on the held-out data. This procedure is repeated five times, and the model's performance is taken as the average over all test sets.
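Under the hood, fitting such a model amounts to ridge regression on a time-lagged copy of the stimulus features. The local trf.run_cv_temporal_ridge_regression_model used below presumably wraps a cross-validated version of something like the following sketch (the helper names here are my own):
import numpy as np
def make_lagged_design(X, n_delays):
    """Stack delayed copies of X (n_times, n_features) -> (n_times, n_features * n_delays)."""
    n_t, n_f = X.shape
    lagged = np.zeros((n_t, n_f * n_delays))
    for d in range(n_delays):
        lagged[d:, d * n_f:(d + 1) * n_f] = X[:n_t - d]   # column block d holds the features d samples in the past
    return lagged
def ridge_fit(X, Y, alpha=1.0):
    """Closed-form ridge regression weights: (X'X + alpha*I)^-1 X'Y."""
    return np.linalg.solve(X.T @ X + alpha * np.eye(X.shape[1]), X.T @ Y)
# e.g. 40 delays at 100 Hz = a 400 ms window, matching delay_seconds=0.4 below
# X_lag = make_lagged_design(features_full, n_delays=40)   # -> (42010, 2480)
# wts = ridge_fit(X_lag, Ys, alpha=10.0)                    # -> (2480, 128)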
from asccd import temporal_receptive_field as trf
delays = trf.get_delays(delay_seconds=0.4)
stim_resp = trf.get_trf_stim_resp(all_hgs, all_times, all_names)
The features used here include 30-dimensional spectral features (although the paper describes a filter bank with 161 FFT frequency components), a one-dimensional intensity feature, and several versions of pitch features. I have added comments to these features so it is easier to see what is going on. 224 is the number of stimulus events, consistent with the previous section; among the 224 speech stimuli, the first one spans 165 time bins, i.e. 16.5 seconds.
all_spec_features = stim_resp[0] # length 224, first item has shape (165, 30)
all_intensity_features = stim_resp[1] # length 224, first item has shape (165, 1)
all_binary_pitch_features = stim_resp[2] # length 224, first item has shape (165, 1)
all_abs_pitch_features = stim_resp[3] # length 224, first item has shape (165, 10)
all_rel_pitch_features = stim_resp[4] # length 224, first item has shape (165, 10)
all_pitch_change_features = stim_resp[5] # length 224, first item has shape (165, 10)
all_Y = stim_resp[6] # length 224, first item has shape (165, 128)
The fundamental frequency is the trickiest feature. The F0 used in the experiments was computed with the autocorrelation method in Praat, with corrections for doubling and halving errors. On top of it, several pitch features are extracted:
- Absolute pitch is defined as the natural logarithm of the F0 value in Hz.
- Relative pitch is obtained by z-scoring the absolute pitch values (log(F0)) within each sentence/passage (i.e. within speaker).
- Pitch change is the first temporal derivative (finite difference) of log(F0).
Absolute pitch, relative pitch and pitch change are discretized into 10 equally spaced bins spanning the 2.5th to the 97.5th percentile; the bottom and top 2.5% of values go into the lowest and highest bins, respectively. Each of the three is therefore represented as a 10-dimensional binary feature vector, and for non-pitch periods these vectors are zero in all dimensions. The code below shows the quantization step:
def get_bin_edges_percent_range(a, bins=10, percent=95):
assert percent > 1
assert percent < 100
tail_percentage = (100 - percent)/2
a = a[~np.isnan(a)]
a_range = np.percentile(a, [tail_percentage, 100-tail_percentage])
counts, bin_edges = np.histogram(a, bins=bins, range=a_range)
return bin_edges
corpus_pitch = timit.get_timit_annotations()
abs_bin_edges = get_bin_edges_percent_range(corpus_pitch['abs_pitch'])
abs_bin_edges.shape,abs_bin_edges
((11,), array([-1.56828262, -1.21747101, -0.8666594 , -0.51584779, -0.16503618, 0.18577543, 0.53658704, 0.88739865, 1.23821026, 1.58902187, 1.93983349]))
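For illustration, here is a small sketch (my own helper, not a function from the repository) of how these bin edges turn the continuous abs_pitch track into the 10-dimensional binary vectors described above:
def pitch_to_onehot(values, bin_edges):
    n_bins = len(bin_edges) - 1                      # 10 bins
    onehot = np.zeros((len(values), n_bins))
    for i, v in enumerate(np.asarray(values, dtype=float)):
        if np.isnan(v):                              # non-pitch periods stay all-zero
            continue
        b = np.digitize(v, bin_edges) - 1            # bin index; tails are clipped into the end bins
        onehot[i, int(np.clip(b, 0, n_bins - 1))] = 1.0
    return onehot
abs_pitch_onehot = pitch_to_onehot(corpus_pitch['abs_pitch'], abs_bin_edges)
abs_pitch_onehot.shape   # (101961, 10)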
Now we concatenate these features. The dataset used to train the TRF models has 42010 samples, each with 62 feature dimensions (30+1+1+10+10+10). Ys is the high-gamma signal of the 128 electrodes. We will use these data to train the time-delayed linear models.
features_full, Ys = trf.concatenate_trf_stim_resp(stim_resp, exclude=None)
features_full.shape, Ys.shape
((42010, 62), (42010, 128))
test_corr_folds_chin, wts_folds_chin, best_alphas = trf.run_cv_temporal_ridge_regression_model(features_full, Ys, delays=delays, add_edges=False)
r2_full = np.mean(test_corr_folds_chin**2, axis=0) # r2 metric see more in paper section "Encoding models".
wts_full = np.mean(wts_folds_chin, axis=0)
Running fold 0. Running fold 1. Running fold 2. Running fold 3. Running fold 4.
wts_folds_chin.shape
(5, 2480, 128)
wts_folds_chin holds the regression weights that performed best on the development set, and wts_full averages wts_folds_chin over its first dimension, i.e. over the 5 training folds. 2480 is 40 × 62: 40 corresponds to the 400 ms delay window and 62 to the feature dimension. Below we plot the weight heatmaps of electrodes 69 and 85 over the 400 ms window.
wts1_labels = {'yticks':(0, 3, 6, 9), 'yticklabels':(5.7, 0.2, -5.3, -10.8), 'ylabel':"Pitch change (oct/s)"}
fig = trf.plot_trf(wts_full.T, 69, wts1_label=wts1_labels, wts_shape=(40, 62), wts1=(52, 62), wts2=(42, 52), min_max=(42,62), edges_added=False, figsize=(10,3))
fig = trf.plot_trf(wts_full.T, 85, wts1_label=wts1_labels, wts_shape=(40, 62), wts1=(52, 62), wts2=(42, 52), min_max=(42,62), edges_added=False, figsize=(10,3))
Attention pattern analysis¶
See Section "DNN computations explain neural encoding predictions" in paper
This part examines the computational mechanisms of the representations inside the DNN. We want to know whether specific types of attention computation over speech in the DNN can explain its ability to predict brain responses. Here we focus on attention that models the context of a speech representation, i.e. the phonemes and syllables neighboring the target speech sound. The wav2vec2-base model is used as the example; huggingface provides a convenient interface.
import torch
import torchaudio
from transformers import Wav2Vec2ForCTC,Wav2Vec2Processor
import os
processor = Wav2Vec2Processor.from_pretrained("facebook/wav2vec2-base")
model = Wav2Vec2ForCTC.from_pretrained("facebook/wav2vec2-base")
/ssd9/exec/penglinkai/miniconda3/envs/pytorch/lib/python3.8/site-packages/transformers/configuration_utils.py:380: UserWarning: Passing `gradient_checkpointing` to a config initialization is deprecated and will be removed in v5 Transformers. Using `model.gradient_checkpointing_enable()` instead, or if you are using the `Trainer` API, pass `gradient_checkpointing=True` in your `TrainingArguments`. warnings.warn(
Ignored unknown kwarg option normalize Ignored unknown kwarg option normalize Ignored unknown kwarg option normalize Ignored unknown kwarg option normalize
Some weights of Wav2Vec2ForCTC were not initialized from the model checkpoint at facebook/wav2vec2-base and are newly initialized: ['lm_head.weight', 'lm_head.bias'] You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
timit_blocks = [1,2,3,4,5]
timit_names = []
for i in range(len(timit_blocks)):
timit_names.append(timit.get_timit_names_for_timit_block(timit_blocks[i]))
fig, ax = plt.subplots(1,1,figsize=(12,2))
cmaps = ['Greys','Purples', 'Blues', 'Greens', 'Oranges', 'Reds']
##### top
timit_name = timit_names[4][1]
[fs_t, sig_t] = wavfile.read(timit.get_wavpath(timit_name))
xvals_t = np.arange(len(sig_t))/fs_t
phon_times = np.asarray(timit_phonemes.loc[timit_name][timit_phonemes.loc[timit_name]['silence']==False]['start_time'])
phon_names = np.asarray(timit_phonemes.loc[timit_name][timit_phonemes.loc[timit_name]['silence']==False]['phn'])
phon_time_idx = util.time_to_index(phon_times, hz=fs_t)
syll_times = np.asarray(timit_syllables.loc[timit_name]['start_times'])
ax.plot(xvals_t, sig_t, linewidth=0.5, color=np.array([0.7,0.7,0.7]))
height = 41000
ax.text(syll_times[0], height, 'He', fontsize=10)
ax.text(syll_times[1], height, 'moistened', fontsize=10)
ax.text(syll_times[3], height, 'his', fontsize=10)
ax.text(syll_times[4], height, 'lips', fontsize=10)
ax.text(syll_times[5], height, 'uneasily.', fontsize=10)
for i in range(len(phon_times)):
if i != 13:
ax.text(phon_times[i]+0.001, 34500, phon_names[i], fontsize=9)
ax.axvline(phon_times[i], color=[0.6,0.6,0.6], ls='--')
for i in range(len(syll_times)):
ax.axvline(syll_times[i], color=[0.3,0.3,0.3], ls='-')
ax.axis("off")
xlim = (0, xvals_t[-1])
ax.set(xlim=xlim)
ax.set(ylim=[-25000,35000])
ax.spines['right'].set_visible(False)
ax.spines['top'].set_visible(False)
ax.spines['left'].set_visible(False)
ax.spines['bottom'].set_visible(False)
ax2 = ax.twinx()
fs = 100
df = timit_annotation.loc[timit_names[4][1]]
xvals = np.arange(len(df))/fs
ax2.plot(xvals, df['pitch'],color=np.array([0.3,0.3,0.9]),linewidth=3)
#ax2.set(xticks=[0,100,200])
#ax2.set(xticklabels=[0,1,2])
#ax.set(yticks=[180,220,260])
#ax.spines['bottom'].set_color('white')
ax2.spines['top'].set_color('white')
ax2.spines['right'].set_color('white')
ax2.axis("off")
ax.arrow((phon_times[7]+phon_times[6])*0.5, 16000, 0, -8000, head_width=0.03, head_length=4000, fc='k', ec='k')
# phoneme(0)
ax.add_patch(Rectangle((phon_times[6], 20000), phon_times[7]-phon_times[6], 4000, color=plt.get_cmap(cmaps[0])(0.7)))
# phoneme(-1)
ax.add_patch(Rectangle((phon_times[5], 20000), phon_times[6]-phon_times[5], 4000, color=plt.get_cmap(cmaps[1])(0.7)))
# phoneme(-2)
ax.add_patch(Rectangle((phon_times[4], 20000), phon_times[5]-phon_times[4], 4000, color=plt.get_cmap(cmaps[2])(0.7)))
# syllable(0)
ax.add_patch(Rectangle((syll_times[2], 27000), phon_times[6]-syll_times[2], 4000, color=plt.get_cmap(cmaps[3])(0.7)))
ax.add_patch(Rectangle((phon_times[7], 27000), syll_times[3]-phon_times[7], 4000, color=plt.get_cmap(cmaps[3])(0.7)))
# syllable(-1)
ax.add_patch(Rectangle((syll_times[1], 27000), syll_times[2]-syll_times[1], 4000, color=plt.get_cmap(cmaps[4])(0.7)))
# syllable(-2)
ax.add_patch(Rectangle((syll_times[0], 27000), syll_times[1]-syll_times[0], 4000, color=plt.get_cmap(cmaps[5])(0.7)))
fig.savefig("sample.png")
One sample from the TIMIT dataset is used as an illustration, with its phonemes and syllables marked.
latent_feature_cnn = [] #[layers][block][sentence][time x feature]
latent_feature_ext = [] #[]
latent_feature_proj = []
latent_feature_encoder = []
latent_feature_en = []
latent_attention = []
for i in range(len(model.wav2vec2.feature_extractor.conv_layers)+1):
latent_feature_cnn.append([])
for i in range(len(model.wav2vec2.encoder.layers)+1):
latent_feature_encoder.append([])
for i in range(len(model.wav2vec2.encoder.layers)):
latent_attention.append([])
for i in range(len(timit_names)):
for j in range(len(latent_feature_cnn)):
latent_feature_cnn[j].append([])
for j in range(len(latent_feature_encoder)):
latent_feature_encoder[j].append([])
for j in range(len(latent_attention)):
latent_attention[j].append([])
latent_feature_ext.append([])
latent_feature_proj.append([])
latent_feature_en.append([])
for j in range(len(timit_names[i])):
sound_file = timit.get_wavpath(timit_names[i][j])
speech_array, sampling_rate = torchaudio.load(sound_file)
# feature extraction
lat = []
lat.append(speech_array.unsqueeze_(0))
for k in range(len(model.wav2vec2.feature_extractor.conv_layers)):
lat.append(model.wav2vec2.feature_extractor.conv_layers[k](lat[k]))
for k in range(len(lat)):
latent_feature_cnn[k][i].append(np.squeeze(lat[k].cpu().detach().numpy()))
lat_p, ext_feat = model.wav2vec2.feature_projection(lat[-1].permute(0,2,1))
lat_e = model.wav2vec2.encoder(lat_p, output_hidden_states=True, output_attentions=True)
# encoder layers
for k in range(len(lat_e.hidden_states)):
latent_feature_encoder[k][i].append(np.squeeze(lat_e.hidden_states[k].cpu().detach().numpy()))
for k in range(len(lat_e.attentions)):
latent_attention[k][i].append(np.squeeze(lat_e.attentions[k].cpu().detach().numpy()))
latent_feature_ext[i].append(np.squeeze(lat[-1].cpu().detach().numpy()).T)
latent_feature_proj[i].append(np.squeeze(lat_p.cpu().detach().numpy()))
latent_feature_en[i].append(np.squeeze(lat_e.last_hidden_state.cpu().detach().numpy()))
The loop runs over the whole dataset and collects, for every sample, the representations of each CNN layer and each Transformer encoder layer (plus the attention weights).
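As a quick sanity check (a minimal sketch, not part of the original code), we can inspect the nested structure and the effective frame rate of the collected encoder features. TIMIT audio is sampled at 16 kHz, so the wav2vec 2.0 encoder frames should come out at roughly 49 Hz, which is the rate the mask construction further below assumes.
# Sanity check: encoder features are stored as [layer][block][sentence][time x feature].
example = latent_feature_encoder[0][4][1]     # encoder layer 0, block 4, sentence 1
print("encoder layer 0, shape (time x feature):", example.shape)
# Estimate the encoder frame rate from the audio duration of the same sentence.
wav, sr = torchaudio.load(timit.get_wavpath(timit_names[4][1]))
duration = wav.shape[-1] / sr                 # sentence duration in seconds
print("approx. encoder frame rate: %.1f Hz" % (example.shape[0] / duration))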
feature_names = ['fs_ext', 'fs_proj', 'encoder0', 'encoder1', 'encoder2', 'encoder3', 'encoder4', 'encoder5',
'encoder6', 'encoder7', 'encoder8', 'encoder9', 'encoder10', 'encoder11', 'encoder12',
'hg', 'spectrogram', 'intensity', 'bin_pitch', 'abs_pitch', 'rel_pitch',
'pitch_change', 'phonetics']
nn_features = ['fs_ext', 'fs_proj', 'encoder0', 'encoder1', 'encoder2', 'encoder3', 'encoder4', 'encoder5',
'encoder6', 'encoder7', 'encoder8', 'encoder9', 'encoder10', 'encoder11', 'encoder12']
all_latent_features = {'fs_ext': latent_feature_ext,
'fs_proj': latent_feature_proj,
'encoder0': latent_feature_encoder[0],
'encoder1': latent_feature_encoder[1],
'encoder2': latent_feature_encoder[2],
'encoder3': latent_feature_encoder[3],
'encoder4': latent_feature_encoder[4],
'encoder5': latent_feature_encoder[5],
'encoder6': latent_feature_encoder[6],
'encoder7': latent_feature_encoder[7],
'encoder8': latent_feature_encoder[8],
'encoder9': latent_feature_encoder[9],
'encoder10': latent_feature_encoder[10],
'encoder11': latent_feature_encoder[11],
'encoder12': latent_feature_encoder[12]
}
stim_resp = trf.get_all_features(all_hgs, all_times, all_names, feature_names, nn_features, all_latent_features)
feat_mat = trf.concatenate_all_features(stim_resp, feature_names)
To analyze the attention weights, a set of template matrices is used to capture the attention patterns. Specifically:
For a given spoken sentence, suppose the embedding sequence in a Transformer layer has length $T$ ($c_1,\dots,c_T$), the phoneme boundaries are indexed $p_1,\dots,p_m$, and the syllable boundaries are indexed $s_1,\dots,s_n$. The attention templates are defined as follows. For the current phoneme, phoneme(0): $A_{ph(0)} \in \mathbb{R}^{T \times T}$
$$ \begin{equation} A_{ph(0)}(i,j) = \left\{ \begin{array}{lr} 1 & \mathrm{if\ } p_k \leq i < p_{k+1} \mathrm{\ and\ } p_k \leq j < p_{k+1} \mathrm{\ for\ any\ } k\\ 0 & \mathrm{otherwise} \end{array} \right. \end{equation} $$
For the phoneme immediately preceding the current one, phoneme(-1): $A_{ph(-1)} \in \mathbb{R}^{T \times T}$
$$ \begin{equation} A_{ph(-1)}(i,j) = \left\{ \begin{array}{lr} 1 & \mathrm{if\ } p_k \leq i < p_{k+1} \mathrm{\ and\ } p_{k-1} \leq j < p_k \mathrm{\ for\ any\ } k\\ 0 & \mathrm{otherwise} \end{array} \right. \end{equation} $$
For the phoneme two positions before the current one, phoneme(-2): $A_{ph(-2)} \in \mathbb{R}^{T \times T}$
$$ \begin{equation} A_{ph(-2)}(i,j) = \left\{ \begin{array}{lr} 1 & \mathrm{if\ } p_k \leq i < p_{k+1} \mathrm{\ and\ } p_{k-2} \leq j < p_{k-1} \mathrm{\ for\ any\ } k\\ 0 & \mathrm{otherwise} \end{array} \right. \end{equation} $$
For the current syllable, syllable(0): $A_{sy(0)} \in \mathbb{R}^{T \times T}$. In particular, the contribution of the current phoneme is removed: $A_{sy(0)} \leftarrow A_{sy(0)} - A_{ph(0)}$
$$ \begin{equation} A_{sy(0)}(i,j) = \left\{ \begin{array}{lr} 1 & \mathrm{if\ } s_k \leq i < s_{k+1} \mathrm{\ and\ } s_k \leq j < s_{k+1} \mathrm{\ for\ any\ } k\\ 0 & \mathrm{otherwise} \end{array} \right. \end{equation} $$
For the preceding syllable, syllable(-1): $A_{sy(-1)} \in \mathbb{R}^{T \times T}$
$$ \begin{equation} A_{sy(-1)}(i,j) = \left\{ \begin{array}{lr} 1 & \mathrm{if\ } s_k \leq i < s_{k+1} \mathrm{\ and\ } s_{k-1} \leq j < s_k \mathrm{\ for\ any\ } k\\ 0 & \mathrm{otherwise} \end{array} \right. \end{equation} $$
For the syllable two positions back, syllable(-2): $A_{sy(-2)} \in \mathbb{R}^{T \times T}$
$$ \begin{equation} A_{sy(-2)}(i,j) = \left\{ \begin{array}{lr} 1 & \mathrm{if\ } s_k \leq i < s_{k+1} \mathrm{\ and\ } s_{k-2} \leq j < s_{k-1} \mathrm{\ for\ any\ } k\\ 0 & \mathrm{otherwise} \end{array} \right. \end{equation} $$
For each sentence, let $W_{xy}$ denote the attention matrix of the $y$-th attention head in the $x$-th layer. For every template $A_q$, we compute the correlation coefficient corr($W_{xy}$, $A_q$).
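As a toy illustration (not taken from the paper): suppose $T = 6$ and the phoneme boundaries split the frames into three segments $\{1,2\}$, $\{3,4\}$, $\{5,6\}$. Then the phoneme(0) template is block-diagonal, while the phoneme(-1) template shifts each block one segment towards earlier keys:
$$ A_{ph(0)} = \begin{pmatrix} 1&1&0&0&0&0\\ 1&1&0&0&0&0\\ 0&0&1&1&0&0\\ 0&0&1&1&0&0\\ 0&0&0&0&1&1\\ 0&0&0&0&1&1 \end{pmatrix}, \qquad A_{ph(-1)} = \begin{pmatrix} 0&0&0&0&0&0\\ 0&0&0&0&0&0\\ 1&1&0&0&0&0\\ 1&1&0&0&0&0\\ 0&0&1&1&0&0\\ 0&0&1&1&0&0 \end{pmatrix}. $$
The code below builds exactly these kinds of masks from the TIMIT phoneme and syllable annotations.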
def get_phoneme_masks(timit_name, length):
phon_times = np.asarray(timit_phonemes.loc[timit_name][timit_phonemes.loc[timit_name]['silence']==False]['start_time'])
phon_time_idx = util.time_to_index(phon_times, hz=49)
phon_mask_idx = np.concatenate([[0], phon_time_idx, [length]])
phon_masks = []
phon_masks.append(np.zeros((length, length)))
for k in range(len(phon_mask_idx)-1): # phoneme 0
phon_masks[0][phon_mask_idx[k]:phon_mask_idx[k+1], phon_mask_idx[k]:phon_mask_idx[k+1]] = 1
phon_masks.append(np.zeros((length, length))) # phoneme -1
for k in range(len(phon_mask_idx)-2):
phon_masks[-1][phon_mask_idx[k+1]:phon_mask_idx[k+2], phon_mask_idx[k]:phon_mask_idx[k+1]] = 1
phon_masks.append(np.zeros((length, length))) # phoneme -2
for k in range(len(phon_mask_idx)-3):
phon_masks[-1][phon_mask_idx[k+2]:phon_mask_idx[k+3], phon_mask_idx[k]:phon_mask_idx[k+1]] = 1
return phon_masks
def get_syllable_masks(timit_name, length):
# syllable masks
syll_times = np.asarray(timit_syllables.loc[timit_name]['start_times'])
syll_time_idx = util.time_to_index(syll_times, hz=49)
syll_mask_idx = np.concatenate([[0], syll_time_idx, [length]])
syll_masks = []
syll_masks.append(np.zeros((length, length)))
for k in range(len(syll_mask_idx)-1): # syllable 0
syll_masks[0][syll_mask_idx[k]:syll_mask_idx[k+1], syll_mask_idx[k]:syll_mask_idx[k+1]] = 1
syll_masks.append(np.zeros((length, length)))
for k in range(len(syll_mask_idx)-2): # syllable -1
syll_masks[-1][syll_mask_idx[k+1]:syll_mask_idx[k+2], syll_mask_idx[k]:syll_mask_idx[k+1]] = 1
syll_masks.append(np.zeros((length, length)))
for k in range(len(syll_mask_idx)-3): # syllable -2
syll_masks[-1][syll_mask_idx[k+2]:syll_mask_idx[k+3], syll_mask_idx[k]:syll_mask_idx[k+1]] = 1
return syll_masks
def get_masks(timit_name, length):
phon_masks = get_phoneme_masks(timit_name, length)
syll_masks = get_syllable_masks(timit_name, length)
syll_masks[0] = syll_masks[0] - phon_masks[0]
all_masks = np.concatenate([phon_masks, syll_masks], axis=0)
return all_masks
length = latent_attention[0][4][1][0].shape[0]
phon_masks = get_phoneme_masks(timit_names[4][1], length)
syll_masks = get_syllable_masks(timit_names[4][1], length)
matplotlib.rcParams.update({'font.size': 16})
cmaps = ['Greys','Purples', 'Blues', 'Greens', 'Oranges', 'Reds']
all_masks = get_masks(timit_names[4][1], length)
fig, axs = plt.subplots(1,6, figsize=(25,5))
axs = axs.flatten()
for i in range(len(axs)):
axs[i].imshow(all_masks[i]*0.5, cmap=cmaps[i])
axs[i].plot([0, 1], [1, 0], transform=axs[i].transAxes, color=[0.5,0.5,0.5])
axs[i].set(xticklabels=[], yticklabels=[], xlabel='key', ylabel='query')
for a in axs:
    for im in a.get_images():
        im.set_clim(0, 1)   # use the same color scale across all template panels
axs[0].set(title='attention patterns\n phoneme(t-0)')
axs[1].set(title='phoneme(t-1)')
axs[2].set(title='phoneme(t-2)')
axs[3].set(title='syllable(t-0)')
axs[4].set(title='syllable(t-1)')
axs[5].set(title='syllable(t-2)')
Compare the two figures above, paying particular attention to the regions drawn in the same colors. The second figure shows the attention templates corresponding to the different levels of linguistic structure. A general trend readily emerges: deeper layers devote more contextual attention to linguistic structure (phonemes and syllables, ordered from left to right in the templates). This alignment between attention and contextual structure indicates that the hierarchical architecture of the deep model extracts speech-specific, linguistically relevant representations from natural speech.
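The trend described above can be quantified with the template correlations corr($W_{xy}$, $A_q$) introduced earlier. Below is a minimal sketch of that analysis, assuming the `latent_attention` lists and the `get_masks` helper defined in this notebook; SciPy's Pearson correlation is used for illustration, and the paper's exact correlation and averaging choices may differ.
# Sketch: correlate every attention head with every structural template and
# average over sentences (assumes every template is non-constant for each sentence).
import scipy.stats

mask_names = ['phoneme(0)', 'phoneme(-1)', 'phoneme(-2)',
              'syllable(0)', 'syllable(-1)', 'syllable(-2)']
n_layers = len(latent_attention)
n_heads = latent_attention[0][4][1].shape[0]
corr = np.zeros((n_layers, n_heads, len(mask_names)))
count = 0
for i in range(len(timit_names)):
    for j in range(len(timit_names[i])):
        length = latent_attention[0][i][j].shape[-1]
        masks = get_masks(timit_names[i][j], length)        # templates A_q for this sentence
        for x in range(n_layers):
            for y in range(n_heads):
                W = latent_attention[x][i][j][y].flatten()  # attention matrix W_xy
                for q in range(len(mask_names)):
                    corr[x, y, q] += scipy.stats.pearsonr(W, masks[q].flatten())[0]
        count += 1
corr /= count   # corr[x, y, q]: mean correlation of layer x, head y with template q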