Laryngeal Preprocessing Pipeline

1 Libraries

#### libraries ####
import os
import mne
import numpy as np
import matplotlib.pyplot as plt
import pickle
import pandas as pd
from scipy import signal
from scipy.io import wavfile
# from pybv import write_brainvision
from pyprep.prep_pipeline import PrepPipeline
from mne_icalabel import label_components

2 Trigger latency correction

Fixing trigger timing & renaming events
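Each trigger written by the stimulus program lags the true acoustic onset, so every event is realigned by cross-correlating the StimTrak audio channel with the original stimulus wav file in the loop below. A minimal sketch of the alignment idea on synthetic data (all signals invented for illustration; uses the numpy/scipy imports from section 1):

rng = np.random.default_rng(0)
stim = rng.standard_normal(500)                  # stands in for a stimulus wav file
rec = 0.1 * rng.standard_normal(2000)            # stands in for the StimTrak recording
rec[300:800] += stim                             # the stimulus actually starts at sample 300
corr = signal.correlate(rec, stim, mode='full')
lags = signal.correlation_lags(rec.size, stim.size, mode='full')
print(lags[np.argmax(corr)])                     # 300: the recovered onset lag, in samples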
2.1 parameters
# directory
input_dir = os.getcwd() + '/../raw data/eeg_brainvision/'
output_dir = os.getcwd() + '/../preprocessed/1_trigger_lag_corrected/'
# create a folder if the folder doesn't exist
os.makedirs(output_dir, exist_ok=True)

# create a dictionary for blocks and block markers
block_dict = {
    'dorsal_highStan_lowDevi': 1000,   # standard: dorsal_high; deviant: dorsal_low
    'dorsal_lowStan_highDevi': 2000,   # standard: dorsal_low; deviant: dorsal_high
    'glottal_highStan_lowDevi': 3000,  # standard: glottal_high; deviant: glottal_low
    'glottal_lowStan_highDevi': 4000,  # standard: glottal_low; deviant: glottal_high
}

# exclude participants with missing data
exclude_ppts = [
    '0027',  # vot, incomplete eeg recording due to wrong sound library setting
    '0028',  # f0, missing blocks 3 and 4
    '0030',  # missing blocks 3 and 4
    '0034',  # missing block 3
    '0040',  # f0, missing blocks 3 and 4
    '0046',  # f0, missing blocks 2, 3, and 4
]

# trigger searching window (actual trigger time based on audio - trigger time in the data)
t_left = -0.01
t_right = 0.4  # long latency due to using the pygame sound library
#######################################################
#### create a dictionary for trigger codes and descriptions ####
df = pd.read_csv("vot_mapping.txt", delimiter='\t')
VOT_mapping = dict(zip(df['code'], df['description']))  # key = code, value = description

df = pd.read_csv("f0_mapping.txt", delimiter='\t')
F0_mapping = dict(zip(df['code'], df['description']))  # key = code, value = description
#######################################################

# initialize a dictionary to save bad stims
all_bad_stim_dict = {}

# read in eeg data files
all_files = os.listdir(input_dir)
#### for each file, create a dictionary to store each block and the indices of trials of that block ####
for file in all_files:
    if file.endswith('.vhdr') and (file.split('_')[1] not in exclude_ppts) and (file.split('.')[0] + '_corr.fif' not in os.listdir(output_dir)):

        # read in vhdr files
        raw = mne.io.read_raw_brainvision(input_dir + file, preload=True)

        # if the aux channel is not named 'StimTrak', change the channel name to 'StimTrak'
        if raw.info['ch_names'][31] != 'StimTrak':
            # change the channel name
            raw.rename_channels({raw.info['ch_names'][31]: 'StimTrak'})

        # specify the audio channel
        raw.set_channel_types({'StimTrak': 'misc'})

        # extract the info of experiment version
        exp_ver = file.split('.')[0].split('_')[2]

        # extract sampling rate
        eeg_sfreq = raw.info['sfreq']
        #############################################################################################
        #### create a dictionary for each stim's standard version and deviant version of markers ####
        # initialize a dictionary for triggers
        trigger_dict = {}

        # read in the trigger_codes file, and create a standard trigger code and a deviant trigger code for each stimulus
        with open('../experiment programs/' + exp_ver + '_eeg/trigger_codes.txt', 'r') as tf:
            for line in tf:
                # read in the current line
                line = line.replace('\n', '')
                # separate fileName and triggerMarker
                label, marker = line.split('\t')
                # convert the trigger marker to integer
                marker = int(marker)
                # create label for stims used as standards
                trigger_dict[marker] = label + '-s'
                # add 100 for the deviant marker
                marker_deviant = marker + 100
                # create label for stims used as deviants
                trigger_dict[marker_deviant] = label + '-d'
        #############################################################################################
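        # worked example (hypothetical entry): a trigger_codes.txt line 'tah_dorsal_high\t5' would add
        # trigger_dict[5] == 'tah_dorsal_high-s' and trigger_dict[105] == 'tah_dorsal_high-d'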
        ########################
        #### get audio data ####
        # initialize dictionaries
        audio = {}
        lengths = {}

        # del trigger_dict[99999]
        # for each trigger code and label in the trigger dictionary
        for marker, label in trigger_dict.items():
            # extract the file name, up to the -2 character of the label
            file_name = label[:-2]
            # if not already in the audio dictionary, get the info of the audio file
            if file_name not in audio:
                # get the sample rate and data of the audio file
                sampleRate, data = wavfile.read('../experiment programs/' + exp_ver + '_eeg/stimuli/{}.wav'.format(file_name))
                # calculate the sound file length
                lengths[file_name] = len(data) / sampleRate
                # reduce the sampling rate of the audio file by a factor of int(sampleRate/eeg_sfreq)
                data_downsampled = signal.decimate(data, int(sampleRate / eeg_sfreq), ftype='fir')
                # add the downsampled audio to the audio dictionary
                audio[file_name] = data_downsampled
        ########################
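        # note (illustrative rates, not from the data): int(sampleRate / eeg_sfreq) truncates, so the
        # decimated rate only approximates eeg_sfreq, e.g. a 44100 Hz wav with a 1000 Hz EEG gives a
        # factor of 44 and an effective rate of ~1002.3 Hz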
        ##################################################################
        #### recode trigger marker to reflect condition and stim info ####
        # for each stimulus, mark the block info
        events_from_annot, event_dict = mne.events_from_annotations(raw, verbose='WARNING')
        # remove the New Segment marker (trigger value 99999), the pause marker (222), and the continue marker (223)
        events_from_annot = events_from_annot[events_from_annot[:, 2] < 200]

        # initialize the train for each standards+deviant sequence
        train = np.array([]).astype(int)

        all_block = {}
        isStandard = True  # whether the standard in a train has been noted

        # loop over each trigger
        for i in range(events_from_annot.shape[0]):
            # add the current token to the train
            train = np.append(train, i)

            # if the trigger value is smaller than 100 and the standard toggle is true
            if (events_from_annot[i, 2] < 100) & isStandard:
                # split the file name
                trigger_splitted = trigger_dict[events_from_annot[i, 2]].split('_')
                # get the standard category name
                block = trigger_splitted[1] + '_' + trigger_splitted[2] + 'Stan'
                # toggle standard
                isStandard = False

            # if the trigger value is over 100
            elif events_from_annot[i, 2] > 100:
                # split the file name
                trigger_splitted = trigger_dict[events_from_annot[i, 2]].split('_')
                # append the deviant stim category
                block = block + '_' + trigger_splitted[2] + 'Devi'

                # if the block is not present in all_block
                if block not in all_block:
                    # add the new block and the token indices
                    all_block[block] = train[2:]  # [2:] to exclude the first 2 standards
                else:
                    # add the token indices to the existing block
                    all_block[block] = np.concatenate([all_block[block], train[2:]], axis=None)

                # reset train
                train = np.array([]).astype(int)

                # toggle standard note
                isStandard = True

        # loop over each block and its trigger indices
        for k, v in all_block.items():
            # recode the trigger value to reflect block and stim category
            events_from_annot[tuple(v), 2] += block_dict[k]

        # exclude the first 2 standards
        events_from_annot = events_from_annot[events_from_annot[:, 2] > 1000]
        ##################################################################
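        # worked example (hypothetical markers): in the 'dorsal_highStan_lowDevi' block (offset 1000),
        # a standard token with original marker 5 becomes 1005 and a deviant with marker 105 becomes 1105;
        # the first two standards of each train keep their original values and are dropped by the
        # >1000 filter above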
        ########################################################
        #### cross-correlation to detect the trigger delay ####
        # initialize delay list
        delays = np.array([])
        # initialize bad stim list
        bad_stim = []
        corr_results = []

        # loop over each event
        for i in range(events_from_annot.shape[0]):
            # get current event info [time, duration, annotation]
            event = events_from_annot[i]
            # get the onset latency (s) of the event
            time = event[0] / eeg_sfreq
            # get the file name of the event
            name = trigger_dict[event[2] % 100].split('-')[0]
            # get the data from the sound channel
            audio_eeg = raw.get_data(
                picks=['StimTrak'],
                tmin=time + t_left,
                tmax=time + lengths[name] + t_right,
            )[0]
            # actual stimulus
            audio_stim = audio[name]
            # z-score normalization (subtract mean, divide by std)
            audio_eeg = (audio_eeg - np.mean(audio_eeg)) / np.std(audio_eeg)
            audio_stim = (audio_stim - np.mean(audio_stim)) / np.std(audio_stim)

            # cross-correlation
            corr = signal.correlate(audio_eeg, audio_stim, mode='full')
            # normalize the cross-correlation
            corr = corr / (np.linalg.norm(audio_eeg) * np.linalg.norm(audio_stim))
            # find the peak correlation value
            max_corr = np.max(corr)

            # if the maximum correlation is less than a threshold (empirical threshold; 0.5 is generally good)
            if max_corr < 0.5:
                # mark the stim bad
                bad_stim.append(i)
            # append the maximum correlation
            corr_results.append(max_corr)

            # the lags for the cross-correlation
            lags = signal.correlation_lags(
                audio_eeg.size,
                audio_stim.size,
                mode='full',
            )
            # get the lag of the maximum cross-correlation
            lag = lags[np.argmax(corr)] + t_left * eeg_sfreq

            # save the lag for this event
            delays = np.append(delays, lag)
        ########################################################
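        # sanity check for the 0.5 threshold: with this normalization an exact match peaks at 1.0
        # (up to floating point), e.g. np.max(signal.correlate(x, x, mode='full')) / np.dot(x, x)
        # evaluates to ~1.0 for any nonzero x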
        ##################################################################################################
        #### plot the wave from the stim track and the eeg channel of the token with the minimum corr ####
        min_corr = np.argmin(corr_results)
        # get current event info [time, duration, annotation]
        event = events_from_annot[min_corr]
        # get the onset latency (s) of the event
        time = event[0] / eeg_sfreq
        # get the file name of the event
        name = trigger_dict[event[2] % 100].split('-')[0]
        # get the data from the sound channel
        audio_eeg = raw.get_data(
            picks=['StimTrak'],
            tmin=time + t_left,
            tmax=time + lengths[name] + t_right,
        )[0]
        # actual stimulus
        audio_stim = audio[name]
        # z-score normalization (subtract mean, divide by std)
        audio_eeg = (audio_eeg - np.mean(audio_eeg)) / np.std(audio_eeg)
        audio_stim = (audio_stim - np.mean(audio_stim)) / np.std(audio_stim)
        # plot
        fig, ax = plt.subplots()
        ax.plot(audio_eeg, label='StimTrak', alpha=0.6)
        ax.plot(audio_stim, label='wave', alpha=0.6)
        ax.set_title(file)
        ax.legend()
        fig.savefig(output_dir + file.split('.')[0] + "_minCor.png", dpi=300, bbox_inches='tight')
        ##################################################################################################
        ################################################
        #### correct for trigger lag and save files ####
        # add the number of bad stims for this file
        all_bad_stim_dict[file] = len(bad_stim)

        # remove items associated with bad stims from the event list
        events_from_annot = np.delete(events_from_annot, bad_stim, 0)

        # remove items associated with bad stims from the delay list
        delays = np.delete(delays, bad_stim, 0)

        # add the delay back to the onset latency of each event
        events_from_annot[:, 0] = events_from_annot[:, 0] + delays

        # convert individual event marker to conditions
        # events_from_annot[:,2] = events_from_annot[:,2] - events_from_annot[:,2]%100
        # create annotations
        annot_from_events = mne.annotations_from_events(
            events=events_from_annot,
            event_desc=eval(exp_ver + '_mapping'),
            sfreq=eeg_sfreq,
        )
        # set annotations
        raw.set_annotations(annot_from_events)
        # drop the audio channel in data
        raw.drop_channels(['StimTrak'])

        # save as a .fif file
        raw.save(output_dir + file.split('.')[0] + '_corr.fif')

        # save lag data
        np.savetxt(output_dir + file.replace('.vhdr', '_delays.txt'), delays, fmt='%i')
        ################################################
# save the number of bad stims of all participants
with open(output_dir + 'bad_stim.txt', 'a') as f:
    for key, value in all_bad_stim_dict.items():
        if value > 0:
            f.write(key + '\t' + str(value) + '\n')
3 Bad channel correction
- filtering
- resampling
- remove line noise
- bad channel detection & repairing
- add back reference channel TP9
3.1 parameters
# set directory
input_dir = os.getcwd() + '/../preprocessed/1_trigger_lag_corrected/'
output_dir = os.getcwd() + '/../preprocessed/2_bad_channel_corrected/'
# create a folder if the folder doesn't exist
os.makedirs(output_dir, exist_ok=True)

# filter cutoff frequencies (low/high)
f_low = 1
f_high = 100

# resampling frequency
f_res = 250

# line frequency
line_freq = 60

# preprocessing parameters
prep_params = {
    "ref_chs": 'eeg',
    "reref_chs": 'eeg',  # average re-reference
    "line_freqs": np.arange(line_freq, f_res / 2, line_freq),
}
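# quick check of the notch frequencies defined above (assumes the 60 Hz line frequency and
# 250 Hz resampling rate set in this cell):
print(np.arange(line_freq, f_res / 2, line_freq))  # [ 60. 120.]: the line frequency and its one harmonic below Nyquist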
# create a montage file for the pipeline
montage = mne.channels.make_standard_montage("standard_1020")

# interpolation method
# method = dict(eeg="spline")
####################################################################################################
#### Preprocessing (filtering, resampling, bad channel detection/interpolation, re-reference) ####
####################################################################################################
# get all file names in the folder
all_input = os.listdir(input_dir)
all_output = os.listdir(output_dir)

for file in all_input:
    if file.endswith("corr.fif") and (file.split('.')[0] + '_prep.fif' not in all_output):
        # read in file
        raw = mne.io.read_raw_fif(input_dir + file, preload=True)

        # set channel type
        raw.set_channel_types({'Fp1': 'eog', 'Fp2': 'eog'})

        # filter
        raw.filter(l_freq=f_low, h_freq=f_high)

        #### cut off the beginning and ending part ####
        # get the onset of the first and the last event
        events_from_annot, event_dict = mne.events_from_annotations(raw, verbose='WARNING')

        # define the beginning time (in seconds)
        crop_start = events_from_annot[0][0] / raw.info['sfreq'] - 10

        # define the ending time (in seconds)
        crop_end = events_from_annot[-1][0] / raw.info['sfreq'] + 10

        # crop the data
        raw.crop(
            tmin=max(crop_start, raw.times[0]),
            tmax=min(crop_end, raw.times[-1]),
        )
        # resample
        raw.resample(sfreq=f_res)

        # read in channel location info
        raw.set_montage(montage)

        #### Use PrepPipeline to preprocess ####
        '''
        1. detect and interpolate bad channels
        2. remove line noise
        3. re-reference
        '''
        # apply pyprep
        prep = PrepPipeline(raw, prep_params, montage, random_state=42)
        prep.fit()

        # export a txt file for the interpolated channel info
        with open(output_dir + 'bad_channel.txt', 'a+') as f:
            _ = f.write(
                file + ':\n' +
                "- Bad channels original: {}".format(prep.noisy_channels_original["bad_all"]) + '\n' +
                "- Bad channels after robust average reference: {}".format(prep.interpolated_channels) + '\n' +
                "- Bad channels after interpolation: {}".format(prep.still_noisy_channels) + '\n'
            )
        # save the pyprep preprocessed data
        raw = prep.raw

        # add back the reference channel
        raw = mne.add_reference_channels(raw, 'TP9')

        # add the channel loc info (for the newly added reference channel)
        raw.set_montage(montage)

        # save
        raw.save(output_dir + file.split('.')[0] + '_prep.fif')
4 ICA bad trial correction
4.1 parameters
# directory
input_dir = os.getcwd() + '/../preprocessed/2_bad_channel_corrected/'
output_dir = os.getcwd() + '/../preprocessed/3_ica/'
# create a folder if the folder doesn't exist
os.makedirs(output_dir, exist_ok=True)

# up to which IC you want to consider
ic_upto = 15
# ic_upto = 99

# get all file names in the folder
all_input = os.listdir(input_dir)
all_output = os.listdir(output_dir)
# loop over each file
for file in all_input:
    if file.endswith("prep.fif") and (file.split('.')[0] + '_ica.fif' not in all_output):
        # read in file
        raw = mne.io.read_raw_fif(input_dir + file, preload=True)

        # make a filtered copy for ICA; ICA works better on signals high-pass filtered at 1 Hz and low-pass filtered at 100 Hz
        raw_filt = raw.copy().filter(l_freq=1, h_freq=100)

        # apply a common average reference, to comply with the ICLabel requirements
        raw_filt.set_eeg_reference("average")

        # initialize ica parameters
        ica = mne.preprocessing.ICA(
            # n_components=0.999999,
            max_iter='auto',  # n-1
            # use the 'extended infomax' method for fitting the ICA, to comply with the ICLabel requirements
            method='infomax',
            fit_params=dict(extended=True),
            random_state=42,
        )
        # get the ica solution
        ica.fit(raw_filt, picks=['eeg'])

        # save the ica solution
        ica.save(output_dir + file.split('.')[0] + '_icaSolution.fif')

        # use ICLabel for automatic IC labeling
        ic_labels = label_components(raw_filt, ica, method="iclabel")

        # save
        with open(output_dir + file.split('.')[0] + '_icLabels.pickle', 'wb') as f:
            pickle.dump(ic_labels, f)

        # exclude bad ICs
        labels = ic_labels["labels"]
        exclude_idx = [
            idx for idx, label in enumerate(labels) if idx < ic_upto and label not in ["brain", "other"]
        ]
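        # illustration (hypothetical labels): if labels == ['brain', 'eye blink', 'brain',
        # 'muscle artifact', 'other'], then exclude_idx == [1, 3] -- only clearly artifactual ICs
        # among the first ic_upto components are removed, while 'brain' and 'other' ICs are kept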
        # ica.apply() changes the Raw object in-place
        ica.apply(raw, exclude=exclude_idx)

        # record the bad ICs in bad_ICs.txt
        with open(output_dir + '/bad_ICs.txt', 'a+') as f:
            _ = f.write(file + '\t' + str(exclude_idx) + '\n')

        # save data
        raw.save(output_dir + file.split('.')[0] + '_ica.fif')
5 ERP Segmentation
segmenting continuous EEG into epochs

- re-reference
- segmentation
5.1 parameters
# directory
input_dir = os.getcwd() + '/../preprocessed/3_ica/'
output_dir = os.getcwd() + '/../preprocessed/4_erp_epochs/'  # for ERP
# create a folder if the folder doesn't exist
os.makedirs(output_dir, exist_ok=True)

# epoch window:
erp_t_start = -0.2; erp_t_end = 0.8
baseline = (-0.2, 0)

# criteria to reject epochs
# reject_criteria = dict(eeg = 100e-6) # 100 µV
# reject_criteria = dict(eeg = 150e-6) # 150 µV
reject_criteria = dict(eeg=200e-6)  # 200 µV
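# note: MNE applies this reject dict to the peak-to-peak amplitude within each epoch, so a
# (hypothetical) epoch swinging from -160 µV to +50 µV spans 210 µV peak-to-peak and would be rejected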
# initialize a list for participants with too many bad trials
too_many_bad_trial_participants = []

# get file names
all_input = os.listdir(input_dir)
all_output = os.listdir(output_dir)

# re-reference, then epoch
for file in all_input:
    if file.endswith("ica.fif") and (file.split('_')[2] + '_' + file.split('_')[1] + '_epo.fif' not in all_output):
        # read in data
        raw = mne.io.read_raw_fif(input_dir + file, preload=True)

        # average-mastoids re-reference
        raw.set_eeg_reference(ref_channels=['TP9', 'TP10'])

        #### this is for source calculation ####
        # filter the data, optional
        # raw = raw.filter(l_freq=None, h_freq=30)
        # sphere = mne.make_sphere_model('auto', 'auto', raw.info)
        # src = mne.setup_volume_source_space(sphere=sphere, exclude=30., pos=15.)
        # forward = mne.make_forward_solution(raw.info, trans=None, src=src, bem=sphere)
        # raw = raw.set_eeg_reference('REST', forward=forward)
        ########################################

        # pick EEG channels
        # picks = mne.pick_types(raw.info, eeg = True)

        # get event info for segmentation
        events_from_annot, event_dict = mne.events_from_annotations(raw, verbose='WARNING')

        # segmentation for ERP
        epochs = mne.Epochs(
            raw,
            events=events_from_annot, event_id=event_dict,
            tmin=erp_t_start, tmax=erp_t_end,
            # apply baseline correction
            baseline=baseline,
            # remove epochs that meet the rejection criteria
            reject=reject_criteria,
            preload=True,
        )
        ##########################################################
        #### remove 0-trial events, and log segmentation info ####
        ppt = file.split('_')[2] + '_' + file.split('_')[1]

        for k, v in event_dict.items():
            # good trial count
            trial_count = len(epochs[k])

            # remove 0-trial events
            if trial_count == 0:
                del epochs.event_id[k]

            # good trial rate
            goodTrial_rate = round(trial_count / sum(events_from_annot[:, 2] == v), 2)

            # record epoch summary
            with open(output_dir + 'epoch_summary.txt', 'a+') as f:
                _ = f.write(ppt + '\t' + k + '\t' + str(trial_count) + '\t' + str(goodTrial_rate) + '\n')

            # mark a participant bad if any condition retains fewer than half of its trials
            if goodTrial_rate < 0.5:
                # mark the participant file as bad
                if ppt not in too_many_bad_trial_participants:
                    too_many_bad_trial_participants.append(ppt)
        ##########################################################
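        # worked example (hypothetical counts): 38 good epochs out of 80 presented trials gives a
        # rate of 0.48 < 0.5, so that participant would be flagged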
        # save single participant file
        epochs.save(output_dir + ppt + '_epo.fif', overwrite=True)

# export the record of bad participants
with open(output_dir + 'too_many_bad_trial_participants.txt', 'w') as f:
    for item in too_many_bad_trial_participants:
        f.write(item + '\n')
6 ERP average
6.1 parameters
# directory
input_dir = os.getcwd() + '/../preprocessed/4_erp_epochs/'
output_dir = os.getcwd() + '/../preprocessed/5_averaged/'
# create a folder if the folder doesn't exist
os.makedirs(output_dir, exist_ok=True)

#### get ERP ####
# get file names
all_input = os.listdir(input_dir)
all_output = os.listdir(output_dir)

# initialize a dictionary to store data
all_evokeds = {}

# for each file
for file in all_input:
    if file.endswith("_epo.fif"):
        # extract participant number
        ppt = file.split('_')[0] + '_' + file.split('_')[1]

        # read in data
        epochs = mne.read_epochs(input_dir + file, preload=True)

        # average | get the ERP for each condition
        evoked = epochs.average(by_event_type=True)

        # initialize a dictionary for the single-participant ERP
        all_evokeds[ppt] = {}

        # add a key for each condition for analysis
        for cond in evoked:
            # append the evoked data to the dictionary of evoked data
            all_evokeds[ppt][cond.comment] = cond

# save the ERP data
with open(output_dir + '/all_evokeds.pkl', 'wb') as f:
    pickle.dump(all_evokeds, f)

del all_evokeds