= False # can set to false to skip this part, e.g. for re-running in same session
install if install: # ffmpeg is to add MP3 support to Colab
!yes | sudo apt install ffmpeg
!pip install -Uqq einops gdown
!pip install -Uqq git+https://github.com/drscotthawley/aeiou
!pip install -Uqq git+https://github.com/drscotthawley/audio-algebra
aa_effects
Trying to map audio embeddings to vector spaces, for learning effects
Basic setup of hardware environment
# Create the accelerator and take the compute device from it.
accelerator = accelerate.Accelerator()
hprint = HostPrinter(accelerator)  # this just prints only on interactive node
device = accelerator.device
#device = torch.device('cuda' if torch.cuda.is_available() else 'mps' if torch.backends.mps.is_available() else 'cpu')
#if torch.backends.mps.is_available():
#    os.environ['PYTORCH_ENABLE_MPS_FALLBACK'] = '1'
print("device = ",device)
device = cuda
Main parameters for the run/model
seed = 2  # base value handed to torch.manual_seed() elsewhere in this notebook

# Run/model parameters. The original literal listed 'num_quantizers' twice with
# the same value; the redundant entry is dropped and the resulting dict is unchanged.
args_dict = {
    'num_quantizers': 0,
    'sample_size': 65536,
    'sample_rate': 48000,
    'latent_dim': 64,
    'pqmf_bands': 1,
    'ema_decay': 0.995,
}
#global_args = namedtuple("global_args", args_dict.keys())(*args_dict.values())
class DictObj:
    """Attribute-style view of a (possibly nested) dict.

    Every key of ``in_dict`` becomes an attribute on the instance. Nested
    dicts are wrapped recursively. For list or tuple values, each dict
    element is wrapped and the container is stored as a list.
    """

    def __init__(self, in_dict: dict):
        assert isinstance(in_dict, dict), "in_dict is not a dict"
        for key, val in in_dict.items():
            if isinstance(val, (list, tuple)):
                # Tuples come back as lists; non-dict elements are kept as-is.
                setattr(self, key, [DictObj(x) if isinstance(x, dict) else x for x in val])
            else:
                setattr(self, key, DictObj(val) if isinstance(val, dict) else val)
global_args = DictObj(args_dict)  # attribute access to the run parameters, e.g. global_args.sample_size
Set Up Data Loading
# Ask the `taskset` utility to set this process's CPU affinity list to 0-48.
os.system("taskset -c -p 0-48 %d" % os.getpid())
pid 1306483's current affinity list: 0,48
pid 1306483's new affinity list: 0-48
0
# Number of CPUs the current process (pid 0 = this process) is allowed to run on.
len(os.sched_getaffinity(0))
49
hprint("Setting up dataset")
args = global_args  # alias: the attributes assigned below are set on global_args itself
args.training_dir = f'{os.getenv("HOME")}/datasets/BDCT-0-chunk-48000'
args.num_workers = 48
args.batch_size = 32
load_frac = 0.01  # forwarded to DualEffectsDataset(load_frac=...)

torch.manual_seed(seed)
train_set = DualEffectsDataset([args.training_dir], load_frac=load_frac)
train_dl = torchdata.DataLoader(train_set, args.batch_size, shuffle=True,
                num_workers=args.num_workers, persistent_workers=True, pin_memory=True)

# TODO: need to make val unique. for now just repeat train
val_set = DualEffectsDataset([args.training_dir], load_frac=load_frac/4, filenames=train_set.filenames) # reuse filenames for testing, speed
# NOTE(review): val_dl wraps train_set, not val_set, so val_set is only used for
# the len() printout below. Left as-is to preserve behavior -- confirm intent.
val_dl = torchdata.DataLoader(train_set, args.batch_size, shuffle=False,
                num_workers=args.num_workers, persistent_workers=True, pin_memory=True)

torch.manual_seed(seed)
val_iter = iter(val_dl)
train_iter = iter(train_dl)

print("len(train_set), len(val_set) =",len(train_set), len(val_set))
Setting up dataset
augs = Stereo(), PhaseFlipper()
effects_list = ['Gain', 'BandPassFilter', 'BandStopFilter', 'HighPassFilter', 'LowPassFilter', 'PitchShift', 'Reverse', 'RoomSimulator', 'TanhDistortion']
AudioDataset:1841824 files found.
augs = Stereo(), PhaseFlipper()
effects_list = ['Gain', 'BandPassFilter', 'BandStopFilter', 'HighPassFilter', 'LowPassFilter', 'PitchShift', 'Reverse', 'RoomSimulator', 'TanhDistortion']
AudioDataset:18418 files found.
len(train_set), len(val_set) = 18418 46
And let's listen to a bit of audio
batch = next(val_iter)
#batch
batch = next(val_iter)  # second draw: the batch used from here on is the second one from val_iter
print("batch.shape = ",batch["a"].shape)
playable_spectrogram(batch['a'][0], output_type='live') # clear this output later if you want to keep .ipynb file size small
batch.shape = torch.Size([32, 2, 65536])
Set up the Given [Auto]Encoder Model(s)
Note that initially we're only going to be using the encoder part. The decoder — with all of its sampling code, etc. — will be useful eventually, and we'll go ahead and define it. But FYI, it won't be used at all while training the AA mixer model.
Download the checkpoint file for the dvae
# Locate the autoencoder checkpoint, downloading it from Google Drive when it is missing locally.
on_colab = os.path.exists('/content')
if on_colab:
    from google.colab import drive
    drive.mount('/content/drive/')
    ckpt_file = '/content/drive/MyDrive/AI/checkpoints/epoch=53-step=200000.ckpt'
else:
    ckpt_file = '/fsx/shawley/checkpoints/dvae_checkpoint.ckpt'
    if not os.path.exists(ckpt_file):
        url = 'https://drive.google.com/file/d/1C3NMdQlmOcArGt1KL7pH32KtXVCOfXKr/view?usp=sharing'
        # downloading large files from GDrive requires special treatment to bypass the dialog button it wants to throw up
        file_id = url.split('/')[-2]  # named file_id (not `id`) so the builtin id() is not shadowed
        cmd = (
            'wget --load-cookies /tmp/cookies.txt '
            '"https://docs.google.com/uc?export=download&confirm=$('
            'wget --quiet --save-cookies /tmp/cookies.txt --keep-session-cookies --no-check-certificate '
            f"'https://docs.google.com/uc?export=download&id={file_id}' -O- "
            # Raw string: sed must receive a literal \1 back-reference. In a
            # non-raw string "\1" is an octal escape that yields chr(1).
            r"| sed -rn 's/.*confirm=([0-9A-Za-z_]+).*/\1\n/p'"
            f')&id={file_id}" -O {ckpt_file} && rm -rf /tmp/cookies.txt'
        )
        print("cmd = \n",cmd)
        # shell=True is required: the command relies on $(...) substitution and &&.
        subprocess.run(cmd, shell=True, check=True)
given_model = DiffusionDVAE.load_from_checkpoint(ckpt_file, global_args=global_args)
given_model.eval() # disable randomness, dropout, etc...

# attach some arg values to the model
given_model.demo_samples = global_args.sample_size
given_model.quantized = global_args.num_quantizers > 0
given_model.to(device)
freeze(given_model) # freeze the weights for inference
print("Given Autoencoder is ready to go!")
Given Autoencoder is ready to go!
The AA model
Mix and apply models
aa_use_bn = False  # batch norm?
aa_use_resid = True  # use residual connections? (doesn't make much difference tbh)
emb_dims = global_args.latent_dim  # input size to aa model
hidden_dims = 64  # number of hidden dimensions in aa model. usually was 64
trivial = False  # aa_model is a no-op when this is true
debug = True
print("emb_dims = ",emb_dims)

# untrained aa model
torch.manual_seed(seed+2)
#stems, faders, val_iter = get_stems_faders(batch, val_iter, val_dl)
aa_model = AudioAlgebra(dims=emb_dims, hidden_dims=hidden_dims, use_bn=aa_use_bn, resid=aa_use_resid, trivial=trivial).to(device)
with torch.no_grad():
    archive = do_mixing(batch, given_model, aa_model, device)
emb_dims = 64
First, the effects of the given encoder \(f\)
def plot_emb_spectrograms(qs, labels, skip_ys=True):
    """Draw token-spectrogram images of embeddings in a 3-row figure.

    Args:
        qs: sequence of embeddings; each one is rendered with
            ``tokens_spectrogram_image(q, mark_batches=True)``.
        labels: titles, paired one-to-one with ``qs``.
        skip_ys: when True, stop after the first three entries.
    """
    _fig, axes = plt.subplots(3, 1, figsize=(10, 9))
    for i, (q, name) in enumerate(zip(qs, labels)):
        if i > 2 and skip_ys:
            break
        row = i % 3  # only three axes exist; entries past the third reuse rows 0..2
        im = tokens_spectrogram_image(q, mark_batches=True)
        # Scale to 800 px wide, keeping the aspect ratio. The original called
        # im.resize(...) and discarded the result, so no resize took place.
        # Assumes a PIL-style image whose resize() returns a new image and
        # expects a tuple of ints -- TODO confirm.
        newsize = tuple(int(v) for v in np.array(im.size) * 800 / im.size[0])
        im = im.resize(newsize)
        axes[row].imshow(im)
        axes[row].axis('off')
        axes[row].set_title(name)
    plt.tight_layout()
    plt.show()
#ys, ymix, ysum = archive['ys'], archive['ymix'], archive['ysum']
#diff = ysum - ymix
#qs = [ ymix, ysum, diff, ys[0], ys[1]]
#labels = ['ymix', 'ysum','diff := ysum - ymix', 'y0', 'y1', ]
#print("ymix.shape = ",ymix.shape)
#plot_emb_spectrograms(qs, labels)
...So, at least using the data I can see right now, ymix and ysum can differ by what looks to be 50% in places.
#for i, (q, name) in enumerate(zip(qs, labels)):
# if i>2: break
# print(f"{name}:")
# show_pca_point_cloud(q, mode='lines+markers')
Now the z's (note the model is untrained at this point)
Reconstruction /demo
Define Losses
Main run
Training loop
train_aa_model(debug=True)
total_steps = 23000
Setting up AA model using device: cuda
Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
wandb: Currently logged in as: drscotthawley. Use `wandb login --relogin` to force relogin
wandb version 0.13.9 is available! To upgrade, please run:
$ pip install wandb --upgrade
Tracking run with wandb version 0.13.8
Run data is saved locally in
/fsx/shawley/code/audio-algebra/wandb/run-20230116_072622-lb0sa9bz
View project at https://wandb.ai/drscotthawley/aa-effects
Epoch 1/40: 100%|ββββββββββ| 576/576 [05:37<00:00, 1.71batch/s, loss=12.2]
Epoch 2/40: 31%|ββββ | 180/576 [01:49<03:39, 1.80batch/s, loss=11.7]
# Close out the Weights & Biases run (presumably the one started by train_aa_model -- verify).
wandb.finish()