aa_mixer

Trying to map audio embeddings to vector spaces, for mixing.
install = False  # set to True to install the dependencies below; leave False to skip, e.g. when re-running in the same session
if install:     # ffmpeg is to add MP3 support to Colab
    !yes | sudo apt install ffmpeg 
    !pip install -Uqq einops gdown 
    !pip install -Uqq git+https://github.com/drscotthawley/aeiou
    !pip install -Uqq git+https://github.com/drscotthawley/audio-algebra

Basic setup of hardware environment

accelerator = accelerate.Accelerator()
hprint = HostPrinter(accelerator)  # print() that only runs on the main (host) process
device = accelerator.device
#device = torch.device('cuda' if torch.cuda.is_available() else 'mps' if torch.backends.mps.is_available() else 'cpu')
#if torch.backends.mps.is_available():
#    os.environ['PYTORCH_ENABLE_MPS_FALLBACK'] = '1'

print("device = ",device)

Main parameters for the run/model

seed = 2

args_dict = {'num_quantizers':0, 'sample_size': 65536, 'sample_rate':48000, 'latent_dim': 64, 'pqmf_bands':1, 'ema_decay':0.995}
#global_args = namedtuple("global_args", args_dict.keys())(*args_dict.values())
class DictObj:
    "Expose a dict's keys as object attributes (recursively)."
    def __init__(self, in_dict:dict):
        assert isinstance(in_dict, dict), "in_dict is not a dict"
        for key, val in in_dict.items():
            if isinstance(val, (list, tuple)):
                setattr(self, key, [DictObj(x) if isinstance(x, dict) else x for x in val])
            else:
                setattr(self, key, DictObj(val) if isinstance(val, dict) else val)

global_args = DictObj(args_dict)
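
Quick illustrative check that the dict keys are now attributes:

print(global_args.sample_rate, global_args.latent_dim)  # -> 48000 64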

Set Up Data Loading

hprint("Setting up dataset")
args = global_args
args.training_dir =  f'{os.getenv("HOME")}/datasets/BDCT-0-chunk-48000'
args.num_workers = 2


args.batch_size = 256

load_frac = 0.1
torch.manual_seed(seed)
train_set = AudioDataset([args.training_dir], load_frac=load_frac)
train_dl = torchdata.DataLoader(train_set, args.batch_size, shuffle=True,
                num_workers=args.num_workers, persistent_workers=True, pin_memory=True)

# TODO: val set should really be disjoint from train; for now it just reloads a smaller fraction of the same data
val_set = AudioDataset([args.training_dir], load_frac=load_frac/4)
val_dl = torchdata.DataLoader(val_set, args.batch_size, shuffle=False,
                num_workers=args.num_workers, persistent_workers=True, pin_memory=True)

torch.manual_seed(seed)
val_iter = iter(val_dl)
train_iter = iter(train_dl)

print("len(train_set), len(val_set) =",len(train_set), len(val_set))

And let’s listen to a bit of audio

batch = next(val_iter) 
batch = next(val_iter)  # two nexts bc i don't like the first one
print("batch.shape = ",batch.shape)
playable_spectrogram(batch[0], output_type='live') # clear this output later if you want to keep .ipynb file size small

Set up the Given [Auto]Encoder Model(s)

Note that initially we're only going to be using the encoder part. The decoder (with all of its sampling code, etc.) will be useful eventually, so we'll go ahead and define it, but FYI it won't be used at all while training the AA mixer model.

Download the checkpoint file for the DVAE

on_colab = os.path.exists('/content')
if on_colab:
    from google.colab import drive
    drive.mount('/content/drive/') 
    ckpt_file = '/content/drive/MyDrive/AI/checkpoints/epoch=53-step=200000.ckpt'
else:
    ckpt_file = 'checkpoint.ckpt'
    if not os.path.exists(ckpt_file):
        url = 'https://drive.google.com/file/d/1C3NMdQlmOcArGt1KL7pH32KtXVCOfXKr/view?usp=sharing'
        # downloading large files from GDrive requires special treatment to bypass the dialog button it wants to throw up
        id = url.split('/')[-2]
        cmd = f'wget --load-cookies /tmp/cookies.txt "https://docs.google.com/uc?export=download&confirm=$(wget --quiet --save-cookies /tmp/cookies.txt --keep-session-cookies --no-check-certificate \'https://docs.google.com/uc?export=download&id={id}\' -O- | sed -rn \'s/.*confirm=([0-9A-Za-z_]+).*/\\1\\n/p\')&id={id}" -O {ckpt_file} && rm -rf /tmp/cookies.txt'  # \\1 is escaped so the sed backreference survives Python string parsing
        print("cmd = \n",cmd)
        subprocess.run(cmd, shell=True, check=True)
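        # Alternative (untested sketch): since gdown was installed in the setup cell above,
        # it can handle Google Drive's confirmation step directly, e.g.:
        #   import gdown
        #   gdown.download(id=id, output=ckpt_file, quiet=False)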
given_model = DiffusionDVAE.load_from_checkpoint(ckpt_file, global_args=global_args)
given_model.eval() # disable randomness, dropout, etc...

# attach some arg values to the model 
given_model.demo_samples = global_args.sample_size 
given_model.quantized = global_args.num_quantizers > 0
given_model.to(device)
freeze(given_model)  # freeze the weights for inference
print("Given Autoencoder is ready to go!")

The AA-mixer model

Let's test that, by grabbing some stems and faders:

batch = next(train_iter) 
stems, faders, train_iter = get_stems_faders(batch, train_iter, train_dl, maxstems=2)
print("len(faders) = ",len(faders))

# artificially max out these stems! 
for i in range(len(faders)):
    faders[i] = 1/torch.abs(stems[i][0]).max() 

playable_spectrogram( stems[0][0]*faders[0], output_type='live')  # this is just the batch
playable_spectrogram( stems[1][0]*faders[1], output_type='live')  # this is something new
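
And a quick check that the fader trick did what we wanted, i.e. each scaled stem now peaks at 1.0:

for i in range(len(faders)):
    print(f"stem {i} peak after fader: {(stems[i][0]*faders[i]).abs().max().item():.3f}")  # expect ~1.0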

Mix and apply models

aa_use_bn = False  # batch norm? 
aa_use_resid = True # use residual connections? (doesn't make much difference tbh)
emb_dims = global_args.latent_dim # input size to aa model
hidden_dims = 64   # number of hidden dimensions in aa model. usually was 64
trivial = False  # aa_model is a no-op when this is true
debug = True 
print("emb_dims = ",emb_dims)

# untrained aa model
torch.manual_seed(seed+2)
#stems, faders, val_iter = get_stems_faders(batch, val_iter, val_dl)

aa_model = AudioAlgebra(dims=emb_dims, hidden_dims=hidden_dims, use_bn=aa_use_bn, resid=aa_use_resid, trivial=trivial).to(device) 
with torch.no_grad():
    zsum, zmix, archive = do_mixing(stems, faders, given_model, aa_model, device, debug=debug)
    
print("mix:")
playable_spectrogram( archive['mix'][0], output_type='live')
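
For orientation, here's a rough paraphrase (not the repo's actual code) of what do_mixing appears to compute, writing f for the frozen given encoder and g for aa_model:

# mix  = sum_i faders[i] * stems[i]                 # audio-domain mix
# ys   = [ f(faders[i]*stems[i]) for each stem ]    # given-encoder embedding per (faded) stem
# ysum = sum(ys);  ymix = f(mix)                    # "mixing" in the given embedding space
# zs   = [ g(y) for y in ys ]                       # re-mapped embeddings
# zsum = sum(zs);  zmix = g(ymix)                   # "mixing" in the learned aa space
# training goal: make zsum ≈ zmix, i.e. audio mixing becomes vector addition in z-space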

First, the effects of the given encoder \(f\)

def plot_emb_spectrograms(qs, labels, skip_ys=True):
    fig, ax = plt.subplots( 3 , 1, figsize=(10, 9))
    for i, (q, name) in enumerate(zip(qs, labels)):
        if i>2 and skip_ys: break
        row, col = i % 3, i//3
        im = tokens_spectrogram_image(q, mark_batches=True)
        newsize = (np.array(im.size) *800/im.size[0]).astype(int)
        im = im.resize(tuple(newsize))  # PIL's resize returns a new image; it doesn't resize in place
        ax[row].imshow(im)
        ax[row].axis('off')
        ax[row].set_title(labels[i])

    plt.tight_layout()
    plt.show()
    
    
ys, ymix, ysum = archive['ys'], archive['ymix'], archive['ysum']
diff = ysum - ymix
qs      = [ ymix,   ysum,  diff, ys[0], ys[1]]
labels =  ['ymix', 'ysum','diff := ysum - ymix', 'y0', 'y1', ]
print("ymix.shape = ",ymix.shape)
plot_emb_spectrograms(qs, labels)

So, at least with the data I can see right now, ymix and ysum can differ by what looks to be about 50% in places.
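
To put a rough number on that (an ad hoc measure, nothing rigorous):

rel = (ysum - ymix).abs().max() / ymix.abs().max()
print(f"max |ysum - ymix| relative to max |ymix|: {rel.item():.2f}")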

for i, (q, name) in enumerate(zip(qs, labels)):
    if i>2: break
    print(f"{name}:")
    show_pca_point_cloud(q, mode='lines+markers')

Now the z’s (note the model is untrained at this point)
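
Since do_mixing already returned zsum and zmix, one way to look at them is to reuse the same plotting helper (assuming the z's share the y's layout; and again, the model is untrained, so don't expect much structure):

zdiff = zsum - zmix
plot_emb_spectrograms([zmix, zsum, zdiff], ['zmix', 'zsum', 'zdiff := zsum - zmix'])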

Reconstruction / demo

Define Losses
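
The core constraint suggested by the setup above is that zsum and zmix should agree. A minimal sketch of that kind of loss (the actual losses used for training may differ and/or include extra terms) might look like:

import torch.nn.functional as F

def mix_consistency_loss(zsum, zmix):  # hypothetical name, for illustration only
    # penalize disagreement between the sum-of-stem embeddings and the mix embedding
    return F.mse_loss(zsum, zmix)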

Main run

Training loop

train_aa_model(debug=True)
if use_wandb: wandb.finish()