---
# Training configuration with three top-level sections: `model` (DDPM
# diffusion model and its sub-module configs), `data` (data module and
# datasets) and `lightning` (callbacks and trainer options).
# NOTE(review): each `target` / `params` pair looks like a dotted class path
# plus its constructor arguments - confirm against the config loader.
# NOTE(review): the nesting below was reconstructed from a whitespace-collapsed
# copy of this file - verify it against the schema the loader expects.
version: "1.0.0"

model:
  # Keep plain decimal notation for small floats: some YAML 1.1 loaders read
  # dot-less exponent forms such as 1e-5 as strings.
  base_learning_rate: 0.00001
  target: mug.diffusion.diffusion.DDPM
  params:
    linear_start: 0.0001
    linear_end: 0.02
    log_every_t: 100
    timesteps: 1000
    z_channels: 16
    # NOTE(review): presumably max_note_frame (4096) / 2**num_down (2**3),
    # using the values documented further down in this file - TODO confirm.
    z_length: 512
    parameterization: eps
    loss_type: smooth_l1
    monitor: val/loss_simple

    # Denoising network.
    unet_config:
      target: mug.diffusion.unet.UNetModel
      params:
        in_channels: 16
        model_channels: 128
        out_channels: 16
        attention_resolutions: [8, 4, 2]
        num_res_blocks: 2
        channel_mult: [1, 2, 3, 4]
        num_heads: 8
        context_dim: 128
        dropout: 0.0
        lstm_last: false
        lstm_layer: false
        s4_layer: true
        audio_channels: [256, 512, 512, 512]
        use_checkpoint: false

    # First-stage autoencoder.
    first_stage_config:
      target: mug.firststage.autoencoder.AutoencoderKL
      params:
        monitor: "val/loss"
        kl_weight: 0.000001
        ddconfig:
          x_channels: 16  # key_count * 4
          middle_channels: 64
          z_channels: 16
          num_groups: 8
          channel_mult: [1, 2, 4, 4]  # num_down = len(ch_mult)-1
          num_res_blocks: 1
        lossconfig:
          target: torch.nn.Identity
          # Disabled alternative loss, kept for reference:
          # target: mug.firststage.losses.ManiaReconstructLoss
          # params:
          #   weight_start_offset: 0.5
          #   weight_holding: 0.5
          #   weight_end_offset: 0.2
          #   label_smoothing: 0.001

    # Beatmap-feature conditioning; embed_dim equals the UNet context_dim (128).
    cond_stage_config:
      target: mug.cond.feature.BeatmapFeatureEmbedder
      params:
        path_to_yaml: "configs/mug/mania_beatmap_features.yaml"
        embed_dim: 128

    # Audio (mel-spectrogram) encoder; n_freq equals data n_mels (128).
    wave_stage_config:
      target: mug.cond.wave.MelspectrogramScaleEncoder1D
      params:
        n_freq: 128
        middle_channels: 128
        attention_resolutions: [128, 256, 512]
        num_res_blocks: 2
        num_heads: 8
        num_groups: 32
        dropout: 0.0
        use_checkpoint: true
        channel_mult: [1, 1, 1, 1, 2, 2, 2, 4, 4, 4]

data:
  target: main.DataModuleFromConfig
  params:
    batch_size: 48
    wrap: false
    # num_workers: 0
    num_workers: 7
    # NOTE(review): presumably shared by the train and validation datasets -
    # confirm against main.DataModuleFromConfig.
    common_params:
      txt_file: []
      sr: 22050
      n_fft: 512
      max_audio_frame: 32768
      audio_note_window_ratio: 8
      n_mels: 128
      cache_dir: "data/audio_cache/"
      with_audio: true
      with_feature: true
      feature_yaml: "configs/mug/mania_beatmap_features.yaml"
      # Quantities derived from the values above:
      # audio_window_frame = n_fft / sr / 4 = 0.00580499 s
      # note_window_frame = audio_note_window_ratio * audio_window_frame = 0.04643990 s
      # max_duration = audio_window_frame * max_audio_frame = 190.2179 s = 3 min 10 s
      # max_note_frame = max_audio_frame / audio_note_window_ratio = 4096
      # old ===========
      # (figures from an earlier configuration; the inputs that produced them
      # are not recorded in this file)
      # audio_window_frame = n_fft / sr / 4 = 0.02321995 s
      # note_window_frame = audio_note_window_ratio * audio_window_frame = 0.04643990 s
      # max_duration = audio_window_frame * max_audio_frame = 380.4357 s = 6 min 20 s
      # max_note_frame = max_audio_frame / audio_note_window_ratio = 8192
    train:
      target: mug.data.dataset.OsuTrainDataset
      params:
        mirror_p: 0.5
        feature_dropout_p: 0.5
        mirror_at_interval_p: 0
        rate_p: 0.2
        rate: [0.75, 1.3]
        freq_mask_p: 0.0
        freq_mask_num: 15
    validation:
      target: mug.data.dataset.OsuValidDataset
      params: {}
      #   test_txt_file: "data\\mug\\local_mania_4k_test.txt"

lightning:
  callbacks:
    beatmap_logger:
      target: mug.data.dataset.BeatmapLogger
      params:
        log_batch_idx: [0]
        splits: ['val']
        count: 16

  trainer:
    benchmark: true
    # NOTE(review): `dp` as an accelerator value is presumably only accepted by
    # older PyTorch Lightning releases - confirm against the pinned version.
    accelerator: dp
    accumulate_grad_batches: 1
    # precision: 16