HiFi-GAN

HiFi-GAN：有效的、从 mel-spectrogram 生成高质量的 raw waveforms 模型。主要考虑了“语音信号是由不同周期的正弦组成”，在 GAN 模型的 generator 和 discriminator 分别利用了这一点，对音频的周期模式进行建模，从而提高了合成质量。

Paper

https://arxiv.org/abs/2010.05646

generator 框架图如下：

discriminator框架图如下：

代码

https://github.com/jik876/hifi-gan 官方代码

生成器 Generator

在详细解释 generator 之前，先介绍转置卷积和膨胀卷积这两种衍生的卷积操作。

转置卷积(Transpose Convolution)在语义分割或者对抗神经网络（GAN）中比较常见，其主要作用就是做上采样（UpSampling）。详细介绍可参考大佬讲解视频李沐–47 转置卷积【动手学深度学习v2】

转置卷积维度变化公式为：
out_dim = (in_dim - 1) * stride - 2 * padding + kernel_size；这个公式正好与卷积相反

膨胀卷积
在不引入额外参数的前提下可以任意扩大感受野，同时保持特征图的分辨率不变。注意设置合理的膨胀系数避免出现 Gridding Effect。

膨胀卷积维度变化公式为：
out_dim = floor((in_dim + 2 * padding - (kernel_size - 1) * dilation - 1) / stride) + 1

当想要维持输入输出维度不变时，即same padding，padding的参数计算为：

def get_padding(kernel_size, dilation=1):
    """Return the per-side padding that gives a stride-1 convolution "same" length.

    A dilated kernel covers ``dilation * (kernel_size - 1) + 1`` input
    positions, so padding each side with half of
    ``dilation * (kernel_size - 1)`` keeps the output as long as the input
    (exactly so when that product is even, e.g. for odd kernel sizes).

    Args:
        kernel_size: Number of taps in the convolution kernel.
        dilation: Spacing between kernel taps; 1 means no dilation.

    Returns:
        The padding as an ``int``, truncated toward zero.
    """
    return int((kernel_size * dilation - dilation) / 2)

generator

生成器全部由卷积构成，主要分为如下几个部分：

conv_pre：1维卷积，将 80 维的 mel 谱图映射到 512 维；

1
2

conv_pre = weight_norm(Conv1d(80, h.upsample_initial_channel, 7, 1, padding=3))
# Positional arguments: 80 input channels, h.upsample_initial_channel output
# channels, kernel size 7, stride 1.
# The kernel size of 7 only describes the extent along the time axis; assuming
# Conv1d is torch.nn.Conv1d with its default groups=1, every output filter also
# spans all 80 input channels, so one filter effectively covers a 7 x 80 window.

ups：上采样模块，由 4 个转置卷积 ConvTranspose1d 组成：卷积核大小依次为 16、16、4、4；stride（同时也是上采样率）依次为 8、8、2、2；channels 变化为 512–>256–>128–>64–>32。最终将 [512, T] 维度的特征映射为 [32, T*256] 维度的特征；总上采样倍数 8×8×2×2 = 256，正好等于提取 mel 谱时的 hop_size=256！
resblocks：论文中提到的 MRF（Multi-Receptive Field Fusion），利用不同卷积核大小和不同膨胀率的膨胀卷积得到不同感受野的输出。每个上采样模块之后接 3 个 resblock，卷积核大小分别是 [3, 7, 11]，膨胀率分别是 [[1, 3, 5], [1, 3, 5], [1, 3, 5]]，3 个 resblock 的输出取平均。具体过程如下：

# 1. 每个 ups 上采样模块之后都接了一个 resblocks;
# 2. resblocks共有3个resblock
# 3. resblock这样组成：由膨胀卷积`convs1`和没有膨胀的卷积`convs2`组成，forward过程中两者交替依次叠加，leaky_relu作用于每个卷积之前，每组（convs1 + convs2）的输出再与其输入相加（残差连接）。

# 代码如下：
class ResBlock1(torch.nn.Module):
    """Residual block used by the generator's multi-receptive-field fusion.

    The block chains three residual units. Each unit computes
    ``leaky_relu -> dilated conv -> leaky_relu -> plain conv`` and adds the
    result back onto the unit's input. Every convolution keeps the channel
    count, uses stride 1 and takes its padding from ``get_padding``.

    Args:
        h: Hyper-parameter object; this class only stores it on ``self.h``.
        channels: Input and output channel count of every convolution.
        kernel_size: Kernel size shared by all six convolutions.
        dilation: Sequence whose first three entries are the dilation rates
            of the three dilated convolutions in ``convs1``.
    """

    def __init__(self, h, channels, kernel_size=3, dilation=(1, 3, 5)):
        super().__init__()
        self.h = h

        # Dilated convolutions: a larger dilation widens the receptive field
        # without adding parameters. The output *length* (not the channel
        # count) follows
        #   L_out = (L_in + 2 * padding - dilation * (kernel_size - 1) - 1) / stride + 1
        # and get_padding() supplies the "same" padding for stride 1.
        self.convs1 = nn.ModuleList([
            weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=dilation[idx],
                               padding=get_padding(kernel_size, dilation[idx])))
            for idx in range(3)
        ])
        self.convs1.apply(init_weights)

        # Matching non-dilated convolutions, one per residual unit.
        self.convs2 = nn.ModuleList([
            weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=1,
                               padding=get_padding(kernel_size, 1)))
            for _ in range(3)
        ])
        self.convs2.apply(init_weights)

    def forward(self, x):
        """Run the three residual units in sequence and return the result."""
        for dilated_conv, plain_conv in zip(self.convs1, self.convs2):
            hidden = dilated_conv(F.leaky_relu(x, LRELU_SLOPE))
            hidden = plain_conv(F.leaky_relu(hidden, LRELU_SLOPE))
            # Residual connection around the pair of convolutions.
            x = hidden + x
        return x

    def remove_weight_norm(self):
        """Strip the weight-norm reparametrization from every convolution."""
        for conv_group in (self.convs1, self.convs2):
            for conv in conv_group:
                remove_weight_norm(conv)

conv_post：将 32 维特征映射到 1 维度
tanh：将值映射到【-1， 1】，正好对应 16 位音频采样点的取值范围【-32768，32767】

整个生成器 generator 详细代码如下：

class Generator(torch.nn.Module):
    """HiFi-GAN generator: fully convolutional mel-spectrogram-to-waveform net.

    Pipeline implemented below:

    1. ``conv_pre`` projects the 80 input channels to
       ``h.upsample_initial_channel`` channels.
    2. For every entry of ``h.upsample_rates`` a transposed convolution halves
       the channel count and stretches the time axis, followed by
       ``num_kernels`` residual blocks whose outputs are averaged.
    3. ``conv_post`` reduces to a single channel and ``tanh`` bounds the
       output to (-1, 1).

    Shape comments assume the configuration quoted in the accompanying notes
    (initial channels 512, upsample rates [8, 8, 2, 2], upsample kernels
    [16, 16, 4, 4], resblock kernels [3, 7, 11]) and an example input of
    shape [1, 80, 220].

    Args:
        h: Hyper-parameter object providing ``resblock``,
            ``resblock_kernel_sizes``, ``resblock_dilation_sizes``,
            ``upsample_rates``, ``upsample_kernel_sizes`` and
            ``upsample_initial_channel``.
    """

    def __init__(self, h):
        super().__init__()
        self.h = h
        self.num_kernels = len(h.resblock_kernel_sizes)
        self.num_upsamples = len(h.upsample_rates)

        initial_channels = h.upsample_initial_channel
        # Kernel size 7 along time; with padding 3 and stride 1 the number of
        # frames is unchanged.
        self.conv_pre = weight_norm(Conv1d(80, initial_channels, 7, 1, padding=3))

        if h.resblock == '1':
            resblock = ResBlock1
        else:
            resblock = ResBlock2

        # Upsampling stack of weight-normalized transposed convolutions.
        # Output length of a transposed convolution:
        #   L_out = (L_in - 1) * stride - 2 * padding + kernel_size
        # e.g. stride 8, kernel 16, padding 4: (220 - 1) * 8 - 8 + 16 = 220 * 8.
        self.ups = nn.ModuleList()
        for stage, (rate, kernel) in enumerate(zip(h.upsample_rates, h.upsample_kernel_sizes)):
            in_ch = initial_channels // (2 ** stage)
            out_ch = initial_channels // (2 ** (stage + 1))
            upsample = ConvTranspose1d(in_ch, out_ch, kernel, rate, padding=(kernel - rate) // 2)
            self.ups.append(weight_norm(upsample))

        # One residual block per (kernel size, dilation set) pair after every
        # upsampling stage, stored flat in stage-major order.
        self.resblocks = nn.ModuleList()
        for stage in range(len(self.ups)):
            ch = initial_channels // (2 ** (stage + 1))  # 256 -> 128 -> 64 -> 32
            for kernel, dilations in zip(h.resblock_kernel_sizes, h.resblock_dilation_sizes):
                self.resblocks.append(resblock(h, ch, kernel, dilations))

        # `ch` is left over from the loop above: the channel count of the last
        # upsampling stage.
        self.conv_post = weight_norm(Conv1d(ch, 1, 7, 1, padding=3))
        self.ups.apply(init_weights)
        self.conv_post.apply(init_weights)

    def forward(self, x):
        """Turn a mel-spectrogram batch into a waveform batch.

        Args:
            x: Tensor with 80 channels, e.g. [1, 80, 220].

        Returns:
            Tensor with a single channel and values in (-1, 1), e.g.
            [1, 1, 220 * 256] for the configuration quoted on the class.
        """
        x = self.conv_pre(x)  # [1, 512, 220]
        for stage in range(self.num_upsamples):
            # leaky_relu scales negative inputs by LRELU_SLOPE instead of
            # clamping them to zero.
            x = F.leaky_relu(x, LRELU_SLOPE)
            # [1, 512, 220] -> [1, 256, 220*8] -> [1, 128, 220*64]
            #   -> [1, 64, 220*128] -> [1, 32, 220*256]
            x = self.ups[stage](x)

            # Multi-receptive-field fusion: average the residual blocks that
            # belong to this stage.
            first = stage * self.num_kernels
            fused = None
            for offset in range(self.num_kernels):
                branch = self.resblocks[first + offset](x)
                if fused is None:
                    fused = branch
                else:
                    fused += branch
            x = fused / self.num_kernels

        # No slope argument here, so F.leaky_relu's own default negative slope
        # (0.01 in PyTorch) applies rather than LRELU_SLOPE.
        x = F.leaky_relu(x)
        x = self.conv_post(x)  # [1, 1, 220 * 256]
        # tanh bounds the samples to (-1, 1); presumably scaled to the 16-bit
        # PCM range by the caller - confirm against the inference script.
        return torch.tanh(x)

    def remove_weight_norm(self):
        """Strip weight normalization from every layer of the generator."""
        print('Removing weight norm...')
        for upsample in self.ups:
            remove_weight_norm(upsample)
        for block in self.resblocks:
            block.remove_weight_norm()
        remove_weight_norm(self.conv_pre)
        remove_weight_norm(self.conv_post)

生成器损失函数

L1 distance 的 Mel-Spectrogram Loss, 权重系数为 45
L1 distance 的 Feature Matching Loss（判别器特征的对比，MPD和MSD），权重系数为 2
原有 LS-GAN 的损失，好多个子鉴别器损失求和

判别器 Discriminator

multi-period discriminator (MPD)

由于语音信号很长，以 22.05 kHz 采样率为例，100ms 就有约 2200 个采样点，采样率越高采样点越多，所以识别长期依赖关系是模拟真实语音的关键。MPD主要的出发点是由于语音音频由不同周期的正弦信号组成，需要识别音频数据中的不同周期模式。MPD共包含五个子鉴别器（主要就是periods 参数不一样），periods 为[2, 3, 5, 7, 11]。

MPD单个子鉴别器构建的过程如下（上文中的图可以很好的表示清楚）：

按照period将原始1D波形变为2D数据；
Conv2d，增大channels，降低 t(时间维度)
fmap：记录各个卷积输出特征，用于后续训练生成器

维度变化为（以 batch_size=8、period=2 为例，8192 个采样点先变为 [8, 1, 4096, 2]）：

segment_size = 8192  # 表示仅仅利用 wav 中 8192 个点
# torch.Size([8, 32, 1366, 2])
# torch.Size([8, 128, 456, 2])
# torch.Size([8, 512, 152, 2])
# torch.Size([8, 1024, 51, 2])
# torch.Size([8, 1024, 51, 2])

单一MPD代码如下：

class DiscriminatorP(torch.nn.Module):
    """Single sub-discriminator of the multi-period discriminator (MPD).

    The 1-D waveform is folded into a 2-D map of shape
    [batch, channels, t // period, period]; all convolutions use a kernel and
    stride of width 1 on the last axis, so each of the ``period`` columns is
    processed independently and none of them is skipped.

    Example shapes for batch 8, 8192 samples, period 2 and the default
    kernel size / stride:
    [8, 1, 4096, 2] -> [8, 32, 1366, 2] -> [8, 128, 456, 2]
    -> [8, 512, 152, 2] -> [8, 1024, 51, 2] -> [8, 1024, 51, 2]
    -> conv_post [8, 1, 51, 2] -> flattened [8, 102].

    Args:
        period: Fold width used to reshape the waveform.
        kernel_size: Kernel height (along the folded time axis) of the main
            convolutions.
        stride: Stride along the folded time axis of the first four
            convolutions.
        use_spectral_norm: When ``False`` the layers are wrapped in
            ``weight_norm``; otherwise in ``spectral_norm``.
    """

    def __init__(self, period, kernel_size=5, stride=3, use_spectral_norm=False):
        super(DiscriminatorP, self).__init__()
        self.period = period
        norm_f = weight_norm if use_spectral_norm == False else spectral_norm

        # Derive the padding from kernel_size rather than the literal 5 / 2
        # used before, so a non-default kernel_size is padded consistently.
        # For the default kernel_size=5 this is still (2, 0).
        pad = (get_padding(kernel_size, 1), 0)

        channels = (1, 32, 128, 512, 1024)
        layers = [
            norm_f(Conv2d(c_in, c_out, (kernel_size, 1), (stride, 1), padding=pad))
            for c_in, c_out in zip(channels[:-1], channels[1:])
        ]
        # Final feature convolution keeps resolution (stride 1).
        layers.append(norm_f(Conv2d(1024, 1024, (kernel_size, 1), 1, padding=pad)))
        self.convs = nn.ModuleList(layers)
        self.conv_post = norm_f(Conv2d(1024, 1, (3, 1), 1, padding=(1, 0)))

    def forward(self, x):
        """Score a batch of waveforms.

        Args:
            x: Tensor of shape [batch, channels, t].

        Returns:
            Tuple ``(scores, fmap)``: ``scores`` is the ``conv_post`` output
            flattened to [batch, -1]; ``fmap`` lists the activated output of
            every convolution in ``self.convs`` followed by the raw
            ``conv_post`` output.
        """
        fmap = []

        # 1d to 2d
        b, c, t = x.shape
        if t % self.period != 0:  # pad first
            n_pad = self.period - (t % self.period)
            # (0, n_pad) pads only the right end of the time axis; "reflect"
            # mirrors the signal around its last sample.
            x = F.pad(x, (0, n_pad), "reflect")
            t = t + n_pad
        # Fold the time axis into rows of `period` consecutive samples.
        x = x.view(b, c, t // self.period, self.period)

        for l in self.convs:
            x = l(x)
            x = F.leaky_relu(x, LRELU_SLOPE)
            fmap.append(x)
        x = self.conv_post(x)
        fmap.append(x)
        x = torch.flatten(x, 1, -1)
        return x, fmap

整个判别器MPD代码如下：

class MultiPeriodDiscriminator(torch.nn.Module):
    """Multi-period discriminator: five ``DiscriminatorP`` with periods 2, 3, 5, 7, 11."""

    def __init__(self):
        super().__init__()
        self.discriminators = nn.ModuleList(
            [DiscriminatorP(period) for period in (2, 3, 5, 7, 11)]
        )

    def forward(self, y, y_hat):
        """Run every sub-discriminator on both inputs.

        Args:
            y: First input, passed to each sub-discriminator (the real audio
                in the GAN setup described by the surrounding notes).
            y_hat: Second input, passed to each sub-discriminator (the
                generated audio).

        Returns:
            Four lists with one entry per sub-discriminator, in this order:
            scores for ``y``, scores for ``y_hat``, feature maps for ``y``,
            feature maps for ``y_hat``.
        """
        real_scores, fake_scores = [], []
        real_fmaps, fake_fmaps = [], []
        for disc in self.discriminators:
            score_real, feats_real = disc(y)
            score_fake, feats_fake = disc(y_hat)
            real_scores.append(score_real)
            real_fmaps.append(feats_real)
            fake_scores.append(score_fake)
            fake_fmaps.append(feats_fake)

        return real_scores, fake_scores, real_fmaps, fake_fmaps

判别器损失

$\mathcal{L}_{A d v}(D ; G)=\mathbb{E}_{(x, s)}\left[(D(x)-1)^2+(D(G(s)))^2\right]$

$\mathcal{L}_{A d v}(G ; D)=\mathbb{E}_s\left[(D(G(s))-1)^2\right]$

这里损失函数并不是原始GAN论文里的交叉熵损失，而是利用了LS-GAN中的优化函数，求的是最小二乘损失。直观理解，真实值为1，生成值为0。在固定G优化D的过程中，使得 $\mathcal{L}_{A d v}(D ; G)$ 越小越好，即能更好的区分生成和真实；在固定D优化G的过程中，使得 $\mathcal{L}_{A d v}(G ; D)$ 越小越好，即生成的越来越接近1。

鉴别器损失函数计算如下：

def discriminator_loss(disc_real_outputs, disc_generated_outputs):
    """Compute the least-squares GAN loss for a set of sub-discriminators.

    Follows LS-GAN (Mao et al., "Least squares generative adversarial
    networks", ICCV 2017, pp. 2794-2802): outputs for real samples are pushed
    toward 1 and outputs for generated samples toward 0.

    Args:
        disc_real_outputs: Iterable of score tensors for real samples, one
            per sub-discriminator.
        disc_generated_outputs: Iterable of score tensors for generated
            samples, paired position-wise with ``disc_real_outputs``.

    Returns:
        Tuple ``(loss, r_losses, g_losses)``: ``loss`` is the sum over all
        pairs of ``mean((1 - real) ** 2) + mean(fake ** 2)`` (the int ``0``
        when there are no pairs); ``r_losses`` and ``g_losses`` hold the
        per-pair real and generated terms as Python floats.
    """
    loss = 0
    r_losses, g_losses = [], []
    for real_out, fake_out in zip(disc_real_outputs, disc_generated_outputs):
        real_term = torch.mean((1 - real_out) ** 2)  # target for real: 1
        fake_term = torch.mean(fake_out ** 2)        # target for generated: 0
        loss += (real_term + fake_term)
        r_losses.append(real_term.item())
        g_losses.append(fake_term.item())

    return loss, r_losses, g_losses

multi-scale discriminator (MSD)

由于MPD鉴别的都是不连续的采样点，故引入MSD来进行辅助，MSD主要是MelGAN提出来的。MSD is a mixture of three sub-discriminators operating on different input scales: raw audio, *2 average-pooled audio, and *4 average-pooled audio.

这里需要介绍 Grouped Convolutions

groups controls the connections between inputs and outputs. in_channels and out_channels must both be divisible by groups. For example,

At groups=1, all inputs are convolved to all outputs.
At groups=2, the operation becomes equivalent to having two conv layers side by side, each seeing half the input channels and producing half the output channels, and both subsequently concatenated.

单一MSD为

class DiscriminatorS(torch.nn.Module):
    """Single sub-discriminator of the multi-scale discriminator (MSD).

    A stack of strided, grouped 1-D convolutions over the waveform.

    About ``groups`` (see the torch.nn.Conv1d documentation): with
    ``groups=1`` every output channel is computed from all input channels;
    with ``groups=g`` the input channels are split into ``g`` groups and each
    output channel only sees one of them; ``groups=in_channels`` gives every
    input channel its own filters.

    Example shapes after each layer of ``convs`` for an input of
    [8, 1, 8192]: [8, 128, 8192], [8, 128, 4096], [8, 256, 2048],
    [8, 512, 512], [8, 1024, 128], [8, 1024, 128], [8, 1024, 128].

    Args:
        use_spectral_norm: When ``False`` the layers are wrapped in
            ``weight_norm``; otherwise in ``spectral_norm``.
    """

    def __init__(self, use_spectral_norm=False):
        super().__init__()
        if use_spectral_norm == False:
            norm_f = weight_norm
        else:
            norm_f = spectral_norm

        # (in_channels, out_channels, kernel_size, stride, groups, padding)
        layer_specs = [
            (1, 128, 15, 1, 1, 7),
            (128, 128, 41, 2, 4, 20),
            (128, 256, 41, 2, 16, 20),
            (256, 512, 41, 4, 16, 20),
            (512, 1024, 41, 4, 16, 20),
            (1024, 1024, 41, 1, 16, 20),
            (1024, 1024, 5, 1, 1, 2),
        ]
        self.convs = nn.ModuleList([
            norm_f(Conv1d(c_in, c_out, kernel, stride, groups=groups, padding=pad))
            for c_in, c_out, kernel, stride, groups, pad in layer_specs
        ])
        self.conv_post = norm_f(Conv1d(1024, 1, 3, 1, padding=1))

    def forward(self, x):
        """Score a batch of waveforms.

        Returns:
            Tuple ``(scores, fmap)``: ``scores`` is the ``conv_post`` output
            flattened to [batch, -1]; ``fmap`` lists the activated output of
            every convolution in ``self.convs`` followed by the raw
            ``conv_post`` output.
        """
        fmap = []
        for conv in self.convs:
            x = F.leaky_relu(conv(x), LRELU_SLOPE)
            fmap.append(x)
        x = self.conv_post(x)
        fmap.append(x)

        return torch.flatten(x, 1, -1), fmap

整个MSD代码如下：

class MultiScaleDiscriminator(torch.nn.Module):
    """Multi-scale discriminator: three ``DiscriminatorS`` at different scales.

    The first sub-discriminator is built with spectral normalization and sees
    the inputs unchanged. Before each later sub-discriminator the inputs are
    passed through ``AvgPool1d(4, 2, padding=2)``; the pooling is cumulative,
    so the third one sees the inputs pooled twice.
    """

    def __init__(self):
        super(MultiScaleDiscriminator, self).__init__()
        self.discriminators = nn.ModuleList([
            DiscriminatorS(use_spectral_norm=True),
            DiscriminatorS(),
            DiscriminatorS(),
        ])
        self.meanpools = nn.ModuleList([
            AvgPool1d(4, 2, padding=2),
            AvgPool1d(4, 2, padding=2)
        ])

    def forward(self, y, y_hat):
        """Run every sub-discriminator on both inputs at its own scale.

        Args:
            y: First input, e.g. [8, 1, 8192] (the real audio in the GAN
                setup described by the surrounding notes).
            y_hat: Second input with the same layout (the generated audio).

        Returns:
            Four lists with one entry per sub-discriminator, in this order:
            scores for ``y``, scores for ``y_hat``, feature maps for ``y``,
            feature maps for ``y_hat``.
        """
        y_d_rs = []
        y_d_gs = []
        fmap_rs = []
        fmap_gs = []
        for i, d in enumerate(self.discriminators):
            if i != 0:
                # Downsample both signals for the next, coarser scale. The
                # previous debug print of the pooled shape was removed so the
                # forward pass no longer writes to stdout.
                pool = self.meanpools[i - 1]
                y = pool(y)
                y_hat = pool(y_hat)
            y_d_r, fmap_r = d(y)
            y_d_g, fmap_g = d(y_hat)
            y_d_rs.append(y_d_r)
            fmap_rs.append(fmap_r)
            y_d_gs.append(y_d_g)
            fmap_gs.append(fmap_g)
        return y_d_rs, y_d_gs, fmap_rs, fmap_gs

Fine-Tuning

待续