# Source code for nncore.nn.blocks.transformer

# Copyright (c) Ye Liu. Licensed under the MIT License.

from math import log

import torch
import torch.nn as nn

import nncore
from ..builder import MODELS, build_act_layer, build_norm_layer
from ..bundle import Parameter, Sequential
from ..init import kaiming_init_, xavier_init_



# [docs] (Sphinx viewcode link marker left over from HTML extraction)
@MODELS.register()
@nncore.bind_getter('dims', 'learnable', 'p', 'max_len')
class PositionalEncoding(nn.Module):
    """
    Positional Encoding introduced in [1].

    Note that :meth:`forward` returns only the positional encoding (after
    dropout), tiled along the first dimension of the input. It does not add
    the encoding to the input; callers are expected to do so themselves.

    Args:
        dims (int): The input feature dimensions. Both even and odd values
            are supported.
        learnable (bool, optional): Whether the positional encoding is
            learnable. If ``False``, a fixed sinusoidal table is registered
            as a buffer instead. Default: ``True``.
        p (float, optional): The dropout probability. Default: ``0.1``.
        max_len (int, optional): The maximum length of the input sequence.
            Default: ``5000``.

    References:
        1. Vaswani et al. (https://arxiv.org/abs/1706.03762)
    """

    def __init__(self, dims, learnable=True, p=0.1, max_len=5000):
        super(PositionalEncoding, self).__init__()

        self._dims = dims
        self._learnable = learnable
        self._p = p
        self._max_len = max_len

        if learnable:
            self.pe = Parameter(1, max_len, dims)
        else:
            # Column vector of positions: (max_len, 1).
            pos = torch.arange(max_len).unsqueeze(1)
            # One frequency per sine channel: ceil(dims / 2) entries.
            div = (torch.arange(0, dims, 2) * (-log(10000.0) / dims)).exp()
            pe = torch.zeros(1, max_len, dims)
            pe[0, :, 0::2] = (pos * div).sin()
            # When ``dims`` is odd there is one fewer cosine channel than
            # sine channels, so drop the surplus frequency. For even ``dims``
            # the slice is a no-op.
            pe[0, :, 1::2] = (pos * div[:dims // 2]).cos()
            self.register_buffer('pe', pe)

        self.dropout = nn.Dropout(p=p)

    def __repr__(self):
        return ('{}(dims={}, learnable={}, p={}, max_len={})'.format(
            self.__class__.__name__, self._dims, self._learnable, self._p,
            self._max_len))

    def forward(self, x):
        """
        Build the positional encoding matching the input.

        Args:
            x (:obj:`torch.Tensor`): The input tensor. Only ``x.size(0)`` and
                ``x.size(1)`` are read; presumably the layout is
                ``(batch, length, dims)`` -- confirm against callers.

        Returns:
            :obj:`torch.Tensor`: The encoding of shape \
                ``(x.size(0), min(x.size(1), max_len), dims)`` after dropout.
        """
        # Slice to the sequence length, then copy once per batch element.
        pe = self.pe[:, :x.size(1)].repeat(x.size(0), 1, 1)
        pe = self.dropout(pe)
        return pe




# [docs] (Sphinx viewcode link marker left over from HTML extraction)
@MODELS.register()
@nncore.bind_getter('dims', 'q_dims', 'k_dims', 'v_dims', 'h_dims', 'o_dims',
                    'heads', 'p', 'bias')
class MultiHeadAttention(nn.Module):
    """
    Multi-Head Attention introduced in [1].

    Args:
        dims (int): The input feature dimensions. This value is also used as
            the dimensions of query matrix (``q_dims``).
        k_dims (int | None, optional): The dimensions of key matrix. If not
            specified, it will be the same as ``dims``. Default: ``None``.
        v_dims (int | None, optional): The dimensions of value matrix. If not
            specified, it will be the same as ``dims``. Default: ``None``.
        h_dims (int | None, optional): The hidden dimensions. If not specified,
            it will be the same as ``dims``. Must be divisible by ``heads``.
            Default: ``None``.
        o_dims (int | None, optional): The output dimensions. If not specified,
            it will be the same as ``dims``. Default: ``None``.
        heads (int, optional): The number of attention heads. Default: ``8``.
        p (float, optional): The dropout probability. Default: ``0.1``.
        bias (bool, optional): Whether to add the bias term. Default: ``True``.

    Raises:
        ValueError: If the hidden dimensions are not divisible by ``heads``.

    References:
        1. Vaswani et al. (https://arxiv.org/abs/1706.03762)
    """

    def __init__(self,
                 dims,
                 k_dims=None,
                 v_dims=None,
                 h_dims=None,
                 o_dims=None,
                 heads=8,
                 p=0.1,
                 bias=True):
        super(MultiHeadAttention, self).__init__()

        # 'dims' is listed in ``bind_getter`` above. Every other class in
        # this file backs each bound name with a ``_<name>`` attribute, so
        # provide one here too (presumably the getter reads ``_dims``).
        self._dims = dims
        self._q_dims = dims
        self._k_dims = k_dims or dims
        self._v_dims = v_dims or dims
        self._h_dims = h_dims or dims
        self._o_dims = o_dims or dims
        self._heads = heads
        self._p = p
        self._bias = bias

        # Floor division would otherwise silently drop channels and make the
        # head reshaping in ``forward`` fail (or mis-align) much later.
        if self._h_dims % heads != 0:
            raise ValueError(
                'h_dims ({}) must be divisible by heads ({})'.format(
                    self._h_dims, heads))

        self._head_dims = self._h_dims // heads

        self.q = nn.Linear(self._q_dims, self._h_dims, bias=bias)
        self.k = nn.Linear(self._k_dims, self._h_dims, bias=bias)
        self.v = nn.Linear(self._v_dims, self._h_dims, bias=bias)
        self.m = nn.Linear(self._h_dims, self._o_dims, bias=bias)

        self.dropout = nn.Dropout(p=p)

        self.reset_parameters()

    def __repr__(self):
        return ('{}(q_dims={}, k_dims={}, v_dims={}, h_dims={}, o_dims={}, '
                'heads={}, p={}, bias={})'.format(self.__class__.__name__,
                                                  self._q_dims, self._k_dims,
                                                  self._v_dims, self._h_dims,
                                                  self._o_dims, self._heads,
                                                  self._p, self._bias))

    def reset_parameters(self):
        """Re-initialize the four projection layers with ``xavier_init_``."""
        for m in (self.q, self.k, self.v, self.m):
            xavier_init_(m)

    def forward(self, q, k=None, v=None, mask=None):
        """
        Compute scaled dot-product attention over multiple heads.

        Args:
            q (:obj:`torch.Tensor`): The query tensor, expected in shape
                ``(batch, q_len, q_dims)``.
            k (:obj:`torch.Tensor` | None, optional): The key tensor, expected
                in shape ``(batch, k_len, k_dims)``. Falls back to ``q`` if
                not a tensor. Default: ``None``.
            v (:obj:`torch.Tensor` | None, optional): The value tensor,
                expected in shape ``(batch, k_len, v_dims)``. Falls back to
                ``k`` and then to ``q`` if not a tensor. Default: ``None``.
            mask (:obj:`torch.Tensor` | None, optional): The key mask,
                expected in shape ``(batch, k_len)``. Keys whose entry is
                greater than ``0`` are attended to; all others are excluded.
                Default: ``None``.

        Returns:
            :obj:`torch.Tensor`: The attended features in shape \
                ``(batch, q_len, o_dims)``.
        """
        # Resolve ``v`` first, as it may depend on the caller-supplied ``k``.
        v = v if torch.is_tensor(v) else k if torch.is_tensor(k) else q
        k = k if torch.is_tensor(k) else q

        # Project and move the sequence axis first: (len, batch, h_dims).
        q = self.q(q).transpose(0, 1).contiguous()
        k = self.k(k).transpose(0, 1).contiguous()
        v = self.v(v).transpose(0, 1).contiguous()

        # Heads are folded into the batch axis (index = batch * heads + head).
        b = q.size(1) * self._heads

        # (len, batch * heads, head_dims) -> (batch * heads, len, head_dims)
        q = q.view(-1, b, self._head_dims).transpose(0, 1)
        k = k.view(-1, b, self._head_dims).transpose(0, 1)
        v = v.view(-1, b, self._head_dims).transpose(0, 1)

        # Scaled similarity scores: (batch * heads, q_len, k_len).
        att = torch.bmm(q, k.transpose(1, 2)) / self._head_dims**0.5

        if mask is not None:
            # Turn the keep-mask into an additive bias (0 or -inf).
            mask = torch.where(mask > 0, .0, float('-inf'))
            # Match the folded batch ordering used above.
            mask = mask.repeat_interleave(self._heads, dim=0)
            # Broadcast the same key mask over every query position.
            # NOTE(review): a row with every key masked is all -inf, which
            # presumably softmaxes to NaN -- confirm callers never do this.
            att += mask.unsqueeze(1)

        att = att.softmax(-1)

        if self.dropout is not None:
            att = self.dropout(att)

        # Weighted sum of values, then unfold the heads back into channels.
        m = torch.bmm(att, v).transpose(0, 1).contiguous()
        m = m.view(m.size(0), -1, self._h_dims).transpose(0, 1)
        m = self.m(m)

        if self.dropout is not None:
            m = self.dropout(m)

        return m




# [docs] (Sphinx viewcode link marker left over from HTML extraction)
@MODELS.register()
@nncore.bind_getter('dims', 'ratio', 'p')
class FeedForwardNetwork(nn.Module):
    """
    Feed Forward Network introduced in [1].

    The network is a two-layer perceptron: a linear expansion to
    ``int(dims * ratio)`` channels, an activation, dropout, a linear
    projection back to ``dims`` channels, and a final dropout.

    Args:
        dims (int): The input feature dimensions.
        ratio (float, optional): The ratio of hidden layer dimensions with
            respect to the input dimensions. Default: ``4``.
        p (float, optional): The dropout probability. Default: ``0.1``.
        act_cfg (dict | str | None, optional): The config or name of the
            activation layer. Default: ``dict(type='ReLU', inplace=True)``.

    References:
        1. Vaswani et al. (https://arxiv.org/abs/1706.03762)
    """

    def __init__(self,
                 dims,
                 ratio=4,
                 p=0.1,
                 act_cfg=dict(type='ReLU', inplace=True)):
        super(FeedForwardNetwork, self).__init__()

        self._dims, self._ratio, self._p = dims, ratio, p
        self._h_dims = int(dims * ratio)

        # Layers are created in the order they are applied.
        expand = nn.Linear(dims, self._h_dims)
        activation = build_act_layer(act_cfg)
        hidden_drop = nn.Dropout(p=p)
        project = nn.Linear(self._h_dims, dims)
        output_drop = nn.Dropout(p=p)

        self.mapping = Sequential(expand, activation, hidden_drop, project,
                                  output_drop)

        self.reset_parameters()

    def __repr__(self):
        template = '{name}(dims={dims}, ratio={ratio}, p={p})'
        return template.format(
            name=self.__class__.__name__,
            dims=self._dims,
            ratio=self._ratio,
            p=self._p)

    def reset_parameters(self):
        """Re-initialize every linear layer with ``kaiming_init_``."""
        linear_layers = (
            layer for layer in self.mapping if isinstance(layer, nn.Linear))
        for layer in linear_layers:
            kaiming_init_(layer)

    def forward(self, x):
        """Apply the feed forward mapping to ``x`` and return the result."""
        return self.mapping(x)




# [docs] (Sphinx viewcode link marker left over from HTML extraction)
@MODELS.register()
@nncore.bind_getter('dims', 'heads', 'ratio', 'p', 'pre_norm')
class TransformerEncoderLayer(nn.Module):
    """
    Transformer Encoder Layer introduced in [1].

    The layer consists of a self-attention block followed by a feed forward
    block, each wrapped in a residual connection with normalization applied
    either before (``pre_norm=True``) or after (``pre_norm=False``) it.

    Args:
        dims (int): The input feature dimensions.
        heads (int, optional): The number of attention heads. Default: ``8``.
        ratio (float, optional): The ratio of hidden layer dimensions in the
            feed forward network. Default: ``4``.
        p (float, optional): The dropout probability. Default: ``0.1``.
        pre_norm (bool, optional): Whether to apply the normalization before
            instead of after each layer. Default: ``True``.
        norm_cfg (dict | str | None, optional): The config or name of the
            normalization layer. Default: ``dict(type='LN')``.
        act_cfg (dict | str | None, optional): The config or name of the
            activation layer. Default: ``dict(type='ReLU', inplace=True)``.

    References:
        1. Vaswani et al. (https://arxiv.org/abs/1706.03762)
    """

    def __init__(self,
                 dims,
                 heads=8,
                 ratio=4,
                 p=0.1,
                 pre_norm=True,
                 norm_cfg=dict(type='LN'),
                 act_cfg=dict(type='ReLU', inplace=True)):
        super(TransformerEncoderLayer, self).__init__()

        self._dims, self._heads = dims, heads
        self._ratio, self._p = ratio, p
        self._pre_norm = pre_norm

        # Sub-layers.
        self.att = MultiHeadAttention(dims, heads=heads, p=p)
        self.ffn = FeedForwardNetwork(dims, ratio=ratio, p=p, act_cfg=act_cfg)

        # One normalization layer per sub-layer.
        self.norm1 = build_norm_layer(norm_cfg, dims=dims)
        self.norm2 = build_norm_layer(norm_cfg, dims=dims)

    def forward(self, x, pe=None, mask=None):
        """
        Run self-attention and the feed forward network over ``x``.

        Args:
            x (:obj:`torch.Tensor`): The input features.
            pe (:obj:`torch.Tensor` | None, optional): The positional
                encoding added to queries and keys only (values are left
                untouched). Default: ``None``.
            mask (:obj:`torch.Tensor` | None, optional): The mask forwarded
                to the attention module. Default: ``None``.

        Returns:
            :obj:`torch.Tensor`: The encoded features.
        """
        if not self._pre_norm:
            # Post-norm: normalize the sum of each residual branch.
            query = x if pe is None else x + pe
            x = self.norm1(x + self.att(query, query, x, mask=mask))
            x = self.norm2(x + self.ffn(x))
            return x

        # Pre-norm: normalize the input of each residual branch.
        value = self.norm1(x)
        query = value if pe is None else value + pe
        x = x + self.att(query, query, value, mask=mask)
        x = x + self.ffn(self.norm2(x))
        return x




# [docs] (Sphinx viewcode link marker left over from HTML extraction)
@MODELS.register()
@nncore.bind_getter('dims', 'heads', 'ratio', 'p', 'pre_norm')
class TransformerDecoderLayer(nn.Module):
    """
    Transformer Decoder Layer introduced in [1].

    The layer chains three residual blocks: self-attention over the input,
    attention from the input to the memory, and a feed forward network.
    Normalization is applied either before (``pre_norm=True``) or after
    (``pre_norm=False``) each block.

    Args:
        dims (int): The input feature dimensions.
        heads (int, optional): The number of attention heads. Default: ``8``.
        ratio (int, optional): The ratio of hidden layer dimensions in the
            feed forward network. Default: ``4``.
        p (float, optional): The dropout probability. Default: ``0.1``.
        pre_norm (bool, optional): Whether to apply the normalization before
            instead of after each layer. Default: ``True``.
        norm_cfg (dict | str | None, optional): The config or name of the
            normalization layer. Default: ``dict(type='LN')``.
        act_cfg (dict | str | None, optional): The config or name of the
            activation layer. Default: ``dict(type='ReLU', inplace=True)``.

    References:
        1. Vaswani et al. (https://arxiv.org/abs/1706.03762)
    """

    def __init__(self,
                 dims,
                 heads=8,
                 ratio=4,
                 p=0.1,
                 pre_norm=True,
                 norm_cfg=dict(type='LN'),
                 act_cfg=dict(type='ReLU', inplace=True)):
        super(TransformerDecoderLayer, self).__init__()

        self._dims, self._heads = dims, heads
        self._ratio, self._p = ratio, p
        self._pre_norm = pre_norm

        # att1: self-attention, att2: attention over the memory.
        self.att1 = MultiHeadAttention(dims, heads=heads, p=p)
        self.att2 = MultiHeadAttention(dims, heads=heads, p=p)
        self.ffn = FeedForwardNetwork(dims, ratio=ratio, p=p, act_cfg=act_cfg)

        # One normalization layer per residual block.
        self.norm1 = build_norm_layer(norm_cfg, dims=dims)
        self.norm2 = build_norm_layer(norm_cfg, dims=dims)
        self.norm3 = build_norm_layer(norm_cfg, dims=dims)

    def forward(self, x, mem, q_pe=None, k_pe=None, q_mask=None, k_mask=None):
        """
        Decode ``x`` while attending to the memory ``mem``.

        Args:
            x (:obj:`torch.Tensor`): The input (query) features.
            mem (:obj:`torch.Tensor`): The memory features used as keys and
                values of the second attention module.
            q_pe (:obj:`torch.Tensor` | None, optional): The positional
                encoding added to the queries (and to the keys of the
                self-attention). Default: ``None``.
            k_pe (:obj:`torch.Tensor` | None, optional): The positional
                encoding added to the memory keys. Default: ``None``.
            q_mask (:obj:`torch.Tensor` | None, optional): The mask forwarded
                to the self-attention module. Default: ``None``.
            k_mask (:obj:`torch.Tensor` | None, optional): The mask forwarded
                to the memory attention module. Default: ``None``.

        Returns:
            :obj:`torch.Tensor`: The decoded features.
        """
        # The memory keys are identical in both normalization modes.
        mem_key = mem if k_pe is None else mem + k_pe

        if not self._pre_norm:
            # Post-norm: normalize the sum of each residual branch.
            query = x if q_pe is None else x + q_pe
            x = self.norm1(x + self.att1(query, query, x, mask=q_mask))

            query = x if q_pe is None else x + q_pe
            x = self.norm2(x + self.att2(query, mem_key, mem, mask=k_mask))

            x = self.norm3(x + self.ffn(x))
            return x

        # Pre-norm: normalize the input of each residual branch.
        value = self.norm1(x)
        query = value if q_pe is None else value + q_pe
        x = x + self.att1(query, query, value, mask=q_mask)

        query = self.norm2(x)
        if q_pe is not None:
            query = query + q_pe
        x = x + self.att2(query, mem_key, mem, mask=k_mask)

        x = x + self.ffn(self.norm3(x))
        return x




# [docs] (Sphinx viewcode link marker left over from HTML extraction)
@MODELS.register()
@nncore.bind_getter('dims', 'heads', 'ratio', 'p', 'pre_norm')
class CrossAttentionLayer(nn.Module):
    """
    Cross Attention Layer.

    Two streams ``a`` and ``b`` attend to each other: ``a`` queries the
    original ``b`` and ``b`` queries the original ``a``. Each stream then
    passes through its own feed forward network.

    Args:
        dims (int): The input feature dimensions.
        heads (int, optional): The number of attention heads. Default: ``8``.
        ratio (int, optional): The ratio of hidden layer dimensions in the
            feed forward network. Default: ``4``.
        p (float, optional): The dropout probability. Default: ``0.1``.
        pre_norm (bool, optional): Whether to apply the normalization before
            instead of after each layer. Default: ``True``.
        norm_cfg (dict | str | None, optional): The config or name of the
            normalization layer. Default: ``dict(type='LN')``.
        act_cfg (dict | str | None, optional): The config or name of the
            activation layer. Default: ``dict(type='ReLU', inplace=True)``.
    """

    def __init__(self,
                 dims,
                 heads=8,
                 ratio=4,
                 p=0.1,
                 pre_norm=True,
                 norm_cfg=dict(type='LN'),
                 act_cfg=dict(type='ReLU', inplace=True)):
        super(CrossAttentionLayer, self).__init__()

        self._dims, self._heads = dims, heads
        self._ratio, self._p = ratio, p
        self._pre_norm = pre_norm

        # att1 / ffn1 update stream ``a``; att2 / ffn2 update stream ``b``.
        self.att1 = MultiHeadAttention(dims, heads=heads, p=p)
        self.att2 = MultiHeadAttention(dims, heads=heads, p=p)
        self.ffn1 = FeedForwardNetwork(dims, ratio=ratio, p=p, act_cfg=act_cfg)
        self.ffn2 = FeedForwardNetwork(dims, ratio=ratio, p=p, act_cfg=act_cfg)

        # norm1 / norm3 belong to stream ``a``; norm2 / norm4 to stream ``b``.
        self.norm1 = build_norm_layer(norm_cfg, dims=dims)
        self.norm2 = build_norm_layer(norm_cfg, dims=dims)
        self.norm3 = build_norm_layer(norm_cfg, dims=dims)
        self.norm4 = build_norm_layer(norm_cfg, dims=dims)

    def forward(self, a, b, a_mask=None, b_mask=None):
        """
        Exchange information between the two streams.

        Args:
            a (:obj:`torch.Tensor`): The features of the first stream.
            b (:obj:`torch.Tensor`): The features of the second stream.
            a_mask (:obj:`torch.Tensor` | None, optional): The mask used when
                ``b`` attends to ``a``. Default: ``None``.
            b_mask (:obj:`torch.Tensor` | None, optional): The mask used when
                ``a`` attends to ``b``. Default: ``None``.

        Returns:
            tuple[:obj:`torch.Tensor`]: The updated ``a`` and ``b``.
        """
        # Keys and values always come from the unmodified inputs, so the
        # update of ``a`` does not leak into the update of ``b``.
        a_src, b_src = a, b

        if not self._pre_norm:
            # Post-norm: normalize the sum of each residual branch.
            a = self.norm1(a + self.att1(a, b_src, b_src, mask=b_mask))
            b = self.norm2(b + self.att2(b, a_src, a_src, mask=a_mask))

            a = self.norm3(a + self.ffn1(a))
            b = self.norm4(b + self.ffn2(b))
            return a, b

        # Pre-norm: only the queries are normalized before attention.
        a = a + self.att1(self.norm1(a), b_src, b_src, mask=b_mask)
        b = b + self.att2(self.norm2(b), a_src, a_src, mask=a_mask)

        a = a + self.ffn1(self.norm3(a))
        b = b + self.ffn2(self.norm4(b))
        return a, b