Seq2Seq / ゼロから作る Deep Learning 2

    Posted on 2019/01/26

    7章: RNNによる文章生成

    ゼロから作るDeep Learning (2)の読書メモです。6章ではゲートと呼ばれる仕組みを導入することで長期的な依存関係を学習できる LSTM の実装について学びました。7章では前章で実装した言語モデルを利用して RNN による文章生成を行い、時系列データを別の時系列データに変換できる Seq2Seq と呼ばれる手法をみていきます。

    参考実装

    %sh
    rm -rf /tmp/deep-learning-from-scratch-2
    git clone https://github.com/oreilly-japan/deep-learning-from-scratch-2 /tmp/deep-learning-from-scratch-2
    Cloning into '/tmp/deep-learning-from-scratch-2'...
    

    必要なモジュールを入れる

    %sh
    pip3 install numpy matplotlib
    Requirement already satisfied: numpy in /usr/lib64/python3.6/dist-packages
    Requirement already satisfied: matplotlib in /usr/lib64/python3.6/dist-packages
    Requirement already satisfied: pytz in /usr/lib/python3.6/dist-packages (from matplotlib)
    Requirement already satisfied: six>=1.10 in /usr/lib/python3.6/dist-packages (from matplotlib)
    Requirement already satisfied: cycler>=0.10 in /usr/lib/python3.6/dist-packages (from matplotlib)
    Requirement already satisfied: kiwisolver>=1.0.1 in /usr/lib64/python3.6/dist-packages (from matplotlib)
    Requirement already satisfied: python-dateutil>=2.1 in /usr/lib/python3.6/dist-packages (from matplotlib)
    Requirement already satisfied: pyparsing!=2.0.4,!=2.1.2,!=2.1.6,>=2.0.1 in /usr/lib/python3.6/dist-packages (from matplotlib)
    Requirement already satisfied: setuptools in /usr/lib/python3.6/dist-packages (from kiwisolver>=1.0.1->matplotlib)
    You are using pip version 9.0.3, however version 19.0.1 is available.
    You should consider upgrading via the 'pip install --upgrade pip' command.
    

    7.1: 言語モデルを使った文章生成

    • 前章で実装した言語モデルを使って文章を生成する

    %python3
    import sys
    sys.path.append('/tmp/deep-learning-from-scratch-2')

    LSTMレイヤ

    %python3
    from common.functions import sigmoid
    
    
    class LSTM:
        """Single-time-step LSTM cell with a hand-written backward pass.

        ``params`` is ``[Wx, Wh, b]`` and ``grads`` holds same-shaped buffers
        that ``backward`` overwrites in place.  The affine output is split
        along its second axis into four equal-width column groups, in the
        order: forget gate, candidate (tanh), input gate, output gate.
        """

        def __init__(self, Wx, Wh, b):
            """Store the parameters and allocate zeroed gradient buffers."""
            self.params = [Wx, Wh, b]
            self.grads = [np.zeros_like(p) for p in (Wx, Wh, b)]
            # Filled by forward(); consumed by backward().
            self.cache = None

        def forward(self, x, h_prev, c_prev):
            """Advance one step and return ``(h_next, c_next)``.

            The width H of each gate is taken from ``h_prev.shape[1]``.
            """
            Wx, Wh, b = self.params
            _, H = h_prev.shape

            affine = np.dot(x, Wx) + np.dot(h_prev, Wh) + b

            # Slice the four gate pre-activations and squash each one.
            f = sigmoid(affine[:, :H])
            g = np.tanh(affine[:, H:2 * H])
            i = sigmoid(affine[:, 2 * H:3 * H])
            o = sigmoid(affine[:, 3 * H:])

            c_next = f * c_prev + g * i
            h_next = o * np.tanh(c_next)

            self.cache = (x, h_prev, c_prev, i, f, g, o, c_next)
            return h_next, c_next

        def backward(self, dh_next, dc_next):
            """Back-propagate one step and return ``(dx, dh_prev, dc_prev)``.

            Uses the values cached by the most recent ``forward`` call and
            writes the parameter gradients into ``self.grads`` in place.
            """
            Wx, Wh, _ = self.params
            x, h_prev, c_prev, i, f, g, o, c_next = self.cache

            tanh_c = np.tanh(c_next)

            # Gradient reaching the cell state: direct path plus the path
            # through h_next = o * tanh(c_next).
            dc_total = dc_next + (dh_next * o) * (1 - tanh_c ** 2)
            dc_prev = dc_total * f

            # Gradient w.r.t. each gate activation, then through its
            # nonlinearity (sigmoid' = y(1-y), tanh' = 1-y^2).
            d_forget = dc_total * c_prev
            d_forget *= f * (1 - f)
            d_cand = dc_total * i
            d_cand *= (1 - g ** 2)
            d_input = dc_total * g
            d_input *= i * (1 - i)
            d_output = dh_next * tanh_c
            d_output *= o * (1 - o)

            # Same column order as the forward slicing: f, g, i, o.
            dA = np.hstack((d_forget, d_cand, d_input, d_output))

            param_grads = (np.dot(x.T, dA), np.dot(h_prev.T, dA), dA.sum(axis=0))
            for buf, value in zip(self.grads, param_grads):
                buf[...] = value

            dx = np.dot(dA, Wx.T)
            dh_prev = np.dot(dA, Wh.T)
            return dx, dh_prev, dc_prev

    %python3
    class TimeLSTM:
        """Runs an LSTM cell over every time step of a (N, T, D) batch.

        One ``LSTM`` cell object is created per time step during ``forward``
        (all sharing the same parameter arrays) and kept in ``self.layers``
        so that ``backward`` can walk them in reverse.  With
        ``stateful=True`` the hidden and cell state carry over between
        ``forward`` calls; otherwise they are re-zeroed on every call.
        """

        def __init__(self, Wx, Wh, b, stateful=False):
            """Store the shared parameters and zeroed gradient buffers."""
            self.params = [Wx, Wh, b]
            self.grads = [np.zeros_like(p) for p in (Wx, Wh, b)]
            self.layers = None

            self.h = self.c = None
            self.dh = None
            self.stateful = stateful

        def forward(self, xs):
            """Return the hidden states for all T steps, shape (N, T, H)."""
            _, Wh, _ = self.params
            N, T, _ = xs.shape
            H = Wh.shape[0]

            self.layers = []
            hs = np.empty((N, T, H), dtype='f')

            # Start from zeros unless a carried-over state is both allowed
            # (stateful) and actually present.
            if not (self.stateful and self.h is not None):
                self.h = np.zeros((N, H), dtype='f')
            if not (self.stateful and self.c is not None):
                self.c = np.zeros((N, H), dtype='f')

            for t in range(T):
                cell = LSTM(*self.params)
                self.h, self.c = cell.forward(xs[:, t, :], self.h, self.c)
                hs[:, t, :] = self.h
                self.layers.append(cell)

            return hs

        def backward(self, dhs):
            """Back-propagate through time and return ``dxs`` of shape (N, T, D).

            Per-step parameter gradients are summed into ``self.grads``; the
            gradient flowing out of the first step's hidden state is kept in
            ``self.dh``.
            """
            Wx, _, _ = self.params
            N, T, _ = dhs.shape
            D = Wx.shape[0]

            dxs = np.empty((N, T, D), dtype='f')
            dh, dc = 0, 0
            totals = [0, 0, 0]

            for t in range(T - 1, -1, -1):
                cell = self.layers[t]
                # Upstream gradient for this step plus what flowed back
                # from step t+1.
                dx, dh, dc = cell.backward(dhs[:, t, :] + dh, dc)
                dxs[:, t, :] = dx
                for idx, step_grad in enumerate(cell.grads):
                    totals[idx] += step_grad

            for buf, total in zip(self.grads, totals):
                buf[...] = total

            self.dh = dh
            return dxs

        def set_state(self, h, c=None):
            """Overwrite the carried hidden state (and optionally cell state)."""
            self.h = h
            self.c = c

        def reset_state(self):
            """Forget the carried state so the next forward starts from zeros."""
            self.h = None
            self.c = None

    Rnnlm

    %python3
    import pickle
    from common.time_layers import TimeSoftmaxWithLoss, TimeEmbedding, TimeAffine
    
    class Rnnlm:
        """Single-layer LSTM language model: Embedding -> LSTM -> Affine.

        ``params`` / ``grads`` are flat lists that alias the arrays owned by
        the individual layers, in layer order.
        """

        def __init__(self, vocab_size=10000, wordvec_size=100, hidden_size=100):
            """Build the layers with freshly initialised float32 weights.

            Args:
                vocab_size: number of distinct word ids (V).
                wordvec_size: embedding width (D).
                hidden_size: LSTM hidden width (H).
            """
            V, D, H = vocab_size, wordvec_size, hidden_size
            rn = np.random.randn

            # Weight matrices are scaled by 1/sqrt(fan_in); biases start at 0.
            embed_W = (rn(V, D) / 100).astype('f')
            lstm_Wx = (rn(D, 4 * H) / np.sqrt(D)).astype('f')
            lstm_Wh = (rn(H, 4 * H) / np.sqrt(H)).astype('f')
            lstm_b = np.zeros(4 * H).astype('f')
            affine_W = (rn(H, V) / np.sqrt(H)).astype('f')
            affine_b = np.zeros(V).astype('f')

            self.layers = [
                TimeEmbedding(embed_W),
                TimeLSTM(lstm_Wx, lstm_Wh, lstm_b, stateful=True),
                TimeAffine(affine_W, affine_b)
            ]
            self.loss_layer = TimeSoftmaxWithLoss()
            self.lstm_layer = self.layers[1]

            self.params, self.grads = [], []
            for layer in self.layers:
                self.params += layer.params
                self.grads += layer.grads

        def predict(self, xs):
            """Run ``xs`` through every layer and return the final scores."""
            for layer in self.layers:
                xs = layer.forward(xs)
            return xs

        def forward(self, xs, ts):
            """Return the loss of the predictions for ``xs`` against ``ts``."""
            score = self.predict(xs)
            loss = self.loss_layer.forward(score, ts)
            return loss

        def backward(self, dout=1):
            """Back-propagate from the loss through all layers."""
            dout = self.loss_layer.backward(dout)
            for layer in reversed(self.layers):
                dout = layer.backward(dout)
            return dout

        def reset_state(self):
            """Drop the hidden/cell state carried by the stateful LSTM layer."""
            self.lstm_layer.reset_state()

        def save_params(self, file_name='Rnnlm.pkl'):
            """Pickle the parameter list to ``file_name``."""
            with open(file_name, 'wb') as f:
                pickle.dump(self.params, f)

        def load_params(self, file_name='Rnnlm.pkl'):
            """Load pickled parameters into the existing weight arrays.

            The values are copied *in place*.  Rebinding ``self.params`` to
            the unpickled list would leave every layer pointing at its old
            arrays, so the loaded weights would never be used.

            Raises:
                ValueError: if the file holds a different number of arrays
                    than the model has parameters.
            """
            # NOTE: pickle can execute arbitrary code while loading; only
            # open parameter files that come from a trusted source.
            with open(file_name, 'rb') as f:
                loaded = pickle.load(f)

            if len(loaded) != len(self.params):
                raise ValueError(
                    'expected %d parameter arrays, got %d'
                    % (len(self.params), len(loaded)))

            for param, value in zip(self.params, loaded):
                param[...] = value

    BetterRnnlm

    Rnnlmとの違い:

    • LSTM レイヤの多層化
    • Dropout を使用
    • 重みを共有

    %python3
    from common.time_layers import TimeEmbedding, TimeDropout, TimeAffine, TimeSoftmaxWithLoss
    from common.np import *
    from common.base_model import BaseModel
    
    
    class BetterRnnlm(BaseModel):
        """Two-layer LSTM language model with dropout and tied weights.

        Layer stack: Embedding -> Dropout -> LSTM -> Dropout -> LSTM ->
        Dropout -> Affine.  The Affine layer receives the transpose of the
        embedding matrix as its weight (weight tying).

        NOTE(review): the tied Affine weight is ``embed_W.T`` with shape
        (D, V) while the layer before it is H wide, so this presumably only
        works when ``wordvec_size == hidden_size`` -- confirm against
        TimeAffine.
        """

        def __init__(self, vocab_size=10000, wordvec_size=650, hidden_size=650, dropout_ratio=0.5):
            """Build the layers with freshly initialised float32 weights.

            Args:
                vocab_size: number of distinct word ids (V).
                wordvec_size: embedding width (D).
                hidden_size: LSTM hidden width (H).
                dropout_ratio: ratio handed to every TimeDropout layer.
            """
            V, D, H = vocab_size, wordvec_size, hidden_size
            rn = np.random.randn

            embed_W = (rn(V, D) / 100).astype('f')
            lstm_Wx1 = (rn(D, 4*H) / np.sqrt(D)).astype('f')
            lstm_Wh1 = (rn(H, 4*H) / np.sqrt(H)).astype('f')
            lstm_b1 = np.zeros(4*H).astype('f')
            # The second LSTM consumes the first LSTM's H-wide hidden states,
            # so its input weight is (H, 4H) scaled by sqrt(H), not D.
            lstm_Wx2 = (rn(H, 4*H) / np.sqrt(H)).astype('f')
            lstm_Wh2 = (rn(H, 4*H) / np.sqrt(H)).astype('f')
            lstm_b2 = np.zeros(4*H).astype('f')
            affine_b = np.zeros(V).astype('f')

            self.layers = [
                TimeEmbedding(embed_W),
                TimeDropout(dropout_ratio),
                TimeLSTM(lstm_Wx1, lstm_Wh1, lstm_b1, stateful=True),
                TimeDropout(dropout_ratio),
                TimeLSTM(lstm_Wx2, lstm_Wh2, lstm_b2, stateful=True),
                TimeDropout(dropout_ratio),
                # Weight tying: reuse the embedding matrix (transposed).
                TimeAffine(embed_W.T, affine_b)
            ]
            self.loss_layer = TimeSoftmaxWithLoss()
            self.lstm_layers = [self.layers[2], self.layers[4]]
            self.drop_layers = [self.layers[1], self.layers[3], self.layers[5]]

            self.params, self.grads = [], []
            for layer in self.layers:
                self.params += layer.params
                self.grads += layer.grads

        def predict(self, xs, train_flg=False):
            """Return the scores for ``xs``.

            ``train_flg`` is written to every dropout layer before the
            forward pass.
            """
            for layer in self.drop_layers:
                layer.train_flg = train_flg
            for layer in self.layers:
                xs = layer.forward(xs)
            return xs

        def forward(self, xs, ts, train_flg=True):
            """Return the loss of the predictions for ``xs`` against ``ts``."""
            score = self.predict(xs, train_flg)
            loss = self.loss_layer.forward(score, ts)
            return loss

        def backward(self, dout=1):
            """Back-propagate from the loss through all layers."""
            dout = self.loss_layer.backward(dout)
            for layer in reversed(self.layers):
                dout = layer.backward(dout)
            return dout

        def reset_state(self):
            """Drop the state carried by both stateful LSTM layers."""
            for layer in self.lstm_layers:
                layer.reset_state()

    文章生成

    • np.random.choice は、引数 p に渡した確率分布に従って候補の中からランダムに要素をサンプリングする関数

    %python3
    import numpy as np
    from common.functions import softmax
    
    
    class RnnlmGen(Rnnlm):
        """Rnnlm extended with sampling-based text generation."""

        def generate(self, start_id, skip_ids=None, sample_size=100):
            """Sample a sequence of word ids starting from ``start_id``.

            Each step feeds the current word to ``predict``, turns the scores
            into a probability distribution with softmax and draws the next
            word from it.  A drawn id found in ``skip_ids`` is discarded and
            the current word is fed again.

            Args:
                start_id: id of the first word; always the first output.
                skip_ids: optional collection of ids that must not be emitted.
                sample_size: total number of ids to return (incl. start_id).

            Returns:
                A list of word ids; the sampled entries are plain ``int``.
            """
            word_ids = [start_id]

            x = start_id
            while len(word_ids) < sample_size:
                x = np.array(x).reshape(1, 1)
                score = self.predict(x)
                p = softmax(score.flatten())

                # Unwrap the 1-element result to a Python int right away:
                # calling int() on a shape-(1,) array is deprecated since
                # NumPy 1.25, and a scalar makes the membership test exact.
                sampled = int(np.random.choice(len(p), size=1, p=p)[0])

                if (skip_ids is None) or (sampled not in skip_ids):
                    x = sampled
                    word_ids.append(sampled)

            return word_ids

    %python3
    from dataset import ptb
    
    
    # Load the PTB training split together with its vocabulary mappings.
    corpus, word_to_id, id_to_word = ptb.load_data('train')
    vocab_size, corpus_size = len(word_to_id), len(corpus)

    # Default-constructed model: weights are still the random initial values.
    model = RnnlmGen()

    # Seed word, plus tokens that must never appear in the generated text.
    start_word = 'you'
    skip_words = ['N', '<unk>', '$']
    start_id = word_to_id[start_word]
    skip_ids = list(map(word_to_id.__getitem__, skip_words))

    # Sample ids, map them back to words and turn <eos> into sentence breaks.
    word_ids = model.generate(start_id, skip_ids)
    txt = ' '.join(id_to_word[i] for i in word_ids)
    txt = txt.replace(' <eos>', '.\n')

    print(txt)
    you march instrument quickly dual overly t. resident shirts benign attendants takeover-stock accountability guard deregulation recoup mean corners impressed operation negotiated incorrectly reservations ssangyong mandate discretion alliances touched authority cathcart know-how treaty disagreement falls dai-ichi polyethylene multiple diplomat goldsmith airplanes murdoch durkin cool naturally truce setbacks small heating rico crazy scarce confronted circumstances leslie force said prudential arms cholesterol happening surfaced parties security tendered week declaring earthquake intimate visible backing rank seismic hair divisive know-how prints yard whooping worry hills accepting mich. banning free-market 500-stock charity earlier integrity place leads mediator benjamin indicated alleviate kick concert desire conditions budgetary chose
    

    • 上の結果はトレーニングしてないモデルの出力なのですごい適当

    %python3
    # Load the chapter-6 Rnnlm weights shipped in the cloned reference repository.
    model.load_params('/tmp/deep-learning-from-scratch-2/ch06/Rnnlm.pkl')

    %python3
    # Same seed word and skip list as the first generation run.
    start_word = 'you'
    skip_words = ['N', '<unk>', '$']
    start_id = word_to_id[start_word]
    skip_ids = list(map(word_to_id.__getitem__, skip_words))

    # Sample ids, map them back to words and turn <eos> into sentence breaks.
    word_ids = model.generate(start_id, skip_ids)
    txt = ' '.join(id_to_word[i] for i in word_ids)
    txt = txt.replace(' <eos>', '.\n')

    print(txt)
    you serve illinois wiped reportedly mid-1980s substantially f emhart expression macmillan than automatic derivative appetite rhone-poulenc spots following wisconsin 1960s computer-driven ec application massage 's harmful worse announced deliberately mission wars institutional ehrlich chosen exception hut arbitragers stepping compete chris supplying carbide initiatives went feelings overhead customers today accelerated unfilled system iras drinks low licenses judge doctor rural widen look cms hotels products towns hearst grown dominion placing tell privately workstation responsibility formation unpublished cloud makers stick allies jay conspiring co. nih gen-probe confronted bankers unfriendly become magnified calgary enserch apt abortion beauty reruns deal constitution itself rebounded fare jeff
    

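    • トレーニング済みの重みを入れてもそんなに変わらず（この出力を得たときの load_params は self.params を読み込んだリストに差し替えるだけで、各レイヤが参照している重み配列には反映されていなかったため、実質的にはランダム初期値のままのモデルで生成している）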

    %sh
    curl https://www.oreilly.co.jp/pub/9784873118369/BetterRnnlm.pkl > /tmp/BetterRnnlm.pkl
      % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                     Dload  Upload   Total   Spent    Left  Speed
    
      0     0    0     0    0     0      0      0 --:--:-- --:--:-- --:--:--     0
      0     0    0     0    0     0      0      0 --:--:-- --:--:-- --:--:--     0
      0 37.7M    0  236k    0     0   159k      0  0:04:02  0:00:01  0:04:01  159k
      6 37.7M    6 2460k    0     0   985k      0  0:00:39  0:00:02  0:00:37  984k
     13 37.7M   13 5164k    0     0  1474k      0  0:00:26  0:00:03  0:00:23 1474k
     18 37.7M   18 7180k    0     0  1593k      0  0:00:24  0:00:04  0:00:20 1593k
     24 37.7M   24 9372k    0     0  1700k      0  0:00:22  0:00:05  0:00:17 1860k
     30 37.7M   30 11.3M    0     0  1789k      0  0:00:21  0:00:06  0:00:15 2271k
     36 37.7M   36 13.6M    0     0  1860k      0  0:00:20  0:00:07  0:00:13 2296k
     42 37.7M   42 15.9M    0     0  1913k      0  0:00:20  0:00:08  0:00:12 2219k
     48 37.7M   48 18.1M    0     0  1958k      0  0:00:19  0:00:09  0:00:10 2288k
     54 37.7M   54 20.4M    0     0  1993k      0  0:00:19  0:00:10  0:00:09 2317k
     60 37.7M   60 22.7M    0     0  2026k      0  0:00:19  0:00:11  0:00:08 2336k
     66 37.7M   66 25.1M    0     0  2062k      0  0:00:18  0:00:12  0:00:06 2366k
     73 37.7M   73 27.7M    0     0  2110k      0  0:00:18  0:00:13  0:00:05 2450k
     81 37.7M   81 30.6M    0     0  2165k      0  0:00:17  0:00:14  0:00:03 2559k
     88 37.7M   88 33.3M    0     0  2207k      0  0:00:17  0:00:15  0:00:02 2659k
     95 37.7M   95 35.9M    0     0  2233k      0  0:00:17  0:00:16  0:00:01 2710k
    100 37.7M  100 37.7M    0     0  2240k      0  0:00:17  0:00:17 --:--:-- 2710k
    

    %python3
    import numpy as np
    from common.functions import softmax
    
    
    class BetterRnnlmGen(BetterRnnlm):
        """BetterRnnlm extended with sampling-based text generation."""

        def generate(self, start_id, skip_ids=None, sample_size=100):
            """Sample a sequence of word ids starting from ``start_id``.

            Each step feeds the current word to ``predict``, turns the scores
            into a probability distribution with softmax and draws the next
            word from it.  A drawn id found in ``skip_ids`` is discarded and
            the current word is fed again.

            Args:
                start_id: id of the first word; always the first output.
                skip_ids: optional collection of ids that must not be emitted.
                sample_size: total number of ids to return (incl. start_id).

            Returns:
                A list of word ids; the sampled entries are plain ``int``.
            """
            word_ids = [start_id]

            x = start_id
            while len(word_ids) < sample_size:
                x = np.array(x).reshape(1, 1)
                score = self.predict(x)
                p = softmax(score.flatten())

                # Unwrap the 1-element result to a Python int right away:
                # calling int() on a shape-(1,) array is deprecated since
                # NumPy 1.25, and a scalar makes the membership test exact.
                sampled = int(np.random.choice(len(p), size=1, p=p)[0])

                if (skip_ids is None) or (sampled not in skip_ids):
                    x = sampled
                    word_ids.append(sampled)

            return word_ids

    %python3
    # Build the larger model and load the weights downloaded to /tmp.
    model = BetterRnnlmGen()
    model.load_params('/tmp/BetterRnnlm.pkl')

    # Same seed word and skip list as the earlier generation runs.
    start_word = 'you'
    skip_words = ['N', '<unk>', '$']
    start_id = word_to_id[start_word]
    skip_ids = list(map(word_to_id.__getitem__, skip_words))

    # Sample ids, map them back to words and turn <eos> into sentence breaks.
    word_ids = model.generate(start_id, skip_ids)
    txt = ' '.join(id_to_word[i] for i in word_ids)
    txt = txt.replace(' <eos>', '.\n')

    print(txt)
    you want to raise their eggs.
     the gradual results exceeding the market have lent an unprecedented number of plants held during the past five years at the top rate of economists and energy casualty sales april.
     the bank also said output of unfilled orders have hit a surge in expenses of business as an increase of sales by the navy 's and prepared assets.
     eight business projects went across the area ahead the new package cited texas continental corp. 's chairman frederick a. robinson and nl.
     these days drexel 's clients are a brand attitude to
    

    • BetterRnnlmの方はだいぶ英語っぽい感じになっている

    7.2: seq2seq

    足し算の式（例: "16+75"）を文字の列とみなし、その計算結果を表す文字列（例: "_91"）へ変換するタスクを seq2seq に学習させる

    使用するデータセット

    %sh
    cat /tmp/deep-learning-from-scratch-2/dataset/addition.txt | head
    16+75  _91  
    52+607 _659 
    75+22  _97  
    63+22  _85  
    795+3  _798 
    706+796_1502
    8+4    _12  
    84+317 _401 
    9+3    _12  
    6+2    _8   
    

    %python3
    from dataset import sequence
    
    # Load the addition dataset with a fixed seed, then fetch the character
    # vocabulary mappings.
    (x_train, t_train), (x_test, t_test) = sequence.load_data(
        'addition.txt', seed=1984)
    char_to_id, id_to_char = sequence.get_vocab()

    # Array shapes of the train / test splits.
    print(x_train.shape, t_train.shape)
    print(x_test.shape, t_test.shape)

    # First training sample as raw character ids ...
    print(x_train[0])
    print(t_train[0])

    # ... and decoded back into characters.
    print(''.join(id_to_char[c] for c in x_train[0]))
    print(''.join(id_to_char[c] for c in t_train[0]))
    (45000, 7) (45000, 5)
    (5000, 7) (5000, 5)
    [ 3  0  2  0  0 11  5]
    [ 6  0 11  7  5]
    71+118 
    _189 
    

    Encoderの実装

    %python3
    class Encoder:
        """Embedding + LSTM that condenses an input sequence into one vector.

        ``forward`` returns only the hidden state of the last time step;
        ``backward`` accepts the gradient for that single state.
        """

        def __init__(self, vocab_size, wordvec_size, hidden_size):
            """Create the embedding and a non-stateful LSTM with float32 weights."""
            V, D, H = vocab_size, wordvec_size, hidden_size
            randn = np.random.randn

            # Draw order (embedding, Wx, Wh) is kept so that seeded runs
            # produce the same initial weights.
            embed_W = (randn(V, D) / 100).astype('f')
            lstm_Wx = (randn(D, 4 * H) / np.sqrt(D)).astype('f')
            lstm_Wh = (randn(H, 4 * H) / np.sqrt(H)).astype('f')
            lstm_b = np.zeros(4 * H).astype('f')

            self.embed = TimeEmbedding(embed_W)
            # stateful=False: every forward call starts from a zero state.
            self.lstm = TimeLSTM(lstm_Wx, lstm_Wh, lstm_b, stateful=False)

            self.params = self.embed.params + self.lstm.params
            self.grads = self.embed.grads + self.lstm.grads
            self.hs = None

        def forward(self, xs):
            """Encode ``xs`` and return the final-step hidden state."""
            embedded = self.embed.forward(xs)
            self.hs = self.lstm.forward(embedded)
            return self.hs[:, -1, :]

        def backward(self, dh):
            """Back-propagate ``dh``, the gradient of the final hidden state."""
            # Only the last time step was exposed by forward(), so every
            # other step receives a zero upstream gradient.
            dhs = np.zeros_like(self.hs)
            dhs[:, -1, :] = dh

            return self.embed.backward(self.lstm.backward(dhs))

    %python3
    class Decoder:
        """Embedding + LSTM + Affine that emits a sequence from a start state.

        The LSTM is stateful and is seeded with the hidden state passed to
        ``forward`` / ``generate``.  Because ``set_state`` is called without
        a cell state, the cell state restarts from zeros each time.
        """

        def __init__(self, vocab_size, wordvec_size, hidden_size):
            """Create the three layers with freshly initialised float32 weights."""
            V, D, H = vocab_size, wordvec_size, hidden_size
            rn = np.random.randn

            embed_W = (rn(V, D) / 100).astype('f')
            lstm_Wx = (rn(D, 4 * H) / np.sqrt(D)).astype('f')
            # Wh maps the H-wide hidden state, so scale by sqrt(H) (its
            # fan-in), matching Encoder and Rnnlm.
            lstm_Wh = (rn(H, 4 * H) / np.sqrt(H)).astype('f')
            lstm_b = np.zeros(4 * H).astype('f')
            affine_W = (rn(H, V) / np.sqrt(H)).astype('f')
            affine_b = np.zeros(V).astype('f')

            self.embed = TimeEmbedding(embed_W)
            self.lstm = TimeLSTM(lstm_Wx, lstm_Wh, lstm_b, stateful=True)
            self.affine = TimeAffine(affine_W, affine_b)

            self.params, self.grads = [], []

            for layer in (self.embed, self.lstm, self.affine):
                self.params += layer.params
                self.grads += layer.grads

        def forward(self, xs, h):
            """Return the scores for ``xs`` with the LSTM started from ``h``."""
            self.lstm.set_state(h)

            out = self.embed.forward(xs)
            out = self.lstm.forward(out)
            score = self.affine.forward(out)
            return score

        def backward(self, dscore):
            """Back-propagate ``dscore``.

            Returns the gradient with respect to the initial hidden state
            ``h``, read from the LSTM layer's ``dh`` attribute.
            """
            dout = self.affine.backward(dscore)
            dout = self.lstm.backward(dout)
            dout = self.embed.backward(dout)
            dh = self.lstm.dh
            return dh

        def generate(self, h, start_id, sample_size):
            """Greedily decode ``sample_size`` ids starting from ``start_id``.

            Each step feeds the previous id (batch size 1, one time step)
            and picks the arg-max of the scores as the next id.
            """
            sampled = []
            sample_id = start_id
            self.lstm.set_state(h)

            for _ in range(sample_size):
                x = np.array(sample_id).reshape((1, 1))
                out = self.embed.forward(x)
                out = self.lstm.forward(out)
                score = self.affine.forward(out)

                sample_id = np.argmax(score.flatten())
                sampled.append(int(sample_id))

            return sampled

    %python3
    from common.base_model import BaseModel
    
    class Seq2Seq(BaseModel):
        """Encoder-decoder model trained with a time-wise softmax loss."""

        def __init__(self, vocab_size, wordvec_size, hidden_size):
            """Build an Encoder and a Decoder with identical dimensions."""
            dims = (vocab_size, wordvec_size, hidden_size)
            self.encoder = Encoder(*dims)
            self.decoder = Decoder(*dims)
            self.softmax = TimeSoftmaxWithLoss()

            self.params = self.encoder.params + self.decoder.params
            self.grads = self.encoder.grads + self.decoder.grads

        def forward(self, xs, ts):
            """Return the loss for input batch ``xs`` and target batch ``ts``."""
            # The decoder sees the target without its last column and is
            # trained to predict the target shifted left by one.
            decoder_xs = ts[:, :-1]
            decoder_ts = ts[:, 1:]

            h = self.encoder.forward(xs)
            score = self.decoder.forward(decoder_xs, h)
            return self.softmax.forward(score, decoder_ts)

        def backward(self, dout=1):
            """Back-propagate through loss, decoder and encoder in turn."""
            dscore = self.softmax.backward(dout)
            dh = self.decoder.backward(dscore)
            return self.encoder.backward(dh)

        def generate(self, xs, start_id, sample_size):
            """Encode ``xs`` and let the decoder generate ``sample_size`` ids."""
            h = self.encoder.forward(xs)
            return self.decoder.generate(h, start_id, sample_size)

    Seq2Seqモデルのトレーニング

    %python3
    import numpy as np
    import matplotlib.pyplot as plt
    from dataset import sequence
    from common.optimizer import Adam
    from common.trainer import Trainer
    from common.util import eval_seq2seq
    
    # Dataset and character vocabulary.
    (x_train, t_train), (x_test, t_test) = sequence.load_data('addition.txt')
    char_to_id, id_to_char = sequence.get_vocab()

    # Hyperparameters.
    vocab_size = len(char_to_id)
    wordvec_size, hidden_size = 16, 128
    batch_size, max_epoch, max_grad = 128, 25, 5.0

    model = Seq2Seq(vocab_size, wordvec_size, hidden_size)
    optimizer = Adam()
    trainer = Trainer(model, optimizer)

    acc_list = []
    for epoch in range(max_epoch):
        # Train one epoch per call so accuracy can be measured in between.
        trainer.fit(
            x_train, t_train,
            max_epoch=1, batch_size=batch_size, max_grad=max_grad,
        )

        correct_num = 0
        for i in range(len(x_test)):
            # Indexing with [[i]] keeps the leading batch axis.
            question, correct = x_test[[i]], t_test[[i]]
            # Only the first ten test samples are passed verbose=True.
            verbose = i < 10
            correct_num += eval_seq2seq(
                model, question, correct, id_to_char, verbose)

        acc = float(correct_num) / len(x_test)
        acc_list.append(acc)
        print('val acc %.3f%%' % (acc * 100))
    | epoch 1 |  iter 1 / 351 | time 0[s] | loss 2.56
    | epoch 1 |  iter 21 / 351 | time 1[s] | loss 2.44
    | epoch 1 |  iter 41 / 351 | time 2[s] | loss 2.07
    | epoch 1 |  iter 61 / 351 | time 3[s] | loss 1.93
    | epoch 1 |  iter 81 / 351 | time 4[s] | loss 1.88
    | epoch 1 |  iter 101 / 351 | time 5[s] | loss 1.82
    | epoch 1 |  iter 121 / 351 | time 6[s] | loss 1.80
    | epoch 1 |  iter 141 / 351 | time 8[s] | loss 1.78
    | epoch 1 |  iter 161 / 351 | time 9[s] | loss 1.77
    | epoch 1 |  iter 181 / 351 | time 10[s] | loss 1.76
    | epoch 1 |  iter 201 / 351 | time 11[s] | loss 1.76
    | epoch 1 |  iter 221 / 351 | time 12[s] | loss 1.75
    | epoch 1 |  iter 241 / 351 | time 13[s] | loss 1.74
    | epoch 1 |  iter 261 / 351 | time 14[s] | loss 1.74
    | epoch 1 |  iter 281 / 351 | time 16[s] | loss 1.73
    | epoch 1 |  iter 301 / 351 | time 17[s] | loss 1.72
    | epoch 1 |  iter 321 / 351 | time 18[s] | loss 1.72
    | epoch 1 |  iter 341 / 351 | time 19[s] | loss 1.72
    Q 77+85  
    T 162 
    ☒ 100 
    ---
    Q 975+164
    T 1139
    ☒ 1000
    ---
    Q 582+84 
    T 666 
    ☒ 100 
    ---
    Q 8+155  
    T 163 
    ☒ 100 
    ---
    Q 367+55 
    T 422 
    ☒ 100 
    ---
    Q 600+257
    T 857 
    ☒ 1000
    ---
    Q 761+292
    T 1053
    ☒ 1000
    ---
    Q 830+597
    T 1427
    ☒ 1000
    ---
    Q 26+838 
    T 864 
    ☒ 100 
    ---
    Q 143+93 
    T 236 
    ☒ 200 
    ---
    val acc 0.220%
    | epoch 2 |  iter 1 / 351 | time 0[s] | loss 1.72
    | epoch 2 |  iter 21 / 351 | time 1[s] | loss 1.71
    | epoch 2 |  iter 41 / 351 | time 2[s] | loss 1.72
    | epoch 2 |  iter 61 / 351 | time 3[s] | loss 1.70
    | epoch 2 |  iter 81 / 351 | time 4[s] | loss 1.69
    | epoch 2 |  iter 101 / 351 | time 5[s] | loss 1.70
    | epoch 2 |  iter 121 / 351 | time 7[s] | loss 1.69
    | epoch 2 |  iter 141 / 351 | time 8[s] | loss 1.69
    | epoch 2 |  iter 161 / 351 | time 9[s] | loss 1.68
    | epoch 2 |  iter 181 / 351 | time 10[s] | loss 1.68
    | epoch 2 |  iter 201 / 351 | time 11[s] | loss 1.68
    | epoch 2 |  iter 221 / 351 | time 12[s] | loss 1.68
    | epoch 2 |  iter 241 / 351 | time 13[s] | loss 1.66
    | epoch 2 |  iter 261 / 351 | time 15[s] | loss 1.65
    | epoch 2 |  iter 281 / 351 | time 16[s] | loss 1.64
    | epoch 2 |  iter 301 / 351 | time 17[s] | loss 1.63
    | epoch 2 |  iter 321 / 351 | time 18[s] | loss 1.61
    | epoch 2 |  iter 341 / 351 | time 19[s] | loss 1.59
    Q 77+85  
    T 162 
    ☒ 100 
    ---
    Q 975+164
    T 1139
    ☒ 1000
    ---
    Q 582+84 
    T 666 
    ☒ 700 
    ---
    Q 8+155  
    T 163 
    ☒ 100 
    ---
    Q 367+55 
    T 422 
    ☒ 400 
    ---
    Q 600+257
    T 857 
    ☒ 800 
    ---
    Q 761+292
    T 1053
    ☒ 1000
    ---
    Q 830+597
    T 1427
    ☒ 1207
    ---
    Q 26+838 
    T 864 
    ☒ 700 
    ---
    Q 143+93 
    T 236 
    ☒ 400 
    ---
    val acc 0.220%
    | epoch 3 |  iter 1 / 351 | time 0[s] | loss 1.57
    | epoch 3 |  iter 21 / 351 | time 1[s] | loss 1.57
    | epoch 3 |  iter 41 / 351 | time 2[s] | loss 1.55
    | epoch 3 |  iter 61 / 351 | time 3[s] | loss 1.54
    | epoch 3 |  iter 81 / 351 | time 4[s] | loss 1.52
    | epoch 3 |  iter 101 / 351 | time 5[s] | loss 1.51
    | epoch 3 |  iter 121 / 351 | time 7[s] | loss 1.49
    | epoch 3 |  iter 141 / 351 | time 8[s] | loss 1.48
    | epoch 3 |  iter 161 / 351 | time 9[s] | loss 1.46
    | epoch 3 |  iter 181 / 351 | time 10[s] | loss 1.44
    | epoch 3 |  iter 201 / 351 | time 11[s] | loss 1.43
    | epoch 3 |  iter 221 / 351 | time 12[s] | loss 1.41
    | epoch 3 |  iter 241 / 351 | time 14[s] | loss 1.39
    | epoch 3 |  iter 261 / 351 | time 15[s] | loss 1.39
    | epoch 3 |  iter 281 / 351 | time 16[s] | loss 1.37
    | epoch 3 |  iter 301 / 351 | time 17[s] | loss 1.37
    | epoch 3 |  iter 321 / 351 | time 18[s] | loss 1.35
    | epoch 3 |  iter 341 / 351 | time 20[s] | loss 1.34
    Q 77+85  
    T 162 
    ☒ 136 
    ---
    Q 975+164
    T 1139
    ☒ 1169
    ---
    Q 582+84 
    T 666 
    ☒ 668 
    ---
    Q 8+155  
    T 163 
    ☒ 128 
    ---
    Q 367+55 
    T 422 
    ☒ 446 
    ---
    Q 600+257
    T 857 
    ☒ 839 
    ---
    Q 761+292
    T 1053
    ☒ 1009
    ---
    Q 830+597
    T 1427
    ☒ 1468
    ---
    Q 26+838 
    T 864 
    ☒ 808 
    ---
    Q 143+93 
    T 236 
    ☒ 228 
    ---
    val acc 0.940%
    | epoch 4 |  iter 1 / 351 | time 0[s] | loss 1.34
    | epoch 4 |  iter 21 / 351 | time 1[s] | loss 1.33
    | epoch 4 |  iter 41 / 351 | time 2[s] | loss 1.32
    | epoch 4 |  iter 61 / 351 | time 3[s] | loss 1.30
    | epoch 4 |  iter 81 / 351 | time 4[s] | loss 1.30
    | epoch 4 |  iter 101 / 351 | time 5[s] | loss 1.29
    | epoch 4 |  iter 121 / 351 | time 7[s] | loss 1.28
    | epoch 4 |  iter 141 / 351 | time 8[s] | loss 1.27
    | epoch 4 |  iter 161 / 351 | time 9[s] | loss 1.26
    | epoch 4 |  iter 181 / 351 | time 10[s] | loss 1.25
    | epoch 4 |  iter 201 / 351 | time 11[s] | loss 1.25
    | epoch 4 |  iter 221 / 351 | time 13[s] | loss 1.25
    | epoch 4 |  iter 241 / 351 | time 14[s] | loss 1.23
    | epoch 4 |  iter 261 / 351 | time 15[s] | loss 1.22
    | epoch 4 |  iter 281 / 351 | time 16[s] | loss 1.22
    | epoch 4 |  iter 301 / 351 | time 17[s] | loss 1.21
    | epoch 4 |  iter 321 / 351 | time 18[s] | loss 1.20
    | epoch 4 |  iter 341 / 351 | time 20[s] | loss 1.20
    Q 77+85  
    T 162 
    ☒ 156 
    ---
    Q 975+164
    T 1139
    ☒ 1222
    ---
    Q 582+84 
    T 666 
    ☑ 666 
    ---
    Q 8+155  
    T 163 
    ☒ 199 
    ---
    Q 367+55 
    T 422 
    ☒ 402 
    ---
    Q 600+257
    T 857 
    ☒ 902 
    ---
    Q 761+292
    T 1053
    ☒ 1006
    ---
    Q 830+597
    T 1427
    ☒ 1525
    ---
    Q 26+838 
    T 864 
    ☒ 826 
    ---
    Q 143+93 
    T 236 
    ☒ 205 
    ---
    val acc 2.400%
    | epoch 5 |  iter 1 / 351 | time 0[s] | loss 1.19
    | epoch 5 |  iter 21 / 351 | time 1[s] | loss 1.18
    | epoch 5 |  iter 41 / 351 | time 2[s] | loss 1.17
    | epoch 5 |  iter 61 / 351 | time 3[s] | loss 1.16
    | epoch 5 |  iter 81 / 351 | time 4[s] | loss 1.17
    | epoch 5 |  iter 101 / 351 | time 5[s] | loss 1.15
    | epoch 5 |  iter 121 / 351 | time 7[s] | loss 1.15
    | epoch 5 |  iter 141 / 351 | time 8[s] | loss 1.15
    | epoch 5 |  iter 161 / 351 | time 9[s] | loss 1.13
    | epoch 5 |  iter 181 / 351 | time 10[s] | loss 1.13
    | epoch 5 |  iter 201 / 351 | time 11[s] | loss 1.13
    | epoch 5 |  iter 221 / 351 | time 12[s] | loss 1.12
    | epoch 5 |  iter 241 / 351 | time 14[s] | loss 1.12
    | epoch 5 |  iter 261 / 351 | time 15[s] | loss 1.11
    | epoch 5 |  iter 281 / 351 | time 16[s] | loss 1.11
    | epoch 5 |  iter 301 / 351 | time 17[s] | loss 1.10
    | epoch 5 |  iter 321 / 351 | time 18[s] | loss 1.09
    | epoch 5 |  iter 341 / 351 | time 20[s] | loss 1.09
    Q 77+85  
    T 162 
    ☒ 155 
    ---
    Q 975+164
    T 1139
    ☒ 1165
    ---
    Q 582+84 
    T 666 
    ☒ 645 
    ---
    Q 8+155  
    T 163 
    ☒ 160 
    ---
    Q 367+55 
    T 422 
    ☒ 421 
    ---
    Q 600+257
    T 857 
    ☒ 882 
    ---
    Q 761+292
    T 1053
    ☒ 1015
    ---
    Q 830+597
    T 1427
    ☒ 1444
    ---
    Q 26+838 
    T 864 
    ☒ 846 
    ---
    Q 143+93 
    T 236 
    ☒ 221 
    ---
    val acc 4.360%
    | epoch 6 |  iter 1 / 351 | time 0[s] | loss 1.06
    | epoch 6 |  iter 21 / 351 | time 1[s] | loss 1.07
    | epoch 6 |  iter 41 / 351 | time 2[s] | loss 1.07
    | epoch 6 |  iter 61 / 351 | time 3[s] | loss 1.08
    | epoch 6 |  iter 81 / 351 | time 4[s] | loss 1.08
    | epoch 6 |  iter 101 / 351 | time 5[s] | loss 1.08
    | epoch 6 |  iter 121 / 351 | time 7[s] | loss 1.06
    | epoch 6 |  iter 141 / 351 | time 8[s] | loss 1.05
    | epoch 6 |  iter 161 / 351 | time 9[s] | loss 1.05
    | epoch 6 |  iter 181 / 351 | time 10[s] | loss 1.06
    | epoch 6 |  iter 201 / 351 | time 11[s] | loss 1.05
    | epoch 6 |  iter 221 / 351 | time 12[s] | loss 1.05
    | epoch 6 |  iter 241 / 351 | time 14[s] | loss 1.04
    | epoch 6 |  iter 261 / 351 | time 15[s] | loss 1.04
    | epoch 6 |  iter 281 / 351 | time 16[s] | loss 1.04
    | epoch 6 |  iter 301 / 351 | time 17[s] | loss 1.03
    | epoch 6 |  iter 321 / 351 | time 18[s] | loss 1.04
    | epoch 6 |  iter 341 / 351 | time 20[s] | loss 1.01
    Q 77+85  
    T 162 
    ☒ 161 
    ---
    Q 975+164
    T 1139
    ☒ 1119
    ---
    Q 582+84 
    T 666 
    ☑ 666 
    ---
    Q 8+155  
    T 163 
    ☒ 166 
    ---
    Q 367+55 
    T 422 
    ☒ 410 
    ---
    Q 600+257
    T 857 
    ☑ 857 
    ---
    Q 761+292
    T 1053
    ☒ 1009
    ---
    Q 830+597
    T 1427
    ☒ 1412
    ---
    Q 26+838 
    T 864 
    ☒ 867 
    ---
    Q 143+93 
    T 236 
    ☒ 246 
    ---
    val acc 4.400%
    | epoch 7 |  iter 1 / 351 | time 0[s] | loss 1.02
    | epoch 7 |  iter 21 / 351 | time 1[s] | loss 1.01
    | epoch 7 |  iter 41 / 351 | time 2[s] | loss 1.00
    | epoch 7 |  iter 61 / 351 | time 3[s] | loss 1.00
    | epoch 7 |  iter 81 / 351 | time 4[s] | loss 1.00
    | epoch 7 |  iter 101 / 351 | time 5[s] | loss 1.00
    | epoch 7 |  iter 121 / 351 | time 7[s] | loss 1.00
    | epoch 7 |  iter 141 / 351 | time 8[s] | loss 0.98
    | epoch 7 |  iter 161 / 351 | time 9[s] | loss 1.00
    | epoch 7 |  iter 181 / 351 | time 10[s] | loss 0.99
    | epoch 7 |  iter 201 / 351 | time 11[s] | loss 0.98
    | epoch 7 |  iter 221 / 351 | time 13[s] | loss 1.00
    | epoch 7 |  iter 241 / 351 | time 14[s] | loss 1.02
    | epoch 7 |  iter 261 / 351 | time 15[s] | loss 1.00
    | epoch 7 |  iter 281 / 351 | time 16[s] | loss 0.97
    | epoch 7 |  iter 301 / 351 | time 17[s] | loss 0.97
    | epoch 7 |  iter 321 / 351 | time 18[s] | loss 0.96
    | epoch 7 |  iter 341 / 351 | time 20[s] | loss 0.96
    Q 77+85  
    T 162 
    ☒ 161 
    ---
    Q 975+164
    T 1139
    ☒ 1175
    ---
    Q 582+84 
    T 666 
    ☒ 667 
    ---
    Q 8+155  
    T 163 
    ☑ 163 
    ---
    Q 367+55 
    T 422 
    ☒ 430 
    ---
    Q 600+257
    T 857 
    ☒ 886 
    ---
    Q 761+292
    T 1053
    ☒ 1076
    ---
    Q 830+597
    T 1427
    ☒ 1444
    ---
    Q 26+838 
    T 864 
    ☒ 865 
    ---
    Q 143+93 
    T 236 
    ☒ 238 
    ---
    val acc 5.100%
    | epoch 8 |  iter 1 / 351 | time 0[s] | loss 1.01
    | epoch 8 |  iter 21 / 351 | time 1[s] | loss 0.95
    | epoch 8 |  iter 41 / 351 | time 2[s] | loss 0.96
    | epoch 8 |  iter 61 / 351 | time 3[s] | loss 0.95
    | epoch 8 |  iter 81 / 351 | time 4[s] | loss 0.95
    | epoch 8 |  iter 101 / 351 | time 5[s] | loss 0.96
    | epoch 8 |  iter 121 / 351 | time 7[s] | loss 0.95
    | epoch 8 |  iter 141 / 351 | time 8[s] | loss 0.95
    | epoch 8 |  iter 161 / 351 | time 9[s] | loss 0.95
    | epoch 8 |  iter 181 / 351 | time 10[s] | loss 0.94
    | epoch 8 |  iter 201 / 351 | time 11[s] | loss 0.93
    | epoch 8 |  iter 221 / 351 | time 13[s] | loss 0.93
    | epoch 8 |  iter 241 / 351 | time 14[s] | loss 0.93
    | epoch 8 |  iter 261 / 351 | time 15[s] | loss 0.95
    | epoch 8 |  iter 281 / 351 | time 16[s] | loss 0.94
    | epoch 8 |  iter 301 / 351 | time 18[s] | loss 0.92
    | epoch 8 |  iter 321 / 351 | time 19[s] | loss 0.92
    | epoch 8 |  iter 341 / 351 | time 20[s] | loss 0.92
    Q 77+85  
    T 162 
    ☒ 160 
    ---
    Q 975+164
    T 1139
    ☒ 1130
    ---
    Q 582+84 
    T 666 
    ☒ 668 
    ---
    Q 8+155  
    T 163 
    ☒ 158 
    ---
    Q 367+55 
    T 422 
    ☒ 420 
    ---
    Q 600+257
    T 857 
    ☒ 858 
    ---
    Q 761+292
    T 1053
    ☒ 1009
    ---
    Q 830+597
    T 1427
    ☒ 1431
    ---
    Q 26+838 
    T 864 
    ☒ 865 
    ---
    Q 143+93 
    T 236 
    ☒ 232 
    ---
    val acc 5.440%
    | epoch 9 |  iter 1 / 351 | time 0[s] | loss 0.92
    | epoch 9 |  iter 21 / 351 | time 1[s] | loss 0.91
    | epoch 9 |  iter 41 / 351 | time 2[s] | loss 0.90
    | epoch 9 |  iter 61 / 351 | time 3[s] | loss 0.90
    | epoch 9 |  iter 81 / 351 | time 4[s] | loss 0.89
    | epoch 9 |  iter 101 / 351 | time 5[s] | loss 0.91
    | epoch 9 |  iter 121 / 351 | time 7[s] | loss 0.90
    | epoch 9 |  iter 141 / 351 | time 8[s] | loss 0.89
    | epoch 9 |  iter 161 / 351 | time 9[s] | loss 0.94
    | epoch 9 |  iter 181 / 351 | time 10[s] | loss 0.90
    | epoch 9 |  iter 201 / 351 | time 12[s] | loss 0.90
    | epoch 9 |  iter 221 / 351 | time 13[s] | loss 0.91
    | epoch 9 |  iter 241 / 351 | time 14[s] | loss 0.89
    | epoch 9 |  iter 261 / 351 | time 15[s] | loss 0.90
    | epoch 9 |  iter 281 / 351 | time 16[s] | loss 0.90
    | epoch 9 |  iter 301 / 351 | time 17[s] | loss 0.88
    | epoch 9 |  iter 321 / 351 | time 19[s] | loss 0.87
    | epoch 9 |  iter 341 / 351 | time 20[s] | loss 0.87
    Q 77+85  
    T 162 
    ☒ 161 
    ---
    Q 975+164
    T 1139
    ☑ 1139
    ---
    Q 582+84 
    T 666 
    ☒ 667 
    ---
    Q 8+155  
    T 163 
    ☑ 163 
    ---
    Q 367+55 
    T 422 
    ☒ 427 
    ---
    Q 600+257
    T 857 
    ☒ 859 
    ---
    Q 761+292
    T 1053
    ☒ 1069
    ---
    Q 830+597
    T 1427
    ☒ 1421
    ---
    Q 26+838 
    T 864 
    ☒ 865 
    ---
    Q 143+93 
    T 236 
    ☒ 248 
    ---
    val acc 7.680%
    | epoch 10 |  iter 1 / 351 | time 0[s] | loss 0.84
    | epoch 10 |  iter 21 / 351 | time 1[s] | loss 0.86
    | epoch 10 |  iter 41 / 351 | time 2[s] | loss 0.87
    | epoch 10 |  iter 61 / 351 | time 3[s] | loss 0.87
    | epoch 10 |  iter 81 / 351 | time 4[s] | loss 0.86
    | epoch 10 |  iter 101 / 351 | time 6[s] | loss 0.86
    | epoch 10 |  iter 121 / 351 | time 7[s] | loss 0.86
    | epoch 10 |  iter 141 / 351 | time 8[s] | loss 0.87
    | epoch 10 |  iter 161 / 351 | time 9[s] | loss 0.85
    | epoch 10 |  iter 181 / 351 | time 10[s] | loss 0.88
    | epoch 10 |  iter 201 / 351 | time 11[s] | loss 0.85
    | epoch 10 |  iter 221 / 351 | time 13[s] | loss 0.86
    | epoch 10 |  iter 241 / 351 | time 14[s] | loss 0.86
    | epoch 10 |  iter 261 / 351 | time 15[s] | loss 0.85
    | epoch 10 |  iter 281 / 351 | time 16[s] | loss 0.85
    | epoch 10 |  iter 301 / 351 | time 17[s] | loss 0.84
    | epoch 10 |  iter 321 / 351 | time 18[s] | loss 0.84
    | epoch 10 |  iter 341 / 351 | time 20[s] | loss 0.84
    Q 77+85  
    T 162 
    ☒ 160 
    ---
    Q 975+164
    T 1139
    ☒ 1130
    ---
    Q 582+84 
    T 666 
    ☒ 663 
    ---
    Q 8+155  
    T 163 
    ☒ 165 
    ---
    Q 367+55 
    T 422 
    ☒ 420 
    ---
    Q 600+257
    T 857 
    ☒ 859 
    ---
    Q 761+292
    T 1053
    ☒ 1039
    ---
    Q 830+597
    T 1427
    ☒ 1409
    ---
    Q 26+838 
    T 864 
    ☒ 865 
    ---
    Q 143+93 
    T 236 
    ☒ 238 
    ---
    val acc 8.840%
    | epoch 11 |  iter 1 / 351 | time 0[s] | loss 0.80
    | epoch 11 |  iter 21 / 351 | time 1[s] | loss 0.85
    | epoch 11 |  iter 41 / 351 | time 2[s] | loss 0.83
    | epoch 11 |  iter 61 / 351 | time 3[s] | loss 0.83
    | epoch 11 |  iter 81 / 351 | time 4[s] | loss 0.83
    | epoch 11 |  iter 101 / 351 | time 6[s] | loss 0.82
    | epoch 11 |  iter 121 / 351 | time 7[s] | loss 0.82
    | epoch 11 |  iter 141 / 351 | time 8[s] | loss 0.81
    | epoch 11 |  iter 161 / 351 | time 9[s] | loss 0.81
    | epoch 11 |  iter 181 / 351 | time 11[s] | loss 0.81
    | epoch 11 |  iter 201 / 351 | time 12[s] | loss 0.81
    | epoch 11 |  iter 221 / 351 | time 13[s] | loss 0.82
    | epoch 11 |  iter 241 / 351 | time 15[s] | loss 0.81
    | epoch 11 |  iter 261 / 351 | time 16[s] | loss 0.81
    | epoch 11 |  iter 281 / 351 | time 17[s] | loss 0.82
    | epoch 11 |  iter 301 / 351 | time 18[s] | loss 0.83
    | epoch 11 |  iter 321 / 351 | time 19[s] | loss 0.80
    | epoch 11 |  iter 341 / 351 | time 21[s] | loss 0.81
    Q 77+85  
    T 162 
    ☒ 161 
    ---
    Q 975+164
    T 1139
    ☒ 1183
    ---
    Q 582+84 
    T 666 
    ☒ 658 
    ---
    Q 8+155  
    T 163 
    ☑ 163 
    ---
    Q 367+55 
    T 422 
    ☑ 422 
    ---
    Q 600+257
    T 857 
    ☒ 851 
    ---
    Q 761+292
    T 1053
    ☒ 1073
    ---
    Q 830+597
    T 1427
    ☒ 1425
    ---
    Q 26+838 
    T 864 
    ☒ 861 
    ---
    Q 143+93 
    T 236 
    ☒ 238 
    ---
    val acc 8.020%
    | epoch 12 |  iter 1 / 351 | time 0[s] | loss 0.80
    | epoch 12 |  iter 21 / 351 | time 1[s] | loss 0.79
    | epoch 12 |  iter 41 / 351 | time 2[s] | loss 0.80
    | epoch 12 |  iter 61 / 351 | time 3[s] | loss 0.80
    | epoch 12 |  iter 81 / 351 | time 4[s] | loss 0.79
    | epoch 12 |  iter 101 / 351 | time 6[s] | loss 0.79
    | epoch 12 |  iter 121 / 351 | time 7[s] | loss 0.78
    | epoch 12 |  iter 141 / 351 | time 8[s] | loss 0.79
    | epoch 12 |  iter 161 / 351 | time 9[s] | loss 0.79
    | epoch 12 |  iter 181 / 351 | time 10[s] | loss 0.82
    | epoch 12 |  iter 201 / 351 | time 11[s] | loss 0.79
    | epoch 12 |  iter 221 / 351 | time 13[s] | loss 0.77
    | epoch 12 |  iter 241 / 351 | time 14[s] | loss 0.78
    | epoch 12 |  iter 261 / 351 | time 15[s] | loss 0.78
    | epoch 12 |  iter 281 / 351 | time 16[s] | loss 0.78
    | epoch 12 |  iter 301 / 351 | time 17[s] | loss 0.77
    | epoch 12 |  iter 321 / 351 | time 19[s] | loss 0.78
    | epoch 12 |  iter 341 / 351 | time 20[s] | loss 0.77
    Q 77+85  
    T 162 
    ☒ 161 
    ---
    Q 975+164
    T 1139
    ☒ 1129
    ---
    Q 582+84 
    T 666 
    ☒ 669 
    ---
    Q 8+155  
    T 163 
    ☒ 166 
    ---
    Q 367+55 
    T 422 
    ☒ 423 
    ---
    Q 600+257
    T 857 
    ☒ 859 
    ---
    Q 761+292
    T 1053
    ☒ 1039
    ---
    Q 830+597
    T 1427
    ☒ 1421
    ---
    Q 26+838 
    T 864 
    ☒ 867 
    ---
    Q 143+93 
    T 236 
    ☒ 238 
    ---
    val acc 12.200%
    | epoch 13 |  iter 1 / 351 | time 0[s] | loss 0.75
    | epoch 13 |  iter 21 / 351 | time 1[s] | loss 0.77
    | epoch 13 |  iter 41 / 351 | time 2[s] | loss 0.75
    | epoch 13 |  iter 61 / 351 | time 3[s] | loss 0.76
    | epoch 13 |  iter 81 / 351 | time 4[s] | loss 0.76
    | epoch 13 |  iter 101 / 351 | time 5[s] | loss 0.76
    | epoch 13 |  iter 121 / 351 | time 7[s] | loss 0.79
    | epoch 13 |  iter 141 / 351 | time 8[s] | loss 0.76
    | epoch 13 |  iter 161 / 351 | time 9[s] | loss 0.75
    | epoch 13 |  iter 181 / 351 | time 10[s] | loss 0.80
    | epoch 13 |  iter 201 / 351 | time 11[s] | loss 0.76
    | epoch 13 |  iter 221 / 351 | time 13[s] | loss 0.76
    | epoch 13 |  iter 241 / 351 | time 14[s] | loss 0.75
    | epoch 13 |  iter 261 / 351 | time 15[s] | loss 0.77
    | epoch 13 |  iter 281 / 351 | time 16[s] | loss 0.75
    | epoch 13 |  iter 301 / 351 | time 17[s] | loss 0.74
    | epoch 13 |  iter 321 / 351 | time 18[s] | loss 0.74
    | epoch 13 |  iter 341 / 351 | time 20[s] | loss 0.73
    Q 77+85  
    T 162 
    ☒ 160 
    ---
    Q 975+164
    T 1139
    ☑ 1139
    ---
    Q 582+84 
    T 666 
    ☒ 664 
    ---
    Q 8+155  
    T 163 
    ☑ 163 
    ---
    Q 367+55 
    T 422 
    ☒ 420 
    ---
    Q 600+257
    T 857 
    ☒ 851 
    ---
    Q 761+292
    T 1053
    ☒ 1063
    ---
    Q 830+597
    T 1427
    ☒ 1421
    ---
    Q 26+838 
    T 864 
    ☒ 861 
    ---
    Q 143+93 
    T 236 
    ☒ 239 
    ---
    val acc 12.460%
    | epoch 14 |  iter 1 / 351 | time 0[s] | loss 0.74
    | epoch 14 |  iter 21 / 351 | time 1[s] | loss 0.73
    | epoch 14 |  iter 41 / 351 | time 2[s] | loss 0.73
    | epoch 14 |  iter 61 / 351 | time 3[s] | loss 0.72
    | epoch 14 |  iter 81 / 351 | time 4[s] | loss 0.73
    | epoch 14 |  iter 101 / 351 | time 5[s] | loss 0.74
    | epoch 14 |  iter 121 / 351 | time 7[s] | loss 0.74
    | epoch 14 |  iter 141 / 351 | time 8[s] | loss 0.72
    | epoch 14 |  iter 161 / 351 | time 9[s] | loss 0.72
    | epoch 14 |  iter 181 / 351 | time 10[s] | loss 0.71
    | epoch 14 |  iter 201 / 351 | time 11[s] | loss 0.71
    | epoch 14 |  iter 221 / 351 | time 13[s] | loss 0.73
    | epoch 14 |  iter 241 / 351 | time 14[s] | loss 0.73
    | epoch 14 |  iter 261 / 351 | time 15[s] | loss 0.72
    | epoch 14 |  iter 281 / 351 | time 16[s] | loss 0.71
    | epoch 14 |  iter 301 / 351 | time 17[s] | loss 0.71
    | epoch 14 |  iter 321 / 351 | time 18[s] | loss 0.71
    | epoch 14 |  iter 341 / 351 | time 20[s] | loss 0.70
    Q 77+85  
    T 162 
    ☑ 162 
    ---
    Q 975+164
    T 1139
    ☒ 1179
    ---
    Q 582+84 
    T 666 
    ☒ 658 
    ---
    Q 8+155  
    T 163 
    ☒ 166 
    ---
    Q 367+55 
    T 422 
    ☒ 420 
    ---
    Q 600+257
    T 857 
    ☒ 859 
    ---
    Q 761+292
    T 1053
    ☒ 1065
    ---
    Q 830+597
    T 1427
    ☒ 1418
    ---
    Q 26+838 
    T 864 
    ☒ 865 
    ---
    Q 143+93 
    T 236 
    ☒ 233 
    ---
    val acc 10.060%
    | epoch 15 |  iter 1 / 351 | time 0[s] | loss 0.73
    | epoch 15 |  iter 21 / 351 | time 1[s] | loss 0.70
    | epoch 15 |  iter 41 / 351 | time 2[s] | loss 0.71
    | epoch 15 |  iter 61 / 351 | time 3[s] | loss 0.69
    | epoch 15 |  iter 81 / 351 | time 4[s] | loss 0.70
    | epoch 15 |  iter 101 / 351 | time 5[s] | loss 0.69
    | epoch 15 |  iter 121 / 351 | time 7[s] | loss 0.69
    | epoch 15 |  iter 141 / 351 | time 8[s] | loss 0.70
    | epoch 15 |  iter 161 / 351 | time 9[s] | loss 0.71
    | epoch 15 |  iter 181 / 351 | time 10[s] | loss 0.73
    | epoch 15 |  iter 201 / 351 | time 11[s] | loss 0.73
    | epoch 15 |  iter 221 / 351 | time 13[s] | loss 0.72
    | epoch 15 |  iter 241 / 351 | time 14[s] | loss 0.71
    | epoch 15 |  iter 261 / 351 | time 15[s] | loss 0.69
    | epoch 15 |  iter 281 / 351 | time 16[s] | loss 0.71
    | epoch 15 |  iter 301 / 351 | time 17[s] | loss 0.68
    | epoch 15 |  iter 321 / 351 | time 18[s] | loss 0.69
    | epoch 15 |  iter 341 / 351 | time 20[s] | loss 0.69
    Q 77+85  
    T 162 
    ☑ 162 
    ---
    Q 975+164
    T 1139
    ☒ 1130
    ---
    Q 582+84 
    T 666 
    ☒ 668 
    ---
    Q 8+155  
    T 163 
    ☑ 163 
    ---
    Q 367+55 
    T 422 
    ☒ 423 
    ---
    Q 600+257
    T 857 
    ☒ 851 
    ---
    Q 761+292
    T 1053
    ☒ 1062
    ---
    Q 830+597
    T 1427
    ☒ 1444
    ---
    Q 26+838 
    T 864 
    ☒ 861 
    ---
    Q 143+93 
    T 236 
    ☒ 238 
    ---
    val acc 14.080%
    | epoch 16 |  iter 1 / 351 | time 0[s] | loss 0.67
    | epoch 16 |  iter 21 / 351 | time 1[s] | loss 0.68
    | epoch 16 |  iter 41 / 351 | time 2[s] | loss 0.68
    | epoch 16 |  iter 61 / 351 | time 3[s] | loss 0.67
    | epoch 16 |  iter 81 / 351 | time 4[s] | loss 0.67
    | epoch 16 |  iter 101 / 351 | time 6[s] | loss 0.68
    | epoch 16 |  iter 121 / 351 | time 7[s] | loss 0.68
    | epoch 16 |  iter 141 / 351 | time 8[s] | loss 0.67
    | epoch 16 |  iter 161 / 351 | time 9[s] | loss 0.66
    | epoch 16 |  iter 181 / 351 | time 10[s] | loss 0.67
    | epoch 16 |  iter 201 / 351 | time 11[s] | loss 0.66
    | epoch 16 |  iter 221 / 351 | time 13[s] | loss 0.66
    | epoch 16 |  iter 241 / 351 | time 14[s] | loss 0.65
    | epoch 16 |  iter 261 / 351 | time 15[s] | loss 0.66
    | epoch 16 |  iter 281 / 351 | time 16[s] | loss 0.67
    | epoch 16 |  iter 301 / 351 | time 17[s] | loss 0.65
    | epoch 16 |  iter 321 / 351 | time 19[s] | loss 0.65
    | epoch 16 |  iter 341 / 351 | time 20[s] | loss 0.64
    Q 77+85  
    T 162 
    ☑ 162 
    ---
    Q 975+164
    T 1139
    ☒ 1129
    ---
    Q 582+84 
    T 666 
    ☒ 669 
    ---
    Q 8+155  
    T 163 
    ☑ 163 
    ---
    Q 367+55 
    T 422 
    ☒ 420 
    ---
    Q 600+257
    T 857 
    ☒ 850 
    ---
    Q 761+292
    T 1053
    ☒ 1044
    ---
    Q 830+597
    T 1427
    ☒ 1418
    ---
    Q 26+838 
    T 864 
    ☒ 861 
    ---
    Q 143+93 
    T 236 
    ☒ 237 
    ---
    val acc 15.680%
    | epoch 17 |  iter 1 / 351 | time 0[s] | loss 0.65
    | epoch 17 |  iter 21 / 351 | time 1[s] | loss 0.64
    | epoch 17 |  iter 41 / 351 | time 2[s] | loss 0.70
    | epoch 17 |  iter 61 / 351 | time 3[s] | loss 0.69
    | epoch 17 |  iter 81 / 351 | time 4[s] | loss 0.64
    | epoch 17 |  iter 101 / 351 | time 5[s] | loss 0.63
    | epoch 17 |  iter 121 / 351 | time 7[s] | loss 0.64
    | epoch 17 |  iter 141 / 351 | time 8[s] | loss 0.64
    | epoch 17 |  iter 161 / 351 | time 9[s] | loss 0.65
    | epoch 17 |  iter 181 / 351 | time 10[s] | loss 0.65
    | epoch 17 |  iter 201 / 351 | time 11[s] | loss 0.64
    | epoch 17 |  iter 221 / 351 | time 13[s] | loss 0.64
    | epoch 17 |  iter 241 / 351 | time 14[s] | loss 0.65
    | epoch 17 |  iter 261 / 351 | time 15[s] | loss 0.65
    | epoch 17 |  iter 281 / 351 | time 16[s] | loss 0.64
    | epoch 17 |  iter 301 / 351 | time 17[s] | loss 0.64
    | epoch 17 |  iter 321 / 351 | time 19[s] | loss 0.64
    | epoch 17 |  iter 341 / 351 | time 20[s] | loss 0.66
    Q 77+85  
    T 162 
    ☑ 162 
    ---
    Q 975+164
    T 1139
    ☒ 1138
    ---
    Q 582+84 
    T 666 
    ☑ 666 
    ---
    Q 8+155  
    T 163 
    ☒ 166 
    ---
    Q 367+55 
    T 422 
    ☒ 420 
    ---
    Q 600+257
    T 857 
    ☒ 859 
    ---
    Q 761+292
    T 1053
    ☒ 1044
    ---
    Q 830+597
    T 1427
    ☒ 1424
    ---
    Q 26+838 
    T 864 
    ☒ 861 
    ---
    Q 143+93 
    T 236 
    ☒ 238 
    ---
    val acc 16.240%
    | epoch 18 |  iter 1 / 351 | time 0[s] | loss 0.62
    | epoch 18 |  iter 21 / 351 | time 1[s] | loss 0.62
    | epoch 18 |  iter 41 / 351 | time 2[s] | loss 0.62
    | epoch 18 |  iter 61 / 351 | time 3[s] | loss 0.62
    | epoch 18 |  iter 81 / 351 | time 4[s] | loss 0.64
    | epoch 18 |  iter 101 / 351 | time 6[s] | loss 0.64
    | epoch 18 |  iter 121 / 351 | time 7[s] | loss 0.61
    | epoch 18 |  iter 141 / 351 | time 8[s] | loss 0.65
    | epoch 18 |  iter 161 / 351 | time 9[s] | loss 0.65
    | epoch 18 |  iter 181 / 351 | time 10[s] | loss 0.62
    | epoch 18 |  iter 201 / 351 | time 12[s] | loss 0.61
    | epoch 18 |  iter 221 / 351 | time 13[s] | loss 0.61
    | epoch 18 |  iter 241 / 351 | time 14[s] | loss 0.64
    | epoch 18 |  iter 261 / 351 | time 15[s] | loss 0.61
    | epoch 18 |  iter 281 / 351 | time 16[s] | loss 0.61
    | epoch 18 |  iter 301 / 351 | time 17[s] | loss 0.61
    | epoch 18 |  iter 321 / 351 | time 19[s] | loss 0.61
    | epoch 18 |  iter 341 / 351 | time 20[s] | loss 0.61
    Q 77+85  
    T 162 
    ☑ 162 
    ---
    Q 975+164
    T 1139
    ☒ 1143
    ---
    Q 582+84 
    T 666 
    ☒ 661 
    ---
    Q 8+155  
    T 163 
    ☒ 162 
    ---
    Q 367+55 
    T 422 
    ☑ 422 
    ---
    Q 600+257
    T 857 
    ☒ 851 
    ---
    Q 761+292
    T 1053
    ☒ 1049
    ---
    Q 830+597
    T 1427
    ☒ 1424
    ---
    Q 26+838 
    T 864 
    ☒ 867 
    ---
    Q 143+93 
    T 236 
    ☒ 239 
    ---
    val acc 16.620%
    | epoch 19 |  iter 1 / 351 | time 0[s] | loss 0.62
    | epoch 19 |  iter 21 / 351 | time 1[s] | loss 0.63
    | epoch 19 |  iter 41 / 351 | time 2[s] | loss 0.61
    | epoch 19 |  iter 61 / 351 | time 3[s] | loss 0.59
    | epoch 19 |  iter 81 / 351 | time 4[s] | loss 0.60
    | epoch 19 |  iter 101 / 351 | time 5[s] | loss 0.61
    | epoch 19 |  iter 121 / 351 | time 7[s] | loss 0.60
    | epoch 19 |  iter 141 / 351 | time 8[s] | loss 0.59
    | epoch 19 |  iter 161 / 351 | time 9[s] | loss 0.60
    | epoch 19 |  iter 181 / 351 | time 10[s] | loss 0.59
    | epoch 19 |  iter 201 / 351 | time 11[s] | loss 0.58
    | epoch 19 |  iter 221 / 351 | time 13[s] | loss 0.60
    | epoch 19 |  iter 241 / 351 | time 14[s] | loss 0.58
    | epoch 19 |  iter 261 / 351 | time 15[s] | loss 0.59
    | epoch 19 |  iter 281 / 351 | time 16[s] | loss 0.59
    | epoch 19 |  iter 301 / 351 | time 17[s] | loss 0.59
    | epoch 19 |  iter 321 / 351 | time 19[s] | loss 0.59
    | epoch 19 |  iter 341 / 351 | time 20[s] | loss 0.59
    Q 77+85  
    T 162 
    ☑ 162 
    ---
    Q 975+164
    T 1139
    ☒ 1129
    ---
    Q 582+84 
    T 666 
    ☑ 666 
    ---
    Q 8+155  
    T 163 
    ☑ 163 
    ---
    Q 367+55 
    T 422 
    ☒ 423 
    ---
    Q 600+257
    T 857 
    ☒ 852 
    ---
    Q 761+292
    T 1053
    ☑ 1053
    ---
    Q 830+597
    T 1427
    ☒ 1421
    ---
    Q 26+838 
    T 864 
    ☒ 867 
    ---
    Q 143+93 
    T 236 
    ☒ 235 
    ---
    val acc 19.760%
    | epoch 20 |  iter 1 / 351 | time 0[s] | loss 0.56
    | epoch 20 |  iter 21 / 351 | time 1[s] | loss 0.58
    | epoch 20 |  iter 41 / 351 | time 2[s] | loss 0.59
    | epoch 20 |  iter 61 / 351 | time 3[s] | loss 0.58
    | epoch 20 |  iter 81 / 351 | time 5[s] | loss 0.59
    | epoch 20 |  iter 101 / 351 | time 6[s] | loss 0.57
    | epoch 20 |  iter 121 / 351 | time 7[s] | loss 0.58
    | epoch 20 |  iter 141 / 351 | time 8[s] | loss 0.60
    | epoch 20 |  iter 161 / 351 | time 9[s] | loss 0.63
    | epoch 20 |  iter 181 / 351 | time 10[s] | loss 0.58
    | epoch 20 |  iter 201 / 351 | time 12[s] | loss 0.59
    | epoch 20 |  iter 221 / 351 | time 13[s] | loss 0.58
    | epoch 20 |  iter 241 / 351 | time 14[s] | loss 0.62
    | epoch 20 |  iter 261 / 351 | time 15[s] | loss 0.62
    | epoch 20 |  iter 281 / 351 | time 16[s] | loss 0.61
    | epoch 20 |  iter 301 / 351 | time 18[s] | loss 0.60
    | epoch 20 |  iter 321 / 351 | time 19[s] | loss 0.61
    | epoch 20 |  iter 341 / 351 | time 20[s] | loss 0.56
    Q 77+85  
    T 162 
    ☑ 162 
    ---
    Q 975+164
    T 1139
    ☒ 1141
    ---
    Q 582+84 
    T 666 
    ☒ 665 
    ---
    Q 8+155  
    T 163 
    ☒ 164 
    ---
    Q 367+55 
    T 422 
    ☑ 422 
    ---
    Q 600+257
    T 857 
    ☒ 852 
    ---
    Q 761+292
    T 1053
    ☑ 1053
    ---
    Q 830+597
    T 1427
    ☒ 1424
    ---
    Q 26+838 
    T 864 
    ☒ 862 
    ---
    Q 143+93 
    T 236 
    ☒ 235 
    ---
    val acc 22.020%
    | epoch 21 |  iter 1 / 351 | time 0[s] | loss 0.55
    | epoch 21 |  iter 21 / 351 | time 1[s] | loss 0.56
    | epoch 21 |  iter 41 / 351 | time 2[s] | loss 0.56
    | epoch 21 |  iter 61 / 351 | time 3[s] | loss 0.57
    | epoch 21 |  iter 81 / 351 | time 4[s] | loss 0.56
    | epoch 21 |  iter 101 / 351 | time 5[s] | loss 0.56
    | epoch 21 |  iter 121 / 351 | time 7[s] | loss 0.58
    | epoch 21 |  iter 141 / 351 | time 8[s] | loss 0.58
    | epoch 21 |  iter 161 / 351 | time 9[s] | loss 0.55
    | epoch 21 |  iter 181 / 351 | time 10[s] | loss 0.57
    | epoch 21 |  iter 201 / 351 | time 11[s] | loss 0.56
    | epoch 21 |  iter 221 / 351 | time 13[s] | loss 0.56
    | epoch 21 |  iter 241 / 351 | time 14[s] | loss 0.55
    | epoch 21 |  iter 261 / 351 | time 15[s] | loss 0.56
    | epoch 21 |  iter 281 / 351 | time 16[s] | loss 0.58
    | epoch 21 |  iter 301 / 351 | time 17[s] | loss 0.56
    | epoch 21 |  iter 321 / 351 | time 18[s] | loss 0.55
    | epoch 21 |  iter 341 / 351 | time 20[s] | loss 0.57
    Q 77+85  
    T 162 
    ☑ 162 
    ---
    Q 975+164
    T 1139
    ☒ 1144
    ---
    Q 582+84 
    T 666 
    ☒ 667 
    ---
    Q 8+155  
    T 163 
    ☒ 165 
    ---
    Q 367+55 
    T 422 
    ☒ 423 
    ---
    Q 600+257
    T 857 
    ☒ 850 
    ---
    Q 761+292
    T 1053
    ☒ 1055
    ---
    Q 830+597
    T 1427
    ☑ 1427
    ---
    Q 26+838 
    T 864 
    ☒ 867 
    ---
    Q 143+93 
    T 236 
    ☒ 237 
    ---
    val acc 14.560%
    | epoch 22 |  iter 1 / 351 | time 0[s] | loss 0.61
    | epoch 22 |  iter 21 / 351 | time 1[s] | loss 0.58
    | epoch 22 |  iter 41 / 351 | time 2[s] | loss 0.57
    | epoch 22 |  iter 61 / 351 | time 3[s] | loss 0.58
    | epoch 22 |  iter 81 / 351 | time 4[s] | loss 0.57
    | epoch 22 |  iter 101 / 351 | time 5[s] | loss 0.58
    | epoch 22 |  iter 121 / 351 | time 7[s] | loss 0.56
    | epoch 22 |  iter 141 / 351 | time 8[s] | loss 0.54
    | epoch 22 |  iter 161 / 351 | time 9[s] | loss 0.55
    | epoch 22 |  iter 181 / 351 | time 10[s] | loss 0.55
    | epoch 22 |  iter 201 / 351 | time 11[s] | loss 0.54
    | epoch 22 |  iter 221 / 351 | time 12[s] | loss 0.53
    | epoch 22 |  iter 241 / 351 | time 14[s] | loss 0.53
    | epoch 22 |  iter 261 / 351 | time 15[s] | loss 0.54
    | epoch 22 |  iter 281 / 351 | time 16[s] | loss 0.54
    | epoch 22 |  iter 301 / 351 | time 17[s] | loss 0.54
    | epoch 22 |  iter 321 / 351 | time 18[s] | loss 0.54
    | epoch 22 |  iter 341 / 351 | time 20[s] | loss 0.54
    Q 77+85  
    T 162 
    ☑ 162 
    ---
    Q 975+164
    T 1139
    ☒ 1141
    ---
    Q 582+84 
    T 666 
    ☒ 665 
    ---
    Q 8+155  
    T 163 
    ☑ 163 
    ---
    Q 367+55 
    T 422 
    ☒ 421 
    ---
    Q 600+257
    T 857 
    ☒ 859 
    ---
    Q 761+292
    T 1053
    ☒ 1050
    ---
    Q 830+597
    T 1427
    ☒ 1424
    ---
    Q 26+838 
    T 864 
    ☒ 865 
    ---
    Q 143+93 
    T 236 
    ☒ 235 
    ---
    val acc 24.840%
    | epoch 23 |  iter 1 / 351 | time 0[s] | loss 0.52
    | epoch 23 |  iter 21 / 351 | time 1[s] | loss 0.53
    | epoch 23 |  iter 41 / 351 | time 2[s] | loss 0.53
    | epoch 23 |  iter 61 / 351 | time 3[s] | loss 0.57
    | epoch 23 |  iter 81 / 351 | time 4[s] | loss 0.57
    | epoch 23 |  iter 101 / 351 | time 5[s] | loss 0.56
    | epoch 23 |  iter 121 / 351 | time 7[s] | loss 0.51
    | epoch 23 |  iter 141 / 351 | time 8[s] | loss 0.53
    | epoch 23 |  iter 161 / 351 | time 9[s] | loss 0.54
    | epoch 23 |  iter 181 / 351 | time 10[s] | loss 0.54
    | epoch 23 |  iter 201 / 351 | time 11[s] | loss 0.53
    | epoch 23 |  iter 221 / 351 | time 13[s] | loss 0.52
    | epoch 23 |  iter 241 / 351 | time 14[s] | loss 0.53
    | epoch 23 |  iter 261 / 351 | time 15[s] | loss 0.55
    | epoch 23 |  iter 281 / 351 | time 16[s] | loss 0.53
    | epoch 23 |  iter 301 / 351 | time 17[s] | loss 0.52
    | epoch 23 |  iter 321 / 351 | time 18[s] | loss 0.52
    | epoch 23 |  iter 341 / 351 | time 20[s] | loss 0.52
    Q 77+85  
    T 162 
    ☑ 162 
    ---
    Q 975+164
    T 1139
    ☒ 1143
    ---
    Q 582+84 
    T 666 
    ☑ 666 
    ---
    Q 8+155  
    T 163 
    ☑ 163 
    ---
    Q 367+55 
    T 422 
    ☒ 420 
    ---
    Q 600+257
    T 857 
    ☑ 857 
    ---
    Q 761+292
    T 1053
    ☒ 1055
    ---
    Q 830+597
    T 1427
    ☒ 1424
    ---
    Q 26+838 
    T 864 
    ☒ 862 
    ---
    Q 143+93 
    T 236 
    ☒ 233 
    ---
    val acc 25.740%
    | epoch 24 |  iter 1 / 351 | time 0[s] | loss 0.50
    | epoch 24 |  iter 21 / 351 | time 1[s] | loss 0.51
    | epoch 24 |  iter 41 / 351 | time 2[s] | loss 0.54
    | epoch 24 |  iter 61 / 351 | time 3[s] | loss 0.50
    | epoch 24 |  iter 81 / 351 | time 4[s] | loss 0.51
    | epoch 24 |  iter 101 / 351 | time 5[s] | loss 0.52
    | epoch 24 |  iter 121 / 351 | time 7[s] | loss 0.53
    | epoch 24 |  iter 141 / 351 | time 8[s] | loss 0.51
    | epoch 24 |  iter 161 / 351 | time 9[s] | loss 0.55
    | epoch 24 |  iter 181 / 351 | time 10[s] | loss 0.52
    | epoch 24 |  iter 201 / 351 | time 12[s] | loss 0.51
    | epoch 24 |  iter 221 / 351 | time 13[s] | loss 0.51
    | epoch 24 |  iter 241 / 351 | time 14[s] | loss 0.52
    | epoch 24 |  iter 261 / 351 | time 15[s] | loss 0.52
    | epoch 24 |  iter 281 / 351 | time 17[s] | loss 0.52
    | epoch 24 |  iter 301 / 351 | time 18[s] | loss 0.51
    | epoch 24 |  iter 321 / 351 | time 19[s] | loss 0.51
    | epoch 24 |  iter 341 / 351 | time 20[s] | loss 0.50
    Q 77+85  
    T 162 
    ☒ 165 
    ---
    Q 975+164
    T 1139
    ☒ 1140
    ---
    Q 582+84 
    T 666 
    ☒ 669 
    ---
    Q 8+155  
    T 163 
    ☑ 163 
    ---
    Q 367+55 
    T 422 
    ☒ 423 
    ---
    Q 600+257
    T 857 
    ☑ 857 
    ---
    Q 761+292
    T 1053
    ☒ 1055
    ---
    Q 830+597
    T 1427
    ☑ 1427
    ---
    Q 26+838 
    T 864 
    ☒ 865 
    ---
    Q 143+93 
    T 236 
    ☒ 235 
    ---
    val acc 25.760%
    | epoch 25 |  iter 1 / 351 | time 0[s] | loss 0.49
    | epoch 25 |  iter 21 / 351 | time 1[s] | loss 0.48
    | epoch 25 |  iter 41 / 351 | time 2[s] | loss 0.49
    | epoch 25 |  iter 61 / 351 | time 3[s] | loss 0.49
    | epoch 25 |  iter 81 / 351 | time 4[s] | loss 0.49
    | epoch 25 |  iter 101 / 351 | time 6[s] | loss 0.49
    | epoch 25 |  iter 121 / 351 | time 7[s] | loss 0.50
    | epoch 25 |  iter 141 / 351 | time 8[s] | loss 0.52
    | epoch 25 |  iter 161 / 351 | time 9[s] | loss 0.49
    | epoch 25 |  iter 181 / 351 | time 10[s] | loss 0.49
    | epoch 25 |  iter 201 / 351 | time 11[s] | loss 0.50
    | epoch 25 |  iter 221 / 351 | time 13[s] | loss 0.52
    | epoch 25 |  iter 241 / 351 | time 14[s] | loss 0.55
    | epoch 25 |  iter 261 / 351 | time 15[s] | loss 0.53
    | epoch 25 |  iter 281 / 351 | time 16[s] | loss 0.53
    | epoch 25 |  iter 301 / 351 | time 17[s] | loss 0.53
    | epoch 25 |  iter 321 / 351 | time 18[s] | loss 0.53
    | epoch 25 |  iter 341 / 351 | time 20[s] | loss 0.53
    Q 77+85  
    T 162 
    ☒ 161 
    ---
    Q 975+164
    T 1139
    ☒ 1141
    ---
    Q 582+84 
    T 666 
    ☑ 666 
    ---
    Q 8+155  
    T 163 
    ☒ 164 
    ---
    Q 367+55 
    T 422 
    ☑ 422 
    ---
    Q 600+257
    T 857 
    ☒ 859 
    ---
    Q 761+292
    T 1053
    ☒ 1055
    ---
    Q 830+597
    T 1427
    ☒ 1425
    ---
    Q 26+838 
    T 864 
    ☒ 862 
    ---
    Q 143+93 
    T 236 
    ☒ 238 
    ---
    val acc 28.500%
    

    • 最初は100とか1200とかしか答えられないのにepochが進むにつれて表現力があがって正解できるようになる(かわいい)
    • そしてやはり最後のひと桁が合わない

    %python3
    # Draw the values stored in acc_list as a line chart.
    # NOTE(review): acc_list is presumably the per-epoch validation accuracy
    # (a fraction, not a percentage) from the preceding run -- confirm.
    # Pin the y-axis to the range [0, 1] so separate runs share the same scale.
    plt.ylim(0, 1)
    plt.plot(acc_list)
    [<matplotlib.lines.Line2D object at 0x7fb61fa29c18>]
    

    %python3
    # Remember the accuracy history of the run above under a separate name.
    # This binds the same object (no copy is made), so any later in-place
    # change to acc_list would also be visible through acc_list_baseline.
    acc_list_baseline = acc_list

    改善: 入力を反転させる

    入力系列を反転させるだけで精度が上がるらしい。反転すると入力の先頭部分と出力の先頭部分の距離が近くなり、勾配が伝わりやすくなるためと考えられている。

    %python3
    # Load the addition dataset again, then flip every sample along axis 1
    # so that each question is fed to the model in reverse order.
    (x_train, t_train), (x_test, t_test) = sequence.load_data('addition.txt')
    x_train = x_train[:, ::-1]
    x_test = x_test[:, ::-1]

    # Build a brand-new model, optimizer and trainer for this experiment.
    model = Seq2Seq(vocab_size, wordvec_size, hidden_size)
    optimizer = Adam()
    trainer = Trainer(model, optimizer)

    # One accuracy value is appended per epoch.
    acc_list_reversed = []
    for epoch in range(max_epoch):
        # Train for exactly one epoch per outer iteration so the model can
        # be evaluated on the test set after every epoch.
        trainer.fit(x_train, t_train, max_epoch=1,
                    batch_size=batch_size, max_grad=max_grad)

        # Evaluate the test samples one at a time. Indexing with [[i]]
        # keeps the leading axis (length 1) instead of dropping it.
        # Only the first ten samples are evaluated in verbose mode.
        n_test = len(x_test)
        correct_num = 0
        for i in range(n_test):
            correct_num += eval_seq2seq(
                model, x_test[[i]], t_test[[i]], id_to_char, i < 10)

        # Fraction of correctly answered test samples for this epoch.
        acc = float(correct_num) / n_test
        acc_list_reversed.append(acc)
        print('val acc %.3f%%' % (acc * 100))
    | epoch 1 |  iter 1 / 351 | time 0[s] | loss 2.56
    | epoch 1 |  iter 21 / 351 | time 1[s] | loss 2.43
    | epoch 1 |  iter 41 / 351 | time 2[s] | loss 2.07
    | epoch 1 |  iter 61 / 351 | time 3[s] | loss 1.94
    | epoch 1 |  iter 81 / 351 | time 4[s] | loss 1.87
    | epoch 1 |  iter 101 / 351 | time 5[s] | loss 1.81
    | epoch 1 |  iter 121 / 351 | time 6[s] | loss 1.79
    | epoch 1 |  iter 141 / 351 | time 8[s] | loss 1.77
    | epoch 1 |  iter 161 / 351 | time 9[s] | loss 1.76
    | epoch 1 |  iter 181 / 351 | time 10[s] | loss 1.75
    | epoch 1 |  iter 201 / 351 | time 11[s] | loss 1.75
    | epoch 1 |  iter 221 / 351 | time 12[s] | loss 1.74
    | epoch 1 |  iter 241 / 351 | time 13[s] | loss 1.73
    | epoch 1 |  iter 261 / 351 | time 14[s] | loss 1.72
    | epoch 1 |  iter 281 / 351 | time 16[s] | loss 1.71
    | epoch 1 |  iter 301 / 351 | time 17[s] | loss 1.71
    | epoch 1 |  iter 321 / 351 | time 18[s] | loss 1.71
    | epoch 1 |  iter 341 / 351 | time 19[s] | loss 1.70
    Q   58+77
    T 162 
    ☒ 100 
    ---
    Q 461+579
    T 1139
    ☒ 1000
    ---
    Q  48+285
    T 666 
    ☒ 700 
    ---
    Q   551+8
    T 163 
    ☒ 101 
    ---
    Q  55+763
    T 422 
    ☒ 700 
    ---
    Q 752+006
    T 857 
    ☒ 1000
    ---
    Q 292+167
    T 1053
    ☒ 1000
    ---
    Q 795+038
    T 1427
    ☒ 1101
    ---
    Q  838+62
    T 864 
    ☒ 710 
    ---
    Q  39+341
    T 236 
    ☒ 211 
    ---
    val acc 0.360%
    | epoch 2 |  iter 1 / 351 | time 0[s] | loss 1.69
    | epoch 2 |  iter 21 / 351 | time 1[s] | loss 1.67
    | epoch 2 |  iter 41 / 351 | time 2[s] | loss 1.68
    | epoch 2 |  iter 61 / 351 | time 3[s] | loss 1.66
    | epoch 2 |  iter 81 / 351 | time 4[s] | loss 1.65
    | epoch 2 |  iter 101 / 351 | time 5[s] | loss 1.64
    | epoch 2 |  iter 121 / 351 | time 6[s] | loss 1.63
    | epoch 2 |  iter 141 / 351 | time 8[s] | loss 1.62
    | epoch 2 |  iter 161 / 351 | time 9[s] | loss 1.61
    | epoch 2 |  iter 181 / 351 | time 10[s] | loss 1.60
    | epoch 2 |  iter 201 / 351 | time 11[s] | loss 1.59
    | epoch 2 |  iter 221 / 351 | time 12[s] | loss 1.58
    | epoch 2 |  iter 241 / 351 | time 13[s] | loss 1.56
    | epoch 2 |  iter 261 / 351 | time 14[s] | loss 1.55
    | epoch 2 |  iter 281 / 351 | time 16[s] | loss 1.54
    | epoch 2 |  iter 301 / 351 | time 17[s] | loss 1.52
    | epoch 2 |  iter 321 / 351 | time 18[s] | loss 1.50
    | epoch 2 |  iter 341 / 351 | time 19[s] | loss 1.48
    Q   58+77
    T 162 
    ☒ 145 
    ---
    Q 461+579
    T 1139
    ☒ 1004
    ---
    Q  48+285
    T 666 
    ☒ 544 
    ---
    Q   551+8
    T 163 
    ☒ 124 
    ---
    Q  55+763
    T 422 
    ☒ 300 
    ---
    Q 752+006
    T 857 
    ☒ 800 
    ---
    Q 292+167
    T 1053
    ☒ 1000
    ---
    Q 795+038
    T 1427
    ☒ 1574
    ---
    Q  838+62
    T 864 
    ☒ 700 
    ---
    Q  39+341
    T 236 
    ☒ 300 
    ---
    val acc 0.660%
    | epoch 3 |  iter 1 / 351 | time 0[s] | loss 1.45
    | epoch 3 |  iter 21 / 351 | time 1[s] | loss 1.45
    | epoch 3 |  iter 41 / 351 | time 2[s] | loss 1.43
    | epoch 3 |  iter 61 / 351 | time 3[s] | loss 1.41
    | epoch 3 |  iter 81 / 351 | time 4[s] | loss 1.39
    | epoch 3 |  iter 101 / 351 | time 5[s] | loss 1.38
    | epoch 3 |  iter 121 / 351 | time 7[s] | loss 1.36
    | epoch 3 |  iter 141 / 351 | time 8[s] | loss 1.35
    | epoch 3 |  iter 161 / 351 | time 9[s] | loss 1.33
    | epoch 3 |  iter 181 / 351 | time 10[s] | loss 1.31
    | epoch 3 |  iter 201 / 351 | time 11[s] | loss 1.30
    | epoch 3 |  iter 221 / 351 | time 12[s] | loss 1.28
    | epoch 3 |  iter 241 / 351 | time 14[s] | loss 1.27
    | epoch 3 |  iter 261 / 351 | time 15[s] | loss 1.26
    | epoch 3 |  iter 281 / 351 | time 16[s] | loss 1.23
    | epoch 3 |  iter 301 / 351 | time 17[s] | loss 1.23
    | epoch 3 |  iter 321 / 351 | time 18[s] | loss 1.21
    | epoch 3 |  iter 341 / 351 | time 20[s] | loss 1.20
    Q   58+77
    T 162 
    ☒ 158 
    ---
    Q 461+579
    T 1139
    ☒ 1148
    ---
    Q  48+285
    T 666 
    ☒ 664 
    ---
    Q   551+8
    T 163 
    ☒ 164 
    ---
    Q  55+763
    T 422 
    ☒ 408 
    ---
    Q 752+006
    T 857 
    ☒ 878 
    ---
    Q 292+167
    T 1053
    ☒ 1024
    ---
    Q 795+038
    T 1427
    ☒ 1448
    ---
    Q  838+62
    T 864 
    ☒ 875 
    ---
    Q  39+341
    T 236 
    ☒ 238 
    ---
    val acc 3.300%
    | epoch 4 |  iter 1 / 351 | time 0[s] | loss 1.20
    | epoch 4 |  iter 21 / 351 | time 1[s] | loss 1.17
    | epoch 4 |  iter 41 / 351 | time 2[s] | loss 1.14
    | epoch 4 |  iter 61 / 351 | time 3[s] | loss 1.13
    | epoch 4 |  iter 81 / 351 | time 4[s] | loss 1.12
    | epoch 4 |  iter 101 / 351 | time 5[s] | loss 1.10
    | epoch 4 |  iter 121 / 351 | time 7[s] | loss 1.08
    | epoch 4 |  iter 141 / 351 | time 8[s] | loss 1.07
    | epoch 4 |  iter 161 / 351 | time 9[s] | loss 1.05
    | epoch 4 |  iter 181 / 351 | time 10[s] | loss 1.04
    | epoch 4 |  iter 201 / 351 | time 11[s] | loss 1.02
    | epoch 4 |  iter 221 / 351 | time 12[s] | loss 1.01
    | epoch 4 |  iter 241 / 351 | time 14[s] | loss 0.98
    | epoch 4 |  iter 261 / 351 | time 15[s] | loss 0.98
    | epoch 4 |  iter 281 / 351 | time 16[s] | loss 0.97
    | epoch 4 |  iter 301 / 351 | time 17[s] | loss 0.95
    | epoch 4 |  iter 321 / 351 | time 18[s] | loss 0.94
    | epoch 4 |  iter 341 / 351 | time 20[s] | loss 0.93
    Q   58+77
    T 162 
    ☒ 158 
    ---
    Q 461+579
    T 1139
    ☒ 1222
    ---
    Q  48+285
    T 666 
    ☑ 666 
    ---
    Q   551+8
    T 163 
    ☒ 156 
    ---
    Q  55+763
    T 422 
    ☑ 422 
    ---
    Q 752+006
    T 857 
    ☒ 862 
    ---
    Q 292+167
    T 1053
    ☒ 1153
    ---
    Q 795+038
    T 1427
    ☒ 1428
    ---
    Q  838+62
    T 864 
    ☒ 862 
    ---
    Q  39+341
    T 236 
    ☒ 238 
    ---
    val acc 7.860%
    | epoch 5 |  iter 1 / 351 | time 0[s] | loss 0.93
    | epoch 5 |  iter 21 / 351 | time 1[s] | loss 0.90
    | epoch 5 |  iter 41 / 351 | time 2[s] | loss 0.90
    | epoch 5 |  iter 61 / 351 | time 3[s] | loss 0.88
    | epoch 5 |  iter 81 / 351 | time 4[s] | loss 0.88
    | epoch 5 |  iter 101 / 351 | time 5[s] | loss 0.86
    | epoch 5 |  iter 121 / 351 | time 7[s] | loss 0.87
    | epoch 5 |  iter 141 / 351 | time 8[s] | loss 0.86
    | epoch 5 |  iter 161 / 351 | time 9[s] | loss 0.84
    | epoch 5 |  iter 181 / 351 | time 10[s] | loss 0.84
    | epoch 5 |  iter 201 / 351 | time 11[s] | loss 0.83
    | epoch 5 |  iter 221 / 351 | time 12[s] | loss 0.82
    | epoch 5 |  iter 241 / 351 | time 14[s] | loss 0.81
    | epoch 5 |  iter 261 / 351 | time 15[s] | loss 0.80
    | epoch 5 |  iter 281 / 351 | time 16[s] | loss 0.80
    | epoch 5 |  iter 301 / 351 | time 17[s] | loss 0.79
    | epoch 5 |  iter 321 / 351 | time 18[s] | loss 0.78
    | epoch 5 |  iter 341 / 351 | time 19[s] | loss 0.78
    Q   58+77
    T 162 
    ☒ 163 
    ---
    Q 461+579
    T 1139
    ☒ 1134
    ---
    Q  48+285
    T 666 
    ☒ 662 
    ---
    Q   551+8
    T 163 
    ☒ 156 
    ---
    Q  55+763
    T 422 
    ☑ 422 
    ---
    Q 752+006
    T 857 
    ☒ 855 
    ---
    Q 292+167
    T 1053
    ☒ 1052
    ---
    Q 795+038
    T 1427
    ☒ 1425
    ---
    Q  838+62
    T 864 
    ☑ 864 
    ---
    Q  39+341
    T 236 
    ☒ 231 
    ---
    val acc 12.480%
    | epoch 6 |  iter 1 / 351 | time 0[s] | loss 0.78
    | epoch 6 |  iter 21 / 351 | time 1[s] | loss 0.75
    | epoch 6 |  iter 41 / 351 | time 2[s] | loss 0.75
    | epoch 6 |  iter 61 / 351 | time 3[s] | loss 0.75
    | epoch 6 |  iter 81 / 351 | time 4[s] | loss 0.74
    | epoch 6 |  iter 101 / 351 | time 5[s] | loss 0.74
    | epoch 6 |  iter 121 / 351 | time 7[s] | loss 0.73
    | epoch 6 |  iter 141 / 351 | time 8[s] | loss 0.73
    | epoch 6 |  iter 161 / 351 | time 9[s] | loss 0.72
    | epoch 6 |  iter 181 / 351 | time 10[s] | loss 0.72
    | epoch 6 |  iter 201 / 351 | time 11[s] | loss 0.72
    | epoch 6 |  iter 221 / 351 | time 12[s] | loss 0.72
    | epoch 6 |  iter 241 / 351 | time 14[s] | loss 0.71
    | epoch 6 |  iter 261 / 351 | time 15[s] | loss 0.70
    | epoch 6 |  iter 281 / 351 | time 16[s] | loss 0.69
    | epoch 6 |  iter 301 / 351 | time 17[s] | loss 0.69
    | epoch 6 |  iter 321 / 351 | time 18[s] | loss 0.68
    | epoch 6 |  iter 341 / 351 | time 20[s] | loss 0.68
    Q   58+77
    T 162 
    ☒ 160 
    ---
    Q 461+579
    T 1139
    ☒ 1137
    ---
    Q  48+285
    T 666 
    ☑ 666 
    ---
    Q   551+8
    T 163 
    ☒ 160 
    ---
    Q  55+763
    T 422 
    ☑ 422 
    ---
    Q 752+006
    T 857 
    ☒ 855 
    ---
    Q 292+167
    T 1053
    ☒ 1054
    ---
    Q 795+038
    T 1427
    ☒ 1426
    ---
    Q  838+62
    T 864 
    ☒ 861 
    ---
    Q  39+341
    T 236 
    ☒ 239 
    ---
    val acc 16.960%
    | epoch 7 |  iter 1 / 351 | time 0[s] | loss 0.65
    | epoch 7 |  iter 21 / 351 | time 1[s] | loss 0.66
    | epoch 7 |  iter 41 / 351 | time 2[s] | loss 0.66
    | epoch 7 |  iter 61 / 351 | time 3[s] | loss 0.65
    | epoch 7 |  iter 81 / 351 | time 4[s] | loss 0.64
    | epoch 7 |  iter 101 / 351 | time 5[s] | loss 0.65
    | epoch 7 |  iter 121 / 351 | time 7[s] | loss 0.65
    | epoch 7 |  iter 141 / 351 | time 8[s] | loss 0.64
    | epoch 7 |  iter 161 / 351 | time 9[s] | loss 0.63
    | epoch 7 |  iter 181 / 351 | time 10[s] | loss 0.63
    | epoch 7 |  iter 201 / 351 | time 11[s] | loss 0.63
    | epoch 7 |  iter 221 / 351 | time 13[s] | loss 0.63
    | epoch 7 |  iter 241 / 351 | time 14[s] | loss 0.63
    | epoch 7 |  iter 261 / 351 | time 15[s] | loss 0.62
    | epoch 7 |  iter 281 / 351 | time 16[s] | loss 0.62
    | epoch 7 |  iter 301 / 351 | time 17[s] | loss 0.61
    | epoch 7 |  iter 321 / 351 | time 18[s] | loss 0.61
    | epoch 7 |  iter 341 / 351 | time 20[s] | loss 0.61
    Q   58+77
    T 162 
    ☒ 160 
    ---
    Q 461+579
    T 1139
    ☒ 1140
    ---
    Q  48+285
    T 666 
    ☒ 667 
    ---
    Q   551+8
    T 163 
    ☒ 160 
    ---
    Q  55+763
    T 422 
    ☒ 420 
    ---
    Q 752+006
    T 857 
    ☒ 855 
    ---
    Q 292+167
    T 1053
    ☒ 1054
    ---
    Q 795+038
    T 1427
    ☒ 1428
    ---
    Q  838+62
    T 864 
    ☒ 863 
    ---
    Q  39+341
    T 236 
    ☒ 239 
    ---
    val acc 16.400%
    | epoch 8 |  iter 1 / 351 | time 0[s] | loss 0.62
    | epoch 8 |  iter 21 / 351 | time 1[s] | loss 0.60
    | epoch 8 |  iter 41 / 351 | time 2[s] | loss 0.59
    | epoch 8 |  iter 61 / 351 | time 3[s] | loss 0.59
    | epoch 8 |  iter 81 / 351 | time 4[s] | loss 0.59
    | epoch 8 |  iter 101 / 351 | time 6[s] | loss 0.58
    | epoch 8 |  iter 121 / 351 | time 7[s] | loss 0.58
    | epoch 8 |  iter 141 / 351 | time 8[s] | loss 0.58
    | epoch 8 |  iter 161 / 351 | time 9[s] | loss 0.58
    | epoch 8 |  iter 181 / 351 | time 10[s] | loss 0.58
    | epoch 8 |  iter 201 / 351 | time 11[s] | loss 0.57
    | epoch 8 |  iter 221 / 351 | time 13[s] | loss 0.57
    | epoch 8 |  iter 241 / 351 | time 14[s] | loss 0.57
    | epoch 8 |  iter 261 / 351 | time 15[s] | loss 0.57
    | epoch 8 |  iter 281 / 351 | time 16[s] | loss 0.57
    | epoch 8 |  iter 301 / 351 | time 17[s] | loss 0.57
    | epoch 8 |  iter 321 / 351 | time 19[s] | loss 0.55
    | epoch 8 |  iter 341 / 351 | time 20[s] | loss 0.55
    Q   58+77
    T 162 
    ☒ 160 
    ---
    Q 461+579
    T 1139
    ☒ 1134
    ---
    Q  48+285
    T 666 
    ☒ 668 
    ---
    Q   551+8
    T 163 
    ☒ 160 
    ---
    Q  55+763
    T 422 
    ☒ 420 
    ---
    Q 752+006
    T 857 
    ☒ 858 
    ---
    Q 292+167
    T 1053
    ☑ 1053
    ---
    Q 795+038
    T 1427
    ☒ 1428
    ---
    Q  838+62
    T 864 
    ☒ 865 
    ---
    Q  39+341
    T 236 
    ☑ 236 
    ---
    val acc 22.620%
    | epoch 9 |  iter 1 / 351 | time 0[s] | loss 0.53
    | epoch 9 |  iter 21 / 351 | time 1[s] | loss 0.54
    | epoch 9 |  iter 41 / 351 | time 2[s] | loss 0.54
    | epoch 9 |  iter 61 / 351 | time 3[s] | loss 0.55
    | epoch 9 |  iter 81 / 351 | time 4[s] | loss 0.55
    | epoch 9 |  iter 101 / 351 | time 6[s] | loss 0.54
    | epoch 9 |  iter 121 / 351 | time 7[s] | loss 0.54
    | epoch 9 |  iter 141 / 351 | time 8[s] | loss 0.55
    | epoch 9 |  iter 161 / 351 | time 9[s] | loss 0.55
    | epoch 9 |  iter 181 / 351 | time 10[s] | loss 0.54
    | epoch 9 |  iter 201 / 351 | time 12[s] | loss 0.53
    | epoch 9 |  iter 221 / 351 | time 13[s] | loss 0.53
    | epoch 9 |  iter 241 / 351 | time 14[s] | loss 0.53
    | epoch 9 |  iter 261 / 351 | time 15[s] | loss 0.53
    | epoch 9 |  iter 281 / 351 | time 16[s] | loss 0.54
    | epoch 9 |  iter 301 / 351 | time 17[s] | loss 0.54
    | epoch 9 |  iter 321 / 351 | time 19[s] | loss 0.53
    | epoch 9 |  iter 341 / 351 | time 20[s] | loss 0.52
    Q   58+77
    T 162 
    ☒ 160 
    ---
    Q 461+579
    T 1139
    ☒ 1138
    ---
    Q  48+285
    T 666 
    ☒ 667 
    ---
    Q   551+8
    T 163 
    ☒ 160 
    ---
    Q  55+763
    T 422 
    ☑ 422 
    ---
    Q 752+006
    T 857 
    ☒ 856 
    ---
    Q 292+167
    T 1053
    ☒ 1152
    ---
    Q 795+038
    T 1427
    ☒ 1428
    ---
    Q  838+62
    T 864 
    ☑ 864 
    ---
    Q  39+341
    T 236 
    ☒ 238 
    ---
    val acc 24.120%
    | epoch 10 |  iter 1 / 351 | time 0[s] | loss 0.51
    | epoch 10 |  iter 21 / 351 | time 1[s] | loss 0.52
    | epoch 10 |  iter 41 / 351 | time 2[s] | loss 0.52
    | epoch 10 |  iter 61 / 351 | time 3[s] | loss 0.51
    | epoch 10 |  iter 81 / 351 | time 4[s] | loss 0.50
    | epoch 10 |  iter 101 / 351 | time 5[s] | loss 0.51
    | epoch 10 |  iter 121 / 351 | time 7[s] | loss 0.51
    | epoch 10 |  iter 141 / 351 | time 8[s] | loss 0.51
    | epoch 10 |  iter 161 / 351 | time 9[s] | loss 0.50
    | epoch 10 |  iter 181 / 351 | time 10[s] | loss 0.51
    | epoch 10 |  iter 201 / 351 | time 11[s] | loss 0.51
    | epoch 10 |  iter 221 / 351 | time 13[s] | loss 0.50
    | epoch 10 |  iter 241 / 351 | time 14[s] | loss 0.49
    | epoch 10 |  iter 261 / 351 | time 15[s] | loss 0.49
    | epoch 10 |  iter 281 / 351 | time 16[s] | loss 0.50
    | epoch 10 |  iter 301 / 351 | time 17[s] | loss 0.50
    | epoch 10 |  iter 321 / 351 | time 18[s] | loss 0.49
    | epoch 10 |  iter 341 / 351 | time 20[s] | loss 0.49
    Q   58+77
    T 162 
    ☒ 160 
    ---
    Q 461+579
    T 1139
    ☒ 1137
    ---
    Q  48+285
    T 666 
    ☒ 667 
    ---
    Q   551+8
    T 163 
    ☒ 160 
    ---
    Q  55+763
    T 422 
    ☒ 420 
    ---
    Q 752+006
    T 857 
    ☑ 857 
    ---
    Q 292+167
    T 1053
    ☑ 1053
    ---
    Q 795+038
    T 1427
    ☒ 1429
    ---
    Q  838+62
    T 864 
    ☑ 864 
    ---
    Q  39+341
    T 236 
    ☒ 235 
    ---
    val acc 29.780%
    | epoch 11 |  iter 1 / 351 | time 0[s] | loss 0.46
    | epoch 11 |  iter 21 / 351 | time 1[s] | loss 0.47
    | epoch 11 |  iter 41 / 351 | time 2[s] | loss 0.48
    | epoch 11 |  iter 61 / 351 | time 3[s] | loss 0.49
    | epoch 11 |  iter 81 / 351 | time 4[s] | loss 0.48
    | epoch 11 |  iter 101 / 351 | time 5[s] | loss 0.48
    | epoch 11 |  iter 121 / 351 | time 7[s] | loss 0.48
    | epoch 11 |  iter 141 / 351 | time 8[s] | loss 0.48
    | epoch 11 |  iter 161 / 351 | time 9[s] | loss 0.48
    | epoch 11 |  iter 181 / 351 | time 10[s] | loss 0.48
    | epoch 11 |  iter 201 / 351 | time 11[s] | loss 0.48
    | epoch 11 |  iter 221 / 351 | time 13[s] | loss 0.47
    | epoch 11 |  iter 241 / 351 | time 14[s] | loss 0.46
    | epoch 11 |  iter 261 / 351 | time 15[s] | loss 0.47
    | epoch 11 |  iter 281 / 351 | time 16[s] | loss 0.46
    | epoch 11 |  iter 301 / 351 | time 17[s] | loss 0.47
    | epoch 11 |  iter 321 / 351 | time 18[s] | loss 0.46
    | epoch 11 |  iter 341 / 351 | time 20[s] | loss 0.46
    Q   58+77
    T 162 
    ☒ 160 
    ---
    Q 461+579
    T 1139
    ☒ 1138
    ---
    Q  48+285
    T 666 
    ☑ 666 
    ---
    Q   551+8
    T 163 
    ☒ 160 
    ---
    Q  55+763
    T 422 
    ☒ 420 
    ---
    Q 752+006
    T 857 
    ☒ 858 
    ---
    Q 292+167
    T 1053
    ☑ 1053
    ---
    Q 795+038
    T 1427
    ☒ 1428
    ---
    Q  838+62
    T 864 
    ☒ 866 
    ---
    Q  39+341
    T 236 
    ☒ 233 
    ---
    val acc 26.080%
    | epoch 12 |  iter 1 / 351 | time 0[s] | loss 0.47
    | epoch 12 |  iter 21 / 351 | time 1[s] | loss 0.46
    | epoch 12 |  iter 41 / 351 | time 2[s] | loss 0.46
    | epoch 12 |  iter 61 / 351 | time 3[s] | loss 0.45
    | epoch 12 |  iter 81 / 351 | time 4[s] | loss 0.45
    | epoch 12 |  iter 101 / 351 | time 5[s] | loss 0.46
    | epoch 12 |  iter 121 / 351 | time 7[s] | loss 0.45
    | epoch 12 |  iter 141 / 351 | time 8[s] | loss 0.45
    | epoch 12 |  iter 161 / 351 | time 9[s] | loss 0.45
    | epoch 12 |  iter 181 / 351 | time 10[s] | loss 0.45
    | epoch 12 |  iter 201 / 351 | time 11[s] | loss 0.45
    | epoch 12 |  iter 221 / 351 | time 13[s] | loss 0.45
    | epoch 12 |  iter 241 / 351 | time 14[s] | loss 0.48
    | epoch 12 |  iter 261 / 351 | time 15[s] | loss 0.47
    | epoch 12 |  iter 281 / 351 | time 16[s] | loss 0.45
    | epoch 12 |  iter 301 / 351 | time 17[s] | loss 0.44
    | epoch 12 |  iter 321 / 351 | time 18[s] | loss 0.43
    | epoch 12 |  iter 341 / 351 | time 20[s] | loss 0.43
    Q   58+77
    T 162 
    ☒ 161 
    ---
    Q 461+579
    T 1139
    ☑ 1139
    ---
    Q  48+285
    T 666 
    ☒ 667 
    ---
    Q   551+8
    T 163 
    ☒ 160 
    ---
    Q  55+763
    T 422 
    ☒ 420 
    ---
    Q 752+006
    T 857 
    ☑ 857 
    ---
    Q 292+167
    T 1053
    ☑ 1053
    ---
    Q 795+038
    T 1427
    ☒ 1428
    ---
    Q  838+62
    T 864 
    ☒ 865 
    ---
    Q  39+341
    T 236 
    ☒ 235 
    ---
    val acc 28.100%
    | epoch 13 |  iter 1 / 351 | time 0[s] | loss 0.46
    | epoch 13 |  iter 21 / 351 | time 1[s] | loss 0.43
    | epoch 13 |  iter 41 / 351 | time 2[s] | loss 0.43
    | epoch 13 |  iter 61 / 351 | time 3[s] | loss 0.43
    | epoch 13 |  iter 81 / 351 | time 4[s] | loss 0.43
    | epoch 13 |  iter 101 / 351 | time 5[s] | loss 0.45
    | epoch 13 |  iter 121 / 351 | time 7[s] | loss 0.44
    | epoch 13 |  iter 141 / 351 | time 8[s] | loss 0.44
    | epoch 13 |  iter 161 / 351 | time 9[s] | loss 0.44
    | epoch 13 |  iter 181 / 351 | time 10[s] | loss 0.43
    | epoch 13 |  iter 201 / 351 | time 11[s] | loss 0.42
    | epoch 13 |  iter 221 / 351 | time 13[s] | loss 0.42
    | epoch 13 |  iter 241 / 351 | time 14[s] | loss 0.42
    | epoch 13 |  iter 261 / 351 | time 15[s] | loss 0.43
    | epoch 13 |  iter 281 / 351 | time 16[s] | loss 0.43
    | epoch 13 |  iter 301 / 351 | time 17[s] | loss 0.43
    | epoch 13 |  iter 321 / 351 | time 18[s] | loss 0.44
    | epoch 13 |  iter 341 / 351 | time 20[s] | loss 0.43
    Q   58+77
    T 162 
    ☒ 160 
    ---
    Q 461+579
    T 1139
    ☒ 1141
    ---
    Q  48+285
    T 666 
    ☑ 666 
    ---
    Q   551+8
    T 163 
    ☒ 160 
    ---
    Q  55+763
    T 422 
    ☒ 424 
    ---
    Q 752+006
    T 857 
    ☒ 859 
    ---
    Q 292+167
    T 1053
    ☑ 1053
    ---
    Q 795+038
    T 1427
    ☒ 1429
    ---
    Q  838+62
    T 864 
    ☒ 865 
    ---
    Q  39+341
    T 236 
    ☒ 237 
    ---
    val acc 33.320%
    | epoch 14 |  iter 1 / 351 | time 0[s] | loss 0.44
    | epoch 14 |  iter 21 / 351 | time 1[s] | loss 0.42
    | epoch 14 |  iter 41 / 351 | time 2[s] | loss 0.42
    | epoch 14 |  iter 61 / 351 | time 3[s] | loss 0.43
    | epoch 14 |  iter 81 / 351 | time 4[s] | loss 0.43
    | epoch 14 |  iter 101 / 351 | time 5[s] | loss 0.41
    | epoch 14 |  iter 121 / 351 | time 7[s] | loss 0.40
    | epoch 14 |  iter 141 / 351 | time 8[s] | loss 0.41
    | epoch 14 |  iter 161 / 351 | time 9[s] | loss 0.41
    | epoch 14 |  iter 181 / 351 | time 10[s] | loss 0.42
    | epoch 14 |  iter 201 / 351 | time 11[s] | loss 0.44
    | epoch 14 |  iter 221 / 351 | time 13[s] | loss 0.43
    | epoch 14 |  iter 241 / 351 | time 14[s] | loss 0.42
    | epoch 14 |  iter 261 / 351 | time 15[s] | loss 0.41
    | epoch 14 |  iter 281 / 351 | time 16[s] | loss 0.40
    | epoch 14 |  iter 301 / 351 | time 17[s] | loss 0.40
    | epoch 14 |  iter 321 / 351 | time 18[s] | loss 0.40
    | epoch 14 |  iter 341 / 351 | time 20[s] | loss 0.40
    Q   58+77
    T 162 
    ☒ 163 
    ---
    Q 461+579
    T 1139
    ☒ 1138
    ---
    Q  48+285
    T 666 
    ☒ 667 
    ---
    Q   551+8
    T 163 
    ☑ 163 
    ---
    Q  55+763
    T 422 
    ☑ 422 
    ---
    Q 752+006
    T 857 
    ☒ 856 
    ---
    Q 292+167
    T 1053
    ☒ 1052
    ---
    Q 795+038
    T 1427
    ☒ 1426
    ---
    Q  838+62
    T 864 
    ☒ 862 
    ---
    Q  39+341
    T 236 
    ☒ 235 
    ---
    val acc 35.180%
    | epoch 15 |  iter 1 / 351 | time 0[s] | loss 0.40
    | epoch 15 |  iter 21 / 351 | time 1[s] | loss 0.40
    | epoch 15 |  iter 41 / 351 | time 2[s] | loss 0.42
    | epoch 15 |  iter 61 / 351 | time 3[s] | loss 0.41
    | epoch 15 |  iter 81 / 351 | time 4[s] | loss 0.40
    | epoch 15 |  iter 101 / 351 | time 5[s] | loss 0.40
    | epoch 15 |  iter 121 / 351 | time 7[s] | loss 0.39
    | epoch 15 |  iter 141 / 351 | time 8[s] | loss 0.39
    | epoch 15 |  iter 161 / 351 | time 9[s] | loss 0.40
    | epoch 15 |  iter 181 / 351 | time 10[s] | loss 0.41
    | epoch 15 |  iter 201 / 351 | time 11[s] | loss 0.41
    | epoch 15 |  iter 221 / 351 | time 13[s] | loss 0.39
    | epoch 15 |  iter 241 / 351 | time 14[s] | loss 0.39
    | epoch 15 |  iter 261 / 351 | time 15[s] | loss 0.40
    | epoch 15 |  iter 281 / 351 | time 16[s] | loss 0.41
    | epoch 15 |  iter 301 / 351 | time 17[s] | loss 0.39
    | epoch 15 |  iter 321 / 351 | time 18[s] | loss 0.39
    | epoch 15 |  iter 341 / 351 | time 20[s] | loss 0.38
    Q   58+77
    T 162 
    ☒ 163 
    ---
    Q 461+579
    T 1139
    ☒ 1138
    ---
    Q  48+285
    T 666 
    ☒ 667 
    ---
    Q   551+8
    T 163 
    ☒ 164 
    ---
    Q  55+763
    T 422 
    ☑ 422 
    ---
    Q 752+006
    T 857 
    ☑ 857 
    ---
    Q 292+167
    T 1053
    ☒ 1054
    ---
    Q 795+038
    T 1427
    ☒ 1428
    ---
    Q  838+62
    T 864 
    ☒ 866 
    ---
    Q  39+341
    T 236 
    ☑ 236 
    ---
    val acc 39.040%
    | epoch 16 |  iter 1 / 351 | time 0[s] | loss 0.39
    | epoch 16 |  iter 21 / 351 | time 1[s] | loss 0.37
    | epoch 16 |  iter 41 / 351 | time 2[s] | loss 0.37
    | epoch 16 |  iter 61 / 351 | time 3[s] | loss 0.38
    | epoch 16 |  iter 81 / 351 | time 4[s] | loss 0.39
    | epoch 16 |  iter 101 / 351 | time 5[s] | loss 0.38
    | epoch 16 |  iter 121 / 351 | time 7[s] | loss 0.38
    | epoch 16 |  iter 141 / 351 | time 8[s] | loss 0.37
    | epoch 16 |  iter 161 / 351 | time 9[s] | loss 0.40
    | epoch 16 |  iter 181 / 351 | time 10[s] | loss 0.39
    | epoch 16 |  iter 201 / 351 | time 11[s] | loss 0.38
    | epoch 16 |  iter 221 / 351 | time 12[s] | loss 0.41
    | epoch 16 |  iter 241 / 351 | time 14[s] | loss 0.41
    | epoch 16 |  iter 261 / 351 | time 15[s] | loss 0.40
    | epoch 16 |  iter 281 / 351 | time 16[s] | loss 0.40
    | epoch 16 |  iter 301 / 351 | time 17[s] | loss 0.38
    | epoch 16 |  iter 321 / 351 | time 18[s] | loss 0.38
    | epoch 16 |  iter 341 / 351 | time 19[s] | loss 0.37
    Q   58+77
    T 162 
    ☒ 163 
    ---
    Q 461+579
    T 1139
    ☒ 1138
    ---
    Q  48+285
    T 666 
    ☒ 667 
    ---
    Q   551+8
    T 163 
    ☑ 163 
    ---
    Q  55+763
    T 422 
    ☑ 422 
    ---
    Q 752+006
    T 857 
    ☒ 858 
    ---
    Q 292+167
    T 1053
    ☒ 1054
    ---
    Q 795+038
    T 1427
    ☒ 1428
    ---
    Q  838+62
    T 864 
    ☒ 865 
    ---
    Q  39+341
    T 236 
    ☒ 237 
    ---
    val acc 43.220%
    | epoch 17 |  iter 1 / 351 | time 0[s] | loss 0.35
    | epoch 17 |  iter 21 / 351 | time 1[s] | loss 0.36
    | epoch 17 |  iter 41 / 351 | time 2[s] | loss 0.37
    | epoch 17 |  iter 61 / 351 | time 3[s] | loss 0.36
    | epoch 17 |  iter 81 / 351 | time 4[s] | loss 0.36
    | epoch 17 |  iter 101 / 351 | time 5[s] | loss 0.37
    | epoch 17 |  iter 121 / 351 | time 7[s] | loss 0.37
    | epoch 17 |  iter 141 / 351 | time 8[s] | loss 0.37
    | epoch 17 |  iter 161 / 351 | time 9[s] | loss 0.37
    | epoch 17 |  iter 181 / 351 | time 10[s] | loss 0.38
    | epoch 17 |  iter 201 / 351 | time 11[s] | loss 0.38
    | epoch 17 |  iter 221 / 351 | time 13[s] | loss 0.37
    | epoch 17 |  iter 241 / 351 | time 14[s] | loss 0.37
    | epoch 17 |  iter 261 / 351 | time 15[s] | loss 0.37
    | epoch 17 |  iter 281 / 351 | time 16[s] | loss 0.37
    | epoch 17 |  iter 301 / 351 | time 17[s] | loss 0.37
    | epoch 17 |  iter 321 / 351 | time 19[s] | loss 0.37
    | epoch 17 |  iter 341 / 351 | time 20[s] | loss 0.37
    Q   58+77
    T 162 
    ☑ 162 
    ---
    Q 461+579
    T 1139
    ☑ 1139
    ---
    Q  48+285
    T 666 
    ☑ 666 
    ---
    Q   551+8
    T 163 
    ☒ 164 
    ---
    Q  55+763
    T 422 
    ☑ 422 
    ---
    Q 752+006
    T 857 
    ☒ 856 
    ---
    Q 292+167
    T 1053
    ☒ 1052
    ---
    Q 795+038
    T 1427
    ☑ 1427
    ---
    Q  838+62
    T 864 
    ☑ 864 
    ---
    Q  39+341
    T 236 
    ☑ 236 
    ---
    val acc 41.100%
    | epoch 18 |  iter 1 / 351 | time 0[s] | loss 0.36
    | epoch 18 |  iter 21 / 351 | time 1[s] | loss 0.37
    | epoch 18 |  iter 41 / 351 | time 2[s] | loss 0.37
    | epoch 18 |  iter 61 / 351 | time 3[s] | loss 0.37
    | epoch 18 |  iter 81 / 351 | time 4[s] | loss 0.35
    | epoch 18 |  iter 101 / 351 | time 5[s] | loss 0.35
    | epoch 18 |  iter 121 / 351 | time 7[s] | loss 0.37
    | epoch 18 |  iter 141 / 351 | time 8[s] | loss 0.36
    | epoch 18 |  iter 161 / 351 | time 9[s] | loss 0.35
    | epoch 18 |  iter 181 / 351 | time 10[s] | loss 0.36
    | epoch 18 |  iter 201 / 351 | time 11[s] | loss 0.37
    | epoch 18 |  iter 221 / 351 | time 13[s] | loss 0.38
    | epoch 18 |  iter 241 / 351 | time 14[s] | loss 0.38
    | epoch 18 |  iter 261 / 351 | time 15[s] | loss 0.36
    | epoch 18 |  iter 281 / 351 | time 16[s] | loss 0.36
    | epoch 18 |  iter 301 / 351 | time 17[s] | loss 0.36
    | epoch 18 |  iter 321 / 351 | time 18[s] | loss 0.35
    | epoch 18 |  iter 341 / 351 | time 20[s] | loss 0.34
    Q   58+77
    T 162 
    ☒ 163 
    ---
    Q 461+579
    T 1139
    ☒ 1141
    ---
    Q  48+285
    T 666 
    ☒ 667 
    ---
    Q   551+8
    T 163 
    ☑ 163 
    ---
    Q  55+763
    T 422 
    ☑ 422 
    ---
    Q 752+006
    T 857 
    ☒ 858 
    ---
    Q 292+167
    T 1053
    ☑ 1053
    ---
    Q 795+038
    T 1427
    ☑ 1427
    ---
    Q  838+62
    T 864 
    ☑ 864 
    ---
    Q  39+341
    T 236 
    ☑ 236 
    ---
    val acc 38.580%
    | epoch 19 |  iter 1 / 351 | time 0[s] | loss 0.36
    | epoch 19 |  iter 21 / 351 | time 1[s] | loss 0.35
    | epoch 19 |  iter 41 / 351 | time 2[s] | loss 0.35
    | epoch 19 |  iter 61 / 351 | time 3[s] | loss 0.35
    | epoch 19 |  iter 81 / 351 | time 4[s] | loss 0.35
    | epoch 19 |  iter 101 / 351 | time 6[s] | loss 0.34
    | epoch 19 |  iter 121 / 351 | time 7[s] | loss 0.35
    | epoch 19 |  iter 141 / 351 | time 8[s] | loss 0.35
    | epoch 19 |  iter 161 / 351 | time 9[s] | loss 0.35
    | epoch 19 |  iter 181 / 351 | time 10[s] | loss 0.35
    | epoch 19 |  iter 201 / 351 | time 11[s] | loss 0.34
    | epoch 19 |  iter 221 / 351 | time 13[s] | loss 0.35
    | epoch 19 |  iter 241 / 351 | time 14[s] | loss 0.36
    | epoch 19 |  iter 261 / 351 | time 15[s] | loss 0.37
    | epoch 19 |  iter 281 / 351 | time 16[s] | loss 0.36
    | epoch 19 |  iter 301 / 351 | time 17[s] | loss 0.35
    | epoch 19 |  iter 321 / 351 | time 19[s] | loss 0.35
    | epoch 19 |  iter 341 / 351 | time 20[s] | loss 0.35
    Q   58+77
    T 162 
    ☑ 162 
    ---
    Q 461+579
    T 1139
    ☒ 1138
    ---
    Q  48+285
    T 666 
    ☒ 667 
    ---
    Q   551+8
    T 163 
    ☒ 164 
    ---
    Q  55+763
    T 422 
    ☑ 422 
    ---
    Q 752+006
    T 857 
    ☑ 857 
    ---
    Q 292+167
    T 1053
    ☒ 1052
    ---
    Q 795+038
    T 1427
    ☑ 1427
    ---
    Q  838+62
    T 864 
    ☒ 862 
    ---
    Q  39+341
    T 236 
    ☒ 235 
    ---
    val acc 46.720%
    | epoch 20 |  iter 1 / 351 | time 0[s] | loss 0.32
    | epoch 20 |  iter 21 / 351 | time 1[s] | loss 0.34
    | epoch 20 |  iter 41 / 351 | time 2[s] | loss 0.36
    | epoch 20 |  iter 61 / 351 | time 3[s] | loss 0.36
    | epoch 20 |  iter 81 / 351 | time 4[s] | loss 0.35
    | epoch 20 |  iter 101 / 351 | time 5[s] | loss 0.35
    | epoch 20 |  iter 121 / 351 | time 7[s] | loss 0.36
    | epoch 20 |  iter 141 / 351 | time 8[s] | loss 0.35
    | epoch 20 |  iter 161 / 351 | time 9[s] | loss 0.34
    | epoch 20 |  iter 181 / 351 | time 10[s] | loss 0.34
    | epoch 20 |  iter 201 / 351 | time 11[s] | loss 0.33
    | epoch 20 |  iter 221 / 351 | time 13[s] | loss 0.33
    | epoch 20 |  iter 241 / 351 | time 14[s] | loss 0.33
    | epoch 20 |  iter 261 / 351 | time 15[s] | loss 0.34
    | epoch 20 |  iter 281 / 351 | time 16[s] | loss 0.34
    | epoch 20 |  iter 301 / 351 | time 17[s] | loss 0.34
    | epoch 20 |  iter 321 / 351 | time 18[s] | loss 0.34
    | epoch 20 |  iter 341 / 351 | time 20[s] | loss 0.34
    Q   58+77
    T 162 
    ☑ 162 
    ---
    Q 461+579
    T 1139
    ☒ 1141
    ---
    Q  48+285
    T 666 
    ☑ 666 
    ---
    Q   551+8
    T 163 
    ☒ 162 
    ---
    Q  55+763
    T 422 
    ☑ 422 
    ---
    Q 752+006
    T 857 
    ☒ 856 
    ---
    Q 292+167
    T 1053
    ☒ 1054
    ---
    Q 795+038
    T 1427
    ☑ 1427
    ---
    Q  838+62
    T 864 
    ☒ 865 
    ---
    Q  39+341
    T 236 
    ☑ 236 
    ---
    val acc 48.480%
    | epoch 21 |  iter 1 / 351 | time 0[s] | loss 0.32
    | epoch 21 |  iter 21 / 351 | time 1[s] | loss 0.33
    | epoch 21 |  iter 41 / 351 | time 2[s] | loss 0.34
    | epoch 21 |  iter 61 / 351 | time 3[s] | loss 0.34
    | epoch 21 |  iter 81 / 351 | time 4[s] | loss 0.33
    | epoch 21 |  iter 101 / 351 | time 6[s] | loss 0.33
    | epoch 21 |  iter 121 / 351 | time 7[s] | loss 0.33
    | epoch 21 |  iter 141 / 351 | time 8[s] | loss 0.33
    | epoch 21 |  iter 161 / 351 | time 9[s] | loss 0.33
    | epoch 21 |  iter 181 / 351 | time 10[s] | loss 0.33
    | epoch 21 |  iter 201 / 351 | time 11[s] | loss 0.32
    | epoch 21 |  iter 221 / 351 | time 13[s] | loss 0.33
    | epoch 21 |  iter 241 / 351 | time 14[s] | loss 0.33
    | epoch 21 |  iter 261 / 351 | time 15[s] | loss 0.33
    | epoch 21 |  iter 281 / 351 | time 16[s] | loss 0.32
    | epoch 21 |  iter 301 / 351 | time 17[s] | loss 0.32
    | epoch 21 |  iter 321 / 351 | time 19[s] | loss 0.33
    | epoch 21 |  iter 341 / 351 | time 20[s] | loss 0.33
    Q   58+77
    T 162 
    ☒ 163 
    ---
    Q 461+579
    T 1139
    ☒ 1140
    ---
    Q  48+285
    T 666 
    ☒ 665 
    ---
    Q   551+8
    T 163 
    ☒ 164 
    ---
    Q  55+763
    T 422 
    ☑ 422 
    ---
    Q 752+006
    T 857 
    ☑ 857 
    ---
    Q 292+167
    T 1053
    ☑ 1053
    ---
    Q 795+038
    T 1427
    ☒ 1429
    ---
    Q  838+62
    T 864 
    ☒ 865 
    ---
    Q  39+341
    T 236 
    ☑ 236 
    ---
    val acc 44.940%
    | epoch 22 |  iter 1 / 351 | time 0[s] | loss 0.31
    | epoch 22 |  iter 21 / 351 | time 1[s] | loss 0.32
    | epoch 22 |  iter 41 / 351 | time 2[s] | loss 0.32
    | epoch 22 |  iter 61 / 351 | time 3[s] | loss 0.34
    | epoch 22 |  iter 81 / 351 | time 4[s] | loss 0.32
    | epoch 22 |  iter 101 / 351 | time 6[s] | loss 0.33
    | epoch 22 |  iter 121 / 351 | time 7[s] | loss 0.33
    | epoch 22 |  iter 141 / 351 | time 8[s] | loss 0.34
    | epoch 22 |  iter 161 / 351 | time 9[s] | loss 0.34
    | epoch 22 |  iter 181 / 351 | time 10[s] | loss 0.34
    | epoch 22 |  iter 201 / 351 | time 11[s] | loss 0.32
    | epoch 22 |  iter 221 / 351 | time 13[s] | loss 0.31
    | epoch 22 |  iter 241 / 351 | time 14[s] | loss 0.32
    | epoch 22 |  iter 261 / 351 | time 15[s] | loss 0.31
    | epoch 22 |  iter 281 / 351 | time 16[s] | loss 0.32
    | epoch 22 |  iter 301 / 351 | time 17[s] | loss 0.33
    | epoch 22 |  iter 321 / 351 | time 19[s] | loss 0.33
    | epoch 22 |  iter 341 / 351 | time 20[s] | loss 0.33
    Q   58+77
    T 162 
    ☑ 162 
    ---
    Q 461+579
    T 1139
    ☑ 1139
    ---
    Q  48+285
    T 666 
    ☒ 667 
    ---
    Q   551+8
    T 163 
    ☒ 162 
    ---
    Q  55+763
    T 422 
    ☑ 422 
    ---
    Q 752+006
    T 857 
    ☑ 857 
    ---
    Q 292+167
    T 1053
    ☒ 1052
    ---
    Q 795+038
    T 1427
    ☑ 1427
    ---
    Q  838+62
    T 864 
    ☒ 865 
    ---
    Q  39+341
    T 236 
    ☒ 235 
    ---
    val acc 44.800%
    | epoch 23 |  iter 1 / 351 | time 0[s] | loss 0.32
    | epoch 23 |  iter 21 / 351 | time 1[s] | loss 0.31
    | epoch 23 |  iter 41 / 351 | time 2[s] | loss 0.32
    | epoch 23 |  iter 61 / 351 | time 3[s] | loss 0.31
    | epoch 23 |  iter 81 / 351 | time 4[s] | loss 0.31
    | epoch 23 |  iter 101 / 351 | time 5[s] | loss 0.32
    | epoch 23 |  iter 121 / 351 | time 7[s] | loss 0.32
    | epoch 23 |  iter 141 / 351 | time 8[s] | loss 0.33
    | epoch 23 |  iter 161 / 351 | time 9[s] | loss 0.32
    | epoch 23 |  iter 181 / 351 | time 10[s] | loss 0.32
    | epoch 23 |  iter 201 / 351 | time 11[s] | loss 0.33
    | epoch 23 |  iter 221 / 351 | time 13[s] | loss 0.33
    | epoch 23 |  iter 241 / 351 | time 14[s] | loss 0.33
    | epoch 23 |  iter 261 / 351 | time 15[s] | loss 0.32
    | epoch 23 |  iter 281 / 351 | time 16[s] | loss 0.32
    | epoch 23 |  iter 301 / 351 | time 17[s] | loss 0.31
    | epoch 23 |  iter 321 / 351 | time 19[s] | loss 0.31
    | epoch 23 |  iter 341 / 351 | time 20[s] | loss 0.31
    Q   58+77
    T 162 
    ☑ 162 
    ---
    Q 461+579
    T 1139
    ☒ 1140
    ---
    Q  48+285
    T 666 
    ☑ 666 
    ---
    Q   551+8
    T 163 
    ☒ 162 
    ---
    Q  55+763
    T 422 
    ☑ 422 
    ---
    Q 752+006
    T 857 
    ☒ 858 
    ---
    Q 292+167
    T 1053
    ☑ 1053
    ---
    Q 795+038
    T 1427
    ☒ 1426
    ---
    Q  838+62
    T 864 
    ☑ 864 
    ---
    Q  39+341
    T 236 
    ☒ 235 
    ---
    val acc 41.060%
    | epoch 24 |  iter 1 / 351 | time 0[s] | loss 0.34
    | epoch 24 |  iter 21 / 351 | time 1[s] | loss 0.32
    | epoch 24 |  iter 41 / 351 | time 2[s] | loss 0.30
    | epoch 24 |  iter 61 / 351 | time 3[s] | loss 0.30
    | epoch 24 |  iter 81 / 351 | time 4[s] | loss 0.30
    | epoch 24 |  iter 101 / 351 | time 5[s] | loss 0.30
    | epoch 24 |  iter 121 / 351 | time 7[s] | loss 0.31
    | epoch 24 |  iter 141 / 351 | time 8[s] | loss 0.32
    | epoch 24 |  iter 161 / 351 | time 9[s] | loss 0.32
    | epoch 24 |  iter 181 / 351 | time 10[s] | loss 0.32
    | epoch 24 |  iter 201 / 351 | time 11[s] | loss 0.31
    | epoch 24 |  iter 221 / 351 | time 13[s] | loss 0.32
    | epoch 24 |  iter 241 / 351 | time 14[s] | loss 0.32
    | epoch 24 |  iter 261 / 351 | time 15[s] | loss 0.31
    | epoch 24 |  iter 281 / 351 | time 16[s] | loss 0.31
    | epoch 24 |  iter 301 / 351 | time 17[s] | loss 0.31
    | epoch 24 |  iter 321 / 351 | time 18[s] | loss 0.30
    | epoch 24 |  iter 341 / 351 | time 20[s] | loss 0.30
    Q   58+77
    T 162 
    ☒ 163 
    ---
    Q 461+579
    T 1139
    ☒ 1140
    ---
    Q  48+285
    T 666 
    ☒ 665 
    ---
    Q   551+8
    T 163 
    ☑ 163 
    ---
    Q  55+763
    T 422 
    ☑ 422 
    ---
    Q 752+006
    T 857 
    ☑ 857 
    ---
    Q 292+167
    T 1053
    ☒ 1054
    ---
    Q 795+038
    T 1427
    ☑ 1427
    ---
    Q  838+62
    T 864 
    ☑ 864 
    ---
    Q  39+341
    T 236 
    ☑ 236 
    ---
    val acc 45.180%
    | epoch 25 |  iter 1 / 351 | time 0[s] | loss 0.34
    | epoch 25 |  iter 21 / 351 | time 1[s] | loss 0.29
    | epoch 25 |  iter 41 / 351 | time 2[s] | loss 0.30
    | epoch 25 |  iter 61 / 351 | time 3[s] | loss 0.30
    | epoch 25 |  iter 81 / 351 | time 4[s] | loss 0.30
    | epoch 25 |  iter 101 / 351 | time 6[s] | loss 0.29
    | epoch 25 |  iter 121 / 351 | time 7[s] | loss 0.31
    | epoch 25 |  iter 141 / 351 | time 8[s] | loss 0.32
    | epoch 25 |  iter 161 / 351 | time 9[s] | loss 0.32
    | epoch 25 |  iter 181 / 351 | time 10[s] | loss 0.31
    | epoch 25 |  iter 201 / 351 | time 11[s] | loss 0.32
    | epoch 25 |  iter 221 / 351 | time 13[s] | loss 0.30
    | epoch 25 |  iter 241 / 351 | time 14[s] | loss 0.29
    | epoch 25 |  iter 261 / 351 | time 15[s] | loss 0.30
    | epoch 25 |  iter 281 / 351 | time 16[s] | loss 0.30
    | epoch 25 |  iter 301 / 351 | time 17[s] | loss 0.30
    | epoch 25 |  iter 321 / 351 | time 19[s] | loss 0.30
    | epoch 25 |  iter 341 / 351 | time 20[s] | loss 0.30
    Q   58+77
    T 162 
    ☑ 162 
    ---
    Q 461+579
    T 1139
    ☒ 1141
    ---
    Q  48+285
    T 666 
    ☑ 666 
    ---
    Q   551+8
    T 163 
    ☒ 162 
    ---
    Q  55+763
    T 422 
    ☑ 422 
    ---
    Q 752+006
    T 857 
    ☒ 858 
    ---
    Q 292+167
    T 1053
    ☒ 1054
    ---
    Q 795+038
    T 1427
    ☑ 1427
    ---
    Q  838+62
    T 864 
    ☒ 862 
    ---
    Q  39+341
    T 236 
    ☑ 236 
    ---
    val acc 51.620%
    

    %python3
    # Plot the per-epoch validation accuracy of both runs on shared axes.
    for history, name in ((acc_list_baseline, 'baseline'),
                          (acc_list_reversed, 'reversed input')):
        plt.plot(history, label=name)
    plt.legend()
    plt.ylim(0, 1)  # accuracy is stored as a fraction, so pin the y-axis to [0, 1]
    plt.show()

    • 入力を反転させるだけで正解率が倍くらい上がる
    • 1の位(最後の桁)を間違えやすかったことと関係がありそう
    • 学習にかかった時間は12分30秒ほど(CPUのみ)

    Peeky

    覗き見の実装

    %python3
    class PeekyDecoder:
        """Decoder that feeds the given state ``h`` into every time step.

        Besides initialising the LSTM state with ``h``, the same ``h`` is
        concatenated in front of the LSTM input and in front of the Affine
        input at each step ("peeky" connections).
        """

        def __init__(self, vocab_size, wordvec_size, hidden_size):
            """Create the Embedding -> LSTM -> Affine stack and its weights.

            The LSTM input width is ``hidden_size + wordvec_size`` and the
            Affine input width is ``2 * hidden_size`` because ``h`` is
            concatenated to both inputs. All weights are stored as float32.
            """
            randn = np.random.randn
            n_vocab, n_embed, n_hidden = vocab_size, wordvec_size, hidden_size
            lstm_in = n_hidden + n_embed
            affine_in = n_hidden + n_hidden

            # Random draws happen in this order: embed, LSTM Wx, LSTM Wh, affine.
            w_embed = (randn(n_vocab, n_embed) / 100).astype('f')
            w_lstm_x = (randn(lstm_in, 4 * n_hidden) / np.sqrt(lstm_in)).astype('f')
            w_lstm_h = (randn(n_hidden, 4 * n_hidden) / np.sqrt(n_hidden)).astype('f')
            b_lstm = np.zeros(4 * n_hidden, dtype='f')
            w_affine = (randn(affine_in, n_vocab) / np.sqrt(affine_in)).astype('f')
            b_affine = np.zeros(n_vocab, dtype='f')

            self.embed = TimeEmbedding(w_embed)
            self.lstm = TimeLSTM(w_lstm_x, w_lstm_h, b_lstm, stateful=True)
            self.affine = TimeAffine(w_affine, b_affine)

            # Flat lists of every layer's parameters / gradients, in layer order.
            stack = (self.embed, self.lstm, self.affine)
            self.params = [p for layer in stack for p in layer.params]
            self.grads = [g for layer in stack for g in layer.grads]
            self.cache = None

        def forward(self, xs, h):
            """Return the Affine layer's output for input ids ``xs`` and state ``h``.

            ``xs`` must be 2-D (N, T) and ``h`` 2-D (N, H); the width H of ``h``
            is remembered in ``self.cache`` for ``backward``.
            """
            _, steps = xs.shape
            _, hidden = h.shape

            self.lstm.set_state(h)
            embedded = self.embed.forward(xs)

            # Copy h along a new time axis: (N, H) -> (N, T, H).
            peek = np.repeat(h[:, np.newaxis, :], steps, axis=1)

            lstm_out = self.lstm.forward(np.concatenate((peek, embedded), axis=2))
            score = self.affine.forward(np.concatenate((peek, lstm_out), axis=2))

            self.cache = hidden
            return score

        def backward(self, dscore):
            """Backpropagate ``dscore`` and return the gradient with respect to ``h``.

            The result sums three contributions: the LSTM's initial-state
            gradient (``self.lstm.dh``) and the two peeky branches, each
            summed over the time axis.
            """
            hidden = self.cache

            # The first `hidden` channels of each concatenated input belong to h.
            d_affine_in = self.affine.backward(dscore)
            dh_via_affine = d_affine_in[:, :, :hidden]

            d_lstm_in = self.lstm.backward(d_affine_in[:, :, hidden:])
            dh_via_lstm = d_lstm_in[:, :, :hidden]

            self.embed.backward(d_lstm_in[:, :, hidden:])

            return self.lstm.dh + np.sum(dh_via_affine + dh_via_lstm, axis=1)

        def generate(self, h, start_id, sample_size):
            """Greedily generate ``sample_size`` ids, starting from ``start_id``.

            ``h`` is reshaped to (1, 1, H), so it has to hold a single sample.
            Each step feeds the previously chosen id back in and picks the
            argmax of the score. Returns the list of chosen ids (``start_id``
            itself is not included).
            """
            self.lstm.set_state(h)
            peeky_h = h.reshape(1, 1, h.shape[1])

            sampled = []
            char_id = start_id
            for _ in range(sample_size):
                step_in = np.array([char_id]).reshape(1, 1)
                embedded = self.embed.forward(step_in)

                lstm_out = self.lstm.forward(
                    np.concatenate((peeky_h, embedded), axis=2))
                score = self.affine.forward(
                    np.concatenate((peeky_h, lstm_out), axis=2))

                # argmax without an axis works on the flattened score.
                char_id = np.argmax(score)
                sampled.append(char_id)

            return sampled

    %python3
    class PeekySeq2Seq(Seq2Seq):
        """Seq2Seq variant that uses PeekyDecoder as its decoder."""

        def __init__(self, vocab_size, wordvec_size, hidden_size):
            """Build encoder, peeky decoder and loss layer; pool their parameters.

            Seq2Seq.__init__ is not invoked; every attribute is set here.
            """
            sizes = (vocab_size, wordvec_size, hidden_size)

            # The encoder is constructed before the decoder.
            self.encoder = Encoder(*sizes)
            self.decoder = PeekyDecoder(*sizes)
            self.softmax = TimeSoftmaxWithLoss()

            # Encoder entries come first, followed by the decoder's.
            self.params = self.encoder.params + self.decoder.params
            self.grads = self.encoder.grads + self.decoder.grads

    %python3
    # Load the addition dataset and reverse every input sequence along axis 1.
    (x_train, t_train), (x_test, t_test) = sequence.load_data('addition.txt')
    x_train = x_train[:, ::-1]
    x_test = x_test[:, ::-1]

    # Peeky variant; swap in Seq2Seq(vocab_size, wordvec_size, hidden_size)
    # here to train the plain model instead.
    model = PeekySeq2Seq(vocab_size, wordvec_size, hidden_size)
    optimizer = Adam()
    trainer = Trainer(model, optimizer)

    acc_list_peeky = []
    num_test = len(x_test)
    for epoch in range(max_epoch):
        # Train one epoch per outer iteration so accuracy can be measured
        # after each of them.
        trainer.fit(x_train, t_train, max_epoch=1,
                    batch_size=batch_size, max_grad=max_grad)

        # Score the test set one sample at a time; indexing with [[i]] keeps
        # the leading batch axis. Only the first 10 samples are verbose.
        correct_num = sum(
            eval_seq2seq(model, x_test[[i]], t_test[[i]], id_to_char, i < 10)
            for i in range(num_test)
        )

        acc = float(correct_num) / num_test
        acc_list_peeky.append(acc)
        print('val acc %.3f%%' % (acc * 100))
    | epoch 1 |  iter 1 / 351 | time 0[s] | loss 2.57
    | epoch 1 |  iter 21 / 351 | time 1[s] | loss 2.48
    | epoch 1 |  iter 41 / 351 | time 2[s] | loss 2.20
    | epoch 1 |  iter 61 / 351 | time 3[s] | loss 1.99
    | epoch 1 |  iter 81 / 351 | time 4[s] | loss 1.89
    | epoch 1 |  iter 101 / 351 | time 5[s] | loss 1.82
    | epoch 1 |  iter 121 / 351 | time 7[s] | loss 1.82
    | epoch 1 |  iter 141 / 351 | time 8[s] | loss 1.80
    | epoch 1 |  iter 161 / 351 | time 9[s] | loss 1.79
    | epoch 1 |  iter 181 / 351 | time 10[s] | loss 1.78
    | epoch 1 |  iter 201 / 351 | time 11[s] | loss 1.77
    | epoch 1 |  iter 221 / 351 | time 13[s] | loss 1.76
    | epoch 1 |  iter 241 / 351 | time 14[s] | loss 1.76
    | epoch 1 |  iter 261 / 351 | time 15[s] | loss 1.75
    | epoch 1 |  iter 281 / 351 | time 16[s] | loss 1.74
    | epoch 1 |  iter 301 / 351 | time 17[s] | loss 1.74
    | epoch 1 |  iter 321 / 351 | time 19[s] | loss 1.73
    | epoch 1 |  iter 341 / 351 | time 20[s] | loss 1.73
    Q   58+77
    T 162 
    ☒ 100 
    ---
    Q 461+579
    T 1139
    ☒ 1013
    ---
    Q  48+285
    T 666 
    ☒ 102 
    ---
    Q   551+8
    T 163 
    ☒ 100 
    ---
    Q  55+763
    T 422 
    ☒ 1023
    ---
    Q 752+006
    T 857 
    ☒ 1023
    ---
    Q 292+167
    T 1053
    ☒ 1023
    ---
    Q 795+038
    T 1427
    ☒ 1111
    ---
    Q  838+62
    T 864 
    ☒ 102 
    ---
    Q  39+341
    T 236 
    ☒ 102 
    ---
    val acc 0.280%
    | epoch 2 |  iter 1 / 351 | time 0[s] | loss 1.71
    | epoch 2 |  iter 21 / 351 | time 1[s] | loss 1.71
    | epoch 2 |  iter 41 / 351 | time 2[s] | loss 1.71
    | epoch 2 |  iter 61 / 351 | time 3[s] | loss 1.71
    | epoch 2 |  iter 81 / 351 | time 4[s] | loss 1.70
    | epoch 2 |  iter 101 / 351 | time 6[s] | loss 1.68
    | epoch 2 |  iter 121 / 351 | time 7[s] | loss 1.69
    | epoch 2 |  iter 141 / 351 | time 8[s] | loss 1.68
    | epoch 2 |  iter 161 / 351 | time 9[s] | loss 1.67
    | epoch 2 |  iter 181 / 351 | time 10[s] | loss 1.67
    | epoch 2 |  iter 201 / 351 | time 12[s] | loss 1.65
    | epoch 2 |  iter 221 / 351 | time 13[s] | loss 1.65
    | epoch 2 |  iter 241 / 351 | time 14[s] | loss 1.65
    | epoch 2 |  iter 261 / 351 | time 15[s] | loss 1.63
    | epoch 2 |  iter 281 / 351 | time 17[s] | loss 1.62
    | epoch 2 |  iter 301 / 351 | time 18[s] | loss 1.61
    | epoch 2 |  iter 321 / 351 | time 19[s] | loss 1.61
    | epoch 2 |  iter 341 / 351 | time 20[s] | loss 1.60
    Q   58+77
    T 162 
    ☒ 100 
    ---
    Q 461+579
    T 1139
    ☒ 1200
    ---
    Q  48+285
    T 666 
    ☒ 690 
    ---
    Q   551+8
    T 163 
    ☒ 100 
    ---
    Q  55+763
    T 422 
    ☒ 690 
    ---
    Q 752+006
    T 857 
    ☒ 999 
    ---
    Q 292+167
    T 1053
    ☒ 1029
    ---
    Q 795+038
    T 1427
    ☒ 1240
    ---
    Q  838+62
    T 864 
    ☒ 792 
    ---
    Q  39+341
    T 236 
    ☒ 290 
    ---
    val acc 0.400%
    | epoch 3 |  iter 1 / 351 | time 0[s] | loss 1.58
    | epoch 3 |  iter 21 / 351 | time 1[s] | loss 1.59
    | epoch 3 |  iter 41 / 351 | time 2[s] | loss 1.58
    | epoch 3 |  iter 61 / 351 | time 3[s] | loss 1.56
    | epoch 3 |  iter 81 / 351 | time 5[s] | loss 1.55
    | epoch 3 |  iter 101 / 351 | time 6[s] | loss 1.53
    | epoch 3 |  iter 121 / 351 | time 7[s] | loss 1.51
    | epoch 3 |  iter 141 / 351 | time 8[s] | loss 1.50
    | epoch 3 |  iter 161 / 351 | time 9[s] | loss 1.49
    | epoch 3 |  iter 181 / 351 | time 11[s] | loss 1.47
    | epoch 3 |  iter 201 / 351 | time 12[s] | loss 1.46
    | epoch 3 |  iter 221 / 351 | time 13[s] | loss 1.43
    | epoch 3 |  iter 241 / 351 | time 14[s] | loss 1.42
    | epoch 3 |  iter 261 / 351 | time 16[s] | loss 1.41
    | epoch 3 |  iter 281 / 351 | time 17[s] | loss 1.39
    | epoch 3 |  iter 301 / 351 | time 18[s] | loss 1.37
    | epoch 3 |  iter 321 / 351 | time 19[s] | loss 1.36
    | epoch 3 |  iter 341 / 351 | time 21[s] | loss 1.35
    Q   58+77
    T 162 
    ☒ 154 
    ---
    Q 461+579
    T 1139
    ☒ 1033
    ---
    Q  48+285
    T 666 
    ☒ 644 
    ---
    Q   551+8
    T 163 
    ☒ 161 
    ---
    Q  55+763
    T 422 
    ☒ 433 
    ---
    Q 752+006
    T 857 
    ☒ 818 
    ---
    Q 292+167
    T 1053
    ☒ 1018
    ---
    Q 795+038
    T 1427
    ☒ 1344
    ---
    Q  838+62
    T 864 
    ☒ 834 
    ---
    Q  39+341
    T 236 
    ☒ 211 
    ---
    val acc 1.600%
    | epoch 4 |  iter 1 / 351 | time 0[s] | loss 1.32
    | epoch 4 |  iter 21 / 351 | time 1[s] | loss 1.32
    | epoch 4 |  iter 41 / 351 | time 2[s] | loss 1.30
    | epoch 4 |  iter 61 / 351 | time 3[s] | loss 1.30
    | epoch 4 |  iter 81 / 351 | time 5[s] | loss 1.28
    | epoch 4 |  iter 101 / 351 | time 6[s] | loss 1.27
    | epoch 4 |  iter 121 / 351 | time 7[s] | loss 1.25
    | epoch 4 |  iter 141 / 351 | time 8[s] | loss 1.24
    | epoch 4 |  iter 161 / 351 | time 10[s] | loss 1.22
    | epoch 4 |  iter 181 / 351 | time 11[s] | loss 1.21
    | epoch 4 |  iter 201 / 351 | time 12[s] | loss 1.20
    | epoch 4 |  iter 221 / 351 | time 13[s] | loss 1.20
    | epoch 4 |  iter 241 / 351 | time 15[s] | loss 1.17
    | epoch 4 |  iter 261 / 351 | time 16[s] | loss 1.16
    | epoch 4 |  iter 281 / 351 | time 17[s] | loss 1.14
    | epoch 4 |  iter 301 / 351 | time 18[s] | loss 1.12
    | epoch 4 |  iter 321 / 351 | time 20[s] | loss 1.11
    | epoch 4 |  iter 341 / 351 | time 21[s] | loss 1.10
    Q   58+77
    T 162 
    ☒ 158 
    ---
    Q 461+579
    T 1139
    ☒ 1123
    ---
    Q  48+285
    T 666 
    ☒ 657 
    ---
    Q   551+8
    T 163 
    ☒ 165 
    ---
    Q  55+763
    T 422 
    ☒ 423 
    ---
    Q 752+006
    T 857 
    ☒ 777 
    ---
    Q 292+167
    T 1053
    ☒ 1023
    ---
    Q 795+038
    T 1427
    ☒ 1388
    ---
    Q  838+62
    T 864 
    ☒ 887 
    ---
    Q  39+341
    T 236 
    ☒ 223 
    ---
    val acc 5.140%
    | epoch 5 |  iter 1 / 351 | time 0[s] | loss 1.08
    | epoch 5 |  iter 21 / 351 | time 1[s] | loss 1.07
    | epoch 5 |  iter 41 / 351 | time 2[s] | loss 1.05
    | epoch 5 |  iter 61 / 351 | time 3[s] | loss 1.04
    | epoch 5 |  iter 81 / 351 | time 5[s] | loss 1.02
    | epoch 5 |  iter 101 / 351 | time 6[s] | loss 1.01
    | epoch 5 |  iter 121 / 351 | time 7[s] | loss 1.00
    | epoch 5 |  iter 141 / 351 | time 8[s] | loss 0.99
    | epoch 5 |  iter 161 / 351 | time 10[s] | loss 0.99
    | epoch 5 |  iter 181 / 351 | time 11[s] | loss 0.96
    | epoch 5 |  iter 201 / 351 | time 12[s] | loss 0.95
    | epoch 5 |  iter 221 / 351 | time 13[s] | loss 0.94
    | epoch 5 |  iter 241 / 351 | time 15[s] | loss 0.92
    | epoch 5 |  iter 261 / 351 | time 16[s] | loss 0.91
    | epoch 5 |  iter 281 / 351 | time 17[s] | loss 0.90
    | epoch 5 |  iter 301 / 351 | time 19[s] | loss 0.89
    | epoch 5 |  iter 321 / 351 | time 20[s] | loss 0.88
    | epoch 5 |  iter 341 / 351 | time 21[s] | loss 0.87
    Q   58+77
    T 162 
    ☒ 160 
    ---
    Q 461+579
    T 1139
    ☒ 1135
    ---
    Q  48+285
    T 666 
    ☒ 668 
    ---
    Q   551+8
    T 163 
    ☒ 169 
    ---
    Q  55+763
    T 422 
    ☑ 422 
    ---
    Q 752+006
    T 857 
    ☒ 861 
    ---
    Q 292+167
    T 1053
    ☒ 1045
    ---
    Q 795+038
    T 1427
    ☒ 1324
    ---
    Q  838+62
    T 864 
    ☒ 861 
    ---
    Q  39+341
    T 236 
    ☒ 239 
    ---
    val acc 9.380%
    | epoch 6 |  iter 1 / 351 | time 0[s] | loss 0.90
    | epoch 6 |  iter 21 / 351 | time 1[s] | loss 0.86
    | epoch 6 |  iter 41 / 351 | time 2[s] | loss 0.83
    | epoch 6 |  iter 61 / 351 | time 3[s] | loss 0.84
    | epoch 6 |  iter 81 / 351 | time 5[s] | loss 0.82
    | epoch 6 |  iter 101 / 351 | time 6[s] | loss 0.81
    | epoch 6 |  iter 121 / 351 | time 7[s] | loss 0.80
    | epoch 6 |  iter 141 / 351 | time 8[s] | loss 0.79
    | epoch 6 |  iter 161 / 351 | time 10[s] | loss 0.78
    | epoch 6 |  iter 181 / 351 | time 11[s] | loss 0.77
    | epoch 6 |  iter 201 / 351 | time 12[s] | loss 0.76
    | epoch 6 |  iter 221 / 351 | time 14[s] | loss 0.76
    | epoch 6 |  iter 241 / 351 | time 15[s] | loss 0.74
    | epoch 6 |  iter 261 / 351 | time 16[s] | loss 0.74
    | epoch 6 |  iter 281 / 351 | time 17[s] | loss 0.73
    | epoch 6 |  iter 301 / 351 | time 19[s] | loss 0.72
    | epoch 6 |  iter 321 / 351 | time 20[s] | loss 0.72
    | epoch 6 |  iter 341 / 351 | time 21[s] | loss 0.71
    Q   58+77
    T 162 
    ☒ 163 
    ---
    Q 461+579
    T 1139
    ☒ 1138
    ---
    Q  48+285
    T 666 
    ☒ 668 
    ---
    Q   551+8
    T 163 
    ☒ 166 
    ---
    Q  55+763
    T 422 
    ☒ 423 
    ---
    Q 752+006
    T 857 
    ☒ 858 
    ---
    Q 292+167
    T 1053
    ☒ 1048
    ---
    Q 795+038
    T 1427
    ☒ 1428
    ---
    Q  838+62
    T 864 
    ☒ 873 
    ---
    Q  39+341
    T 236 
    ☒ 239 
    ---
    val acc 15.040%
    | epoch 7 |  iter 1 / 351 | time 0[s] | loss 0.68
    | epoch 7 |  iter 21 / 351 | time 1[s] | loss 0.69
    | epoch 7 |  iter 41 / 351 | time 2[s] | loss 0.67
    | epoch 7 |  iter 61 / 351 | time 3[s] | loss 0.66
    | epoch 7 |  iter 81 / 351 | time 5[s] | loss 0.66
    | epoch 7 |  iter 101 / 351 | time 6[s] | loss 0.65
    | epoch 7 |  iter 121 / 351 | time 7[s] | loss 0.65
    | epoch 7 |  iter 141 / 351 | time 8[s] | loss 0.64
    | epoch 7 |  iter 161 / 351 | time 10[s] | loss 0.63
    | epoch 7 |  iter 181 / 351 | time 11[s] | loss 0.61
    | epoch 7 |  iter 201 / 351 | time 12[s] | loss 0.61
    | epoch 7 |  iter 221 / 351 | time 13[s] | loss 0.60
    | epoch 7 |  iter 241 / 351 | time 15[s] | loss 0.57
    | epoch 7 |  iter 261 / 351 | time 16[s] | loss 0.57
    | epoch 7 |  iter 281 / 351 | time 17[s] | loss 0.57
    | epoch 7 |  iter 301 / 351 | time 19[s] | loss 0.55
    | epoch 7 |  iter 321 / 351 | time 20[s] | loss 0.54
    | epoch 7 |  iter 341 / 351 | time 21[s] | loss 0.53
    Q   58+77
    T 162 
    ☑ 162 
    ---
    Q 461+579
    T 1139
    ☑ 1139
    ---
    Q  48+285
    T 666 
    ☒ 665 
    ---
    Q   551+8
    T 163 
    ☒ 156 
    ---
    Q  55+763
    T 422 
    ☑ 422 
    ---
    Q 752+006
    T 857 
    ☒ 858 
    ---
    Q 292+167
    T 1053
    ☒ 1052
    ---
    Q 795+038
    T 1427
    ☒ 1428
    ---
    Q  838+62
    T 864 
    ☑ 864 
    ---
    Q  39+341
    T 236 
    ☒ 235 
    ---
    val acc 39.100%
    | epoch 8 |  iter 1 / 351 | time 0[s] | loss 0.51
    | epoch 8 |  iter 21 / 351 | time 1[s] | loss 0.50
    | epoch 8 |  iter 41 / 351 | time 2[s] | loss 0.49
    | epoch 8 |  iter 61 / 351 | time 3[s] | loss 0.48
    | epoch 8 |  iter 81 / 351 | time 5[s] | loss 0.47
    | epoch 8 |  iter 101 / 351 | time 6[s] | loss 0.46
    | epoch 8 |  iter 121 / 351 | time 7[s] | loss 0.46
    | epoch 8 |  iter 141 / 351 | time 8[s] | loss 0.44
    | epoch 8 |  iter 161 / 351 | time 10[s] | loss 0.41
    | epoch 8 |  iter 181 / 351 | time 11[s] | loss 0.42
    | epoch 8 |  iter 201 / 351 | time 12[s] | loss 0.41
    | epoch 8 |  iter 221 / 351 | time 14[s] | loss 0.40
    | epoch 8 |  iter 241 / 351 | time 15[s] | loss 0.39
    | epoch 8 |  iter 261 / 351 | time 16[s] | loss 0.37
    | epoch 8 |  iter 281 / 351 | time 17[s] | loss 0.36
    | epoch 8 |  iter 301 / 351 | time 19[s] | loss 0.36
    | epoch 8 |  iter 321 / 351 | time 20[s] | loss 0.35
    | epoch 8 |  iter 341 / 351 | time 21[s] | loss 0.34
    Q   58+77
    T 162 
    ☒ 161 
    ---
    Q 461+579
    T 1139
    ☑ 1139
    ---
    Q  48+285
    T 666 
    ☒ 657 
    ---
    Q   551+8
    T 163 
    ☒ 155 
    ---
    Q  55+763
    T 422 
    ☑ 422 
    ---
    Q 752+006
    T 857 
    ☑ 857 
    ---
    Q 292+167
    T 1053
    ☑ 1053
    ---
    Q 795+038
    T 1427
    ☒ 1438
    ---
    Q  838+62
    T 864 
    ☑ 864 
    ---
    Q  39+341
    T 236 
    ☑ 236 
    ---
    val acc 65.060%
    | epoch 9 |  iter 1 / 351 | time 0[s] | loss 0.32
    | epoch 9 |  iter 21 / 351 | time 1[s] | loss 0.31
    | epoch 9 |  iter 41 / 351 | time 2[s] | loss 0.31
    | epoch 9 |  iter 61 / 351 | time 3[s] | loss 0.31
    | epoch 9 |  iter 81 / 351 | time 5[s] | loss 0.29
    | epoch 9 |  iter 101 / 351 | time 6[s] | loss 0.29
    | epoch 9 |  iter 121 / 351 | time 7[s] | loss 0.29
    | epoch 9 |  iter 141 / 351 | time 8[s] | loss 0.27
    | epoch 9 |  iter 161 / 351 | time 10[s] | loss 0.27
    | epoch 9 |  iter 181 / 351 | time 11[s] | loss 0.26
    | epoch 9 |  iter 201 / 351 | time 12[s] | loss 0.25
    | epoch 9 |  iter 221 / 351 | time 13[s] | loss 0.25
    | epoch 9 |  iter 241 / 351 | time 15[s] | loss 0.24
    | epoch 9 |  iter 261 / 351 | time 16[s] | loss 0.24
    | epoch 9 |  iter 281 / 351 | time 17[s] | loss 0.23
    | epoch 9 |  iter 301 / 351 | time 19[s] | loss 0.22
    | epoch 9 |  iter 321 / 351 | time 20[s] | loss 0.22
    | epoch 9 |  iter 341 / 351 | time 21[s] | loss 0.21
    Q   58+77
    T 162 
    ☑ 162 
    ---
    Q 461+579
    T 1139
    ☒ 1140
    ---
    Q  48+285
    T 666 
    ☒ 657 
    ---
    Q   551+8
    T 163 
    ☑ 163 
    ---
    Q  55+763
    T 422 
    ☑ 422 
    ---
    Q 752+006
    T 857 
    ☑ 857 
    ---
    Q 292+167
    T 1053
    ☑ 1053
    ---
    Q 795+038
    T 1427
    ☑ 1427
    ---
    Q  838+62
    T 864 
    ☑ 864 
    ---
    Q  39+341
    T 236 
    ☑ 236 
    ---
    val acc 83.280%
    | epoch 10 |  iter 1 / 351 | time 0[s] | loss 0.22
    | epoch 10 |  iter 21 / 351 | time 1[s] | loss 0.20
    | epoch 10 |  iter 41 / 351 | time 2[s] | loss 0.20
    | epoch 10 |  iter 61 / 351 | time 3[s] | loss 0.20
    | epoch 10 |  iter 81 / 351 | time 5[s] | loss 0.18
    | epoch 10 |  iter 101 / 351 | time 6[s] | loss 0.17
    | epoch 10 |  iter 121 / 351 | time 7[s] | loss 0.18
    | epoch 10 |  iter 141 / 351 | time 9[s] | loss 0.17
    | epoch 10 |  iter 161 / 351 | time 10[s] | loss 0.17
    | epoch 10 |  iter 181 / 351 | time 11[s] | loss 0.17
    | epoch 10 |  iter 201 / 351 | time 12[s] | loss 0.17
    | epoch 10 |  iter 221 / 351 | time 14[s] | loss 0.16
    | epoch 10 |  iter 241 / 351 | time 15[s] | loss 0.15
    | epoch 10 |  iter 261 / 351 | time 16[s] | loss 0.15
    | epoch 10 |  iter 281 / 351 | time 17[s] | loss 0.15
    | epoch 10 |  iter 301 / 351 | time 19[s] | loss 0.15
    | epoch 10 |  iter 321 / 351 | time 20[s] | loss 0.14
    | epoch 10 |  iter 341 / 351 | time 21[s] | loss 0.14
    Q   58+77
    T 162 
    ☑ 162 
    ---
    Q 461+579
    T 1139
    ☑ 1139
    ---
    Q  48+285
    T 666 
    ☒ 656 
    ---
    Q   551+8
    T 163 
    ☑ 163 
    ---
    Q  55+763
    T 422 
    ☑ 422 
    ---
    Q 752+006
    T 857 
    ☑ 857 
    ---
    Q 292+167
    T 1053
    ☑ 1053
    ---
    Q 795+038
    T 1427
    ☑ 1427
    ---
    Q  838+62
    T 864 
    ☑ 864 
    ---
    Q  39+341
    T 236 
    ☑ 236 
    ---
    val acc 88.400%
    | epoch 11 |  iter 1 / 351 | time 0[s] | loss 0.13
    | epoch 11 |  iter 21 / 351 | time 1[s] | loss 0.13
    | epoch 11 |  iter 41 / 351 | time 2[s] | loss 0.13
    | epoch 11 |  iter 61 / 351 | time 3[s] | loss 0.12
    | epoch 11 |  iter 81 / 351 | time 5[s] | loss 0.12
    | epoch 11 |  iter 101 / 351 | time 6[s] | loss 0.12
    | epoch 11 |  iter 121 / 351 | time 7[s] | loss 0.11
    | epoch 11 |  iter 141 / 351 | time 9[s] | loss 0.12
    | epoch 11 |  iter 161 / 351 | time 10[s] | loss 0.11
    | epoch 11 |  iter 181 / 351 | time 11[s] | loss 0.11
    | epoch 11 |  iter 201 / 351 | time 12[s] | loss 0.12
    | epoch 11 |  iter 221 / 351 | time 14[s] | loss 0.11
    | epoch 11 |  iter 241 / 351 | time 15[s] | loss 0.11
    | epoch 11 |  iter 261 / 351 | time 16[s] | loss 0.10
    | epoch 11 |  iter 281 / 351 | time 17[s] | loss 0.10
    | epoch 11 |  iter 301 / 351 | time 19[s] | loss 0.10
    | epoch 11 |  iter 321 / 351 | time 20[s] | loss 0.09
    | epoch 11 |  iter 341 / 351 | time 21[s] | loss 0.09
    Q   58+77
    T 162 
    ☑ 162 
    ---
    Q 461+579
    T 1139
    ☑ 1139
    ---
    Q  48+285
    T 666 
    ☑ 666 
    ---
    Q   551+8
    T 163 
    ☑ 163 
    ---
    Q  55+763
    T 422 
    ☑ 422 
    ---
    Q 752+006
    T 857 
    ☑ 857 
    ---
    Q 292+167
    T 1053
    ☑ 1053
    ---
    Q 795+038
    T 1427
    ☑ 1427
    ---
    Q  838+62
    T 864 
    ☑ 864 
    ---
    Q  39+341
    T 236 
    ☑ 236 
    ---
    val acc 90.940%
    | epoch 12 |  iter 1 / 351 | time 0[s] | loss 0.09
    | epoch 12 |  iter 21 / 351 | time 1[s] | loss 0.09
    | epoch 12 |  iter 41 / 351 | time 2[s] | loss 0.09
    | epoch 12 |  iter 61 / 351 | time 3[s] | loss 0.09
    | epoch 12 |  iter 81 / 351 | time 5[s] | loss 0.09
    | epoch 12 |  iter 101 / 351 | time 6[s] | loss 0.08
    | epoch 12 |  iter 121 / 351 | time 7[s] | loss 0.08
    | epoch 12 |  iter 141 / 351 | time 8[s] | loss 0.08
    | epoch 12 |  iter 161 / 351 | time 10[s] | loss 0.08
    | epoch 12 |  iter 181 / 351 | time 11[s] | loss 0.08
    | epoch 12 |  iter 201 / 351 | time 12[s] | loss 0.08
    | epoch 12 |  iter 221 / 351 | time 14[s] | loss 0.09
    | epoch 12 |  iter 241 / 351 | time 15[s] | loss 0.09
    | epoch 12 |  iter 261 / 351 | time 16[s] | loss 0.09
    | epoch 12 |  iter 281 / 351 | time 17[s] | loss 0.08
    | epoch 12 |  iter 301 / 351 | time 19[s] | loss 0.08
    | epoch 12 |  iter 321 / 351 | time 20[s] | loss 0.07
    | epoch 12 |  iter 341 / 351 | time 21[s] | loss 0.08
    Q   58+77
    T 162 
    ☑ 162 
    ---
    Q 461+579
    T 1139
    ☑ 1139
    ---
    Q  48+285
    T 666 
    ☑ 666 
    ---
    Q   551+8
    T 163 
    ☑ 163 
    ---
    Q  55+763
    T 422 
    ☑ 422 
    ---
    Q 752+006
    T 857 
    ☑ 857 
    ---
    Q 292+167
    T 1053
    ☑ 1053
    ---
    Q 795+038
    T 1427
    ☑ 1427
    ---
    Q  838+62
    T 864 
    ☑ 864 
    ---
    Q  39+341
    T 236 
    ☑ 236 
    ---
    val acc 92.220%
    | epoch 13 |  iter 1 / 351 | time 0[s] | loss 0.07
    | epoch 13 |  iter 21 / 351 | time 1[s] | loss 0.07
    | epoch 13 |  iter 41 / 351 | time 2[s] | loss 0.07
    | epoch 13 |  iter 61 / 351 | time 3[s] | loss 0.07
    | epoch 13 |  iter 81 / 351 | time 5[s] | loss 0.06
    | epoch 13 |  iter 101 / 351 | time 6[s] | loss 0.06
    | epoch 13 |  iter 121 / 351 | time 7[s] | loss 0.07
    | epoch 13 |  iter 141 / 351 | time 8[s] | loss 0.06
    | epoch 13 |  iter 161 / 351 | time 10[s] | loss 0.06
    | epoch 13 |  iter 181 / 351 | time 11[s] | loss 0.06
    | epoch 13 |  iter 201 / 351 | time 12[s] | loss 0.06
    | epoch 13 |  iter 221 / 351 | time 13[s] | loss 0.06
    | epoch 13 |  iter 241 / 351 | time 15[s] | loss 0.06
    | epoch 13 |  iter 261 / 351 | time 16[s] | loss 0.06
    | epoch 13 |  iter 281 / 351 | time 17[s] | loss 0.06
    | epoch 13 |  iter 301 / 351 | time 18[s] | loss 0.05
    | epoch 13 |  iter 321 / 351 | time 20[s] | loss 0.05
    | epoch 13 |  iter 341 / 351 | time 21[s] | loss 0.06
    Q   58+77
    T 162 
    ☑ 162 
    ---
    Q 461+579
    T 1139
    ☑ 1139
    ---
    Q  48+285
    T 666 
    ☑ 666 
    ---
    Q   551+8
    T 163 
    ☑ 163 
    ---
    Q  55+763
    T 422 
    ☑ 422 
    ---
    Q 752+006
    T 857 
    ☑ 857 
    ---
    Q 292+167
    T 1053
    ☑ 1053
    ---
    Q 795+038
    T 1427
    ☑ 1427
    ---
    Q  838+62
    T 864 
    ☑ 864 
    ---
    Q  39+341
    T 236 
    ☑ 236 
    ---
    val acc 94.420%
    | epoch 14 |  iter 1 / 351 | time 0[s] | loss 0.05
    | epoch 14 |  iter 21 / 351 | time 1[s] | loss 0.05
    | epoch 14 |  iter 41 / 351 | time 2[s] | loss 0.05
    | epoch 14 |  iter 61 / 351 | time 3[s] | loss 0.05
    | epoch 14 |  iter 81 / 351 | time 5[s] | loss 0.05
    | epoch 14 |  iter 101 / 351 | time 6[s] | loss 0.05
    | epoch 14 |  iter 121 / 351 | time 7[s] | loss 0.05
    | epoch 14 |  iter 141 / 351 | time 8[s] | loss 0.05
    | epoch 14 |  iter 161 / 351 | time 10[s] | loss 0.05
    | epoch 14 |  iter 181 / 351 | time 11[s] | loss 0.05
    | epoch 14 |  iter 201 / 351 | time 12[s] | loss 0.05
    | epoch 14 |  iter 221 / 351 | time 13[s] | loss 0.06
    | epoch 14 |  iter 241 / 351 | time 15[s] | loss 0.06
    | epoch 14 |  iter 261 / 351 | time 16[s] | loss 0.07
    | epoch 14 |  iter 281 / 351 | time 17[s] | loss 0.06
    | epoch 14 |  iter 301 / 351 | time 19[s] | loss 0.06
    | epoch 14 |  iter 321 / 351 | time 20[s] | loss 0.05
    | epoch 14 |  iter 341 / 351 | time 21[s] | loss 0.05
    Q   58+77
    T 162 
    ☑ 162 
    ---
    Q 461+579
    T 1139
    ☑ 1139
    ---
    Q  48+285
    T 666 
    ☑ 666 
    ---
    Q   551+8
    T 163 
    ☑ 163 
    ---
    Q  55+763
    T 422 
    ☑ 422 
    ---
    Q 752+006
    T 857 
    ☑ 857 
    ---
    Q 292+167
    T 1053
    ☑ 1053
    ---
    Q 795+038
    T 1427
    ☑ 1427
    ---
    Q  838+62
    T 864 
    ☑ 864 
    ---
    Q  39+341
    T 236 
    ☑ 236 
    ---
    val acc 94.340%
    | epoch 15 |  iter 1 / 351 | time 0[s] | loss 0.04
    | epoch 15 |  iter 21 / 351 | time 1[s] | loss 0.04
    | epoch 15 |  iter 41 / 351 | time 2[s] | loss 0.04
    | epoch 15 |  iter 61 / 351 | time 3[s] | loss 0.05
    | epoch 15 |  iter 81 / 351 | time 5[s] | loss 0.04
    | epoch 15 |  iter 101 / 351 | time 6[s] | loss 0.05
    | epoch 15 |  iter 121 / 351 | time 7[s] | loss 0.04
    | epoch 15 |  iter 141 / 351 | time 8[s] | loss 0.04
    | epoch 15 |  iter 161 / 351 | time 10[s] | loss 0.04
    | epoch 15 |  iter 181 / 351 | time 11[s] | loss 0.05
    | epoch 15 |  iter 201 / 351 | time 12[s] | loss 0.04
    | epoch 15 |  iter 221 / 351 | time 13[s] | loss 0.04
    | epoch 15 |  iter 241 / 351 | time 15[s] | loss 0.03
    | epoch 15 |  iter 261 / 351 | time 16[s] | loss 0.04
    | epoch 15 |  iter 281 / 351 | time 17[s] | loss 0.04
    | epoch 15 |  iter 301 / 351 | time 18[s] | loss 0.05
    | epoch 15 |  iter 321 / 351 | time 20[s] | loss 0.04
    | epoch 15 |  iter 341 / 351 | time 21[s] | loss 0.04
    Q   58+77
    T 162 
    ☑ 162 
    ---
    Q 461+579
    T 1139
    ☑ 1139
    ---
    Q  48+285
    T 666 
    ☑ 666 
    ---
    Q   551+8
    T 163 
    ☑ 163 
    ---
    Q  55+763
    T 422 
    ☑ 422 
    ---
    Q 752+006
    T 857 
    ☑ 857 
    ---
    Q 292+167
    T 1053
    ☑ 1053
    ---
    Q 795+038
    T 1427
    ☑ 1427
    ---
    Q  838+62
    T 864 
    ☑ 864 
    ---
    Q  39+341
    T 236 
    ☑ 236 
    ---
    val acc 94.760%
    | epoch 16 |  iter 1 / 351 | time 0[s] | loss 0.03
    | epoch 16 |  iter 21 / 351 | time 1[s] | loss 0.05
    | epoch 16 |  iter 41 / 351 | time 2[s] | loss 0.06
    | epoch 16 |  iter 61 / 351 | time 3[s] | loss 0.05
    | epoch 16 |  iter 81 / 351 | time 5[s] | loss 0.04
    | epoch 16 |  iter 101 / 351 | time 6[s] | loss 0.04
    | epoch 16 |  iter 121 / 351 | time 7[s] | loss 0.04
    | epoch 16 |  iter 141 / 351 | time 8[s] | loss 0.04
    | epoch 16 |  iter 161 / 351 | time 10[s] | loss 0.04
    | epoch 16 |  iter 181 / 351 | time 11[s] | loss 0.04
    | epoch 16 |  iter 201 / 351 | time 12[s] | loss 0.05
    | epoch 16 |  iter 221 / 351 | time 13[s] | loss 0.05
    | epoch 16 |  iter 241 / 351 | time 15[s] | loss 0.04
    | epoch 16 |  iter 261 / 351 | time 16[s] | loss 0.04
    | epoch 16 |  iter 281 / 351 | time 17[s] | loss 0.03
    | epoch 16 |  iter 301 / 351 | time 19[s] | loss 0.03
    | epoch 16 |  iter 321 / 351 | time 20[s] | loss 0.04
    | epoch 16 |  iter 341 / 351 | time 21[s] | loss 0.04
    Q   58+77
    T 162 
    ☑ 162 
    ---
    Q 461+579
    T 1139
    ☑ 1139
    ---
    Q  48+285
    T 666 
    ☑ 666 
    ---
    Q   551+8
    T 163 
    ☑ 163 
    ---
    Q  55+763
    T 422 
    ☑ 422 
    ---
    Q 752+006
    T 857 
    ☑ 857 
    ---
    Q 292+167
    T 1053
    ☑ 1053
    ---
    Q 795+038
    T 1427
    ☑ 1427
    ---
    Q  838+62
    T 864 
    ☑ 864 
    ---
    Q  39+341
    T 236 
    ☑ 236 
    ---
    val acc 96.080%
    | epoch 17 |  iter 1 / 351 | time 0[s] | loss 0.04
    | epoch 17 |  iter 21 / 351 | time 1[s] | loss 0.03
    | epoch 17 |  iter 41 / 351 | time 2[s] | loss 0.03
    | epoch 17 |  iter 61 / 351 | time 3[s] | loss 0.03
    | epoch 17 |  iter 81 / 351 | time 5[s] | loss 0.03
    | epoch 17 |  iter 101 / 351 | time 6[s] | loss 0.03
    | epoch 17 |  iter 121 / 351 | time 7[s] | loss 0.02
    | epoch 17 |  iter 141 / 351 | time 8[s] | loss 0.02
    | epoch 17 |  iter 161 / 351 | time 10[s] | loss 0.03
    | epoch 17 |  iter 181 / 351 | time 11[s] | loss 0.03
    | epoch 17 |  iter 201 / 351 | time 12[s] | loss 0.03
    | epoch 17 |  iter 221 / 351 | time 13[s] | loss 0.03
    | epoch 17 |  iter 241 / 351 | time 15[s] | loss 0.03
    | epoch 17 |  iter 261 / 351 | time 16[s] | loss 0.03
    | epoch 17 |  iter 281 / 351 | time 17[s] | loss 0.03
    | epoch 17 |  iter 301 / 351 | time 18[s] | loss 0.03
    | epoch 17 |  iter 321 / 351 | time 20[s] | loss 0.04
    | epoch 17 |  iter 341 / 351 | time 21[s] | loss 0.05
    Q   58+77
    T 162 
    ☑ 162 
    ---
    Q 461+579
    T 1139
    ☑ 1139
    ---
    Q  48+285
    T 666 
    ☑ 666 
    ---
    Q   551+8
    T 163 
    ☑ 163 
    ---
    Q  55+763
    T 422 
    ☑ 422 
    ---
    Q 752+006
    T 857 
    ☒ 856 
    ---
    Q 292+167
    T 1053
    ☑ 1053
    ---
    Q 795+038
    T 1427
    ☑ 1427
    ---
    Q  838+62
    T 864 
    ☑ 864 
    ---
    Q  39+341
    T 236 
    ☑ 236 
    ---
    val acc 91.420%
    | epoch 18 |  iter 1 / 351 | time 0[s] | loss 0.06
    | epoch 18 |  iter 21 / 351 | time 1[s] | loss 0.05
    | epoch 18 |  iter 41 / 351 | time 2[s] | loss 0.05
    | epoch 18 |  iter 61 / 351 | time 3[s] | loss 0.05
    | epoch 18 |  iter 81 / 351 | time 5[s] | loss 0.05
    | epoch 18 |  iter 101 / 351 | time 6[s] | loss 0.04
    | epoch 18 |  iter 121 / 351 | time 7[s] | loss 0.03
    | epoch 18 |  iter 141 / 351 | time 8[s] | loss 0.03
    | epoch 18 |  iter 161 / 351 | time 10[s] | loss 0.03
    | epoch 18 |  iter 181 / 351 | time 11[s] | loss 0.02
    | epoch 18 |  iter 201 / 351 | time 12[s] | loss 0.02
    | epoch 18 |  iter 221 / 351 | time 13[s] | loss 0.02
    | epoch 18 |  iter 241 / 351 | time 15[s] | loss 0.02
    | epoch 18 |  iter 261 / 351 | time 16[s] | loss 0.02
    | epoch 18 |  iter 281 / 351 | time 17[s] | loss 0.02
    | epoch 18 |  iter 301 / 351 | time 19[s] | loss 0.02
    | epoch 18 |  iter 321 / 351 | time 20[s] | loss 0.02
    | epoch 18 |  iter 341 / 351 | time 21[s] | loss 0.02
    Q   58+77
    T 162 
    ☑ 162 
    ---
    Q 461+579
    T 1139
    ☑ 1139
    ---
    Q  48+285
    T 666 
    ☑ 666 
    ---
    Q   551+8
    T 163 
    ☑ 163 
    ---
    Q  55+763
    T 422 
    ☑ 422 
    ---
    Q 752+006
    T 857 
    ☑ 857 
    ---
    Q 292+167
    T 1053
    ☑ 1053
    ---
    Q 795+038
    T 1427
    ☑ 1427
    ---
    Q  838+62
    T 864 
    ☑ 864 
    ---
    Q  39+341
    T 236 
    ☑ 236 
    ---
    val acc 98.320%
    | epoch 19 |  iter 1 / 351 | time 0[s] | loss 0.01
    | epoch 19 |  iter 21 / 351 | time 1[s] | loss 0.02
    | epoch 19 |  iter 41 / 351 | time 2[s] | loss 0.02
    | epoch 19 |  iter 61 / 351 | time 3[s] | loss 0.02
    | epoch 19 |  iter 81 / 351 | time 5[s] | loss 0.02
    | epoch 19 |  iter 101 / 351 | time 6[s] | loss 0.02
    | epoch 19 |  iter 121 / 351 | time 7[s] | loss 0.03
    | epoch 19 |  iter 141 / 351 | time 8[s] | loss 0.03
    | epoch 19 |  iter 161 / 351 | time 10[s] | loss 0.03
    | epoch 19 |  iter 181 / 351 | time 11[s] | loss 0.04
    | epoch 19 |  iter 201 / 351 | time 12[s] | loss 0.04
    | epoch 19 |  iter 221 / 351 | time 13[s] | loss 0.03
    | epoch 19 |  iter 241 / 351 | time 15[s] | loss 0.03
    | epoch 19 |  iter 261 / 351 | time 16[s] | loss 0.03
    | epoch 19 |  iter 281 / 351 | time 17[s] | loss 0.03
    | epoch 19 |  iter 301 / 351 | time 18[s] | loss 0.02
    | epoch 19 |  iter 321 / 351 | time 20[s] | loss 0.03
    | epoch 19 |  iter 341 / 351 | time 21[s] | loss 0.02
    Q   58+77
    T 162 
    ☑ 162 
    ---
    Q 461+579
    T 1139
    ☑ 1139
    ---
    Q  48+285
    T 666 
    ☑ 666 
    ---
    Q   551+8
    T 163 
    ☑ 163 
    ---
    Q  55+763
    T 422 
    ☑ 422 
    ---
    Q 752+006
    T 857 
    ☑ 857 
    ---
    Q 292+167
    T 1053
    ☑ 1053
    ---
    Q 795+038
    T 1427
    ☑ 1427
    ---
    Q  838+62
    T 864 
    ☑ 864 
    ---
    Q  39+341
    T 236 
    ☑ 236 
    ---
    val acc 97.220%
    | epoch 20 |  iter 1 / 351 | time 0[s] | loss 0.03
    | epoch 20 |  iter 21 / 351 | time 1[s] | loss 0.02
    | epoch 20 |  iter 41 / 351 | time 2[s] | loss 0.04
    | epoch 20 |  iter 61 / 351 | time 3[s] | loss 0.03
    | epoch 20 |  iter 81 / 351 | time 5[s] | loss 0.04
    | epoch 20 |  iter 101 / 351 | time 6[s] | loss 0.03
    | epoch 20 |  iter 121 / 351 | time 7[s] | loss 0.03
    | epoch 20 |  iter 141 / 351 | time 8[s] | loss 0.03
    | epoch 20 |  iter 161 / 351 | time 10[s] | loss 0.02
    | epoch 20 |  iter 181 / 351 | time 11[s] | loss 0.03
    | epoch 20 |  iter 201 / 351 | time 12[s] | loss 0.02
    | epoch 20 |  iter 221 / 351 | time 13[s] | loss 0.02
    | epoch 20 |  iter 241 / 351 | time 15[s] | loss 0.02
    | epoch 20 |  iter 261 / 351 | time 16[s] | loss 0.02
    | epoch 20 |  iter 281 / 351 | time 17[s] | loss 0.03
    | epoch 20 |  iter 301 / 351 | time 18[s] | loss 0.02
    | epoch 20 |  iter 321 / 351 | time 20[s] | loss 0.02
    | epoch 20 |  iter 341 / 351 | time 21[s] | loss 0.03
    Q   58+77
    T 162 
    ☑ 162 
    ---
    Q 461+579
    T 1139
    ☑ 1139
    ---
    Q  48+285
    T 666 
    ☑ 666 
    ---
    Q   551+8
    T 163 
    ☑ 163 
    ---
    Q  55+763
    T 422 
    ☑ 422 
    ---
    Q 752+006
    T 857 
    ☑ 857 
    ---
    Q 292+167
    T 1053
    ☑ 1053
    ---
    Q 795+038
    T 1427
    ☒ 1437
    ---
    Q  838+62
    T 864 
    ☑ 864 
    ---
    Q  39+341
    T 236 
    ☑ 236 
    ---
    val acc 95.080%
    | epoch 21 |  iter 1 / 351 | time 0[s] | loss 0.03
    | epoch 21 |  iter 21 / 351 | time 1[s] | loss 0.03
    | epoch 21 |  iter 41 / 351 | time 2[s] | loss 0.02
    | epoch 21 |  iter 61 / 351 | time 3[s] | loss 0.02
    | epoch 21 |  iter 81 / 351 | time 5[s] | loss 0.02
    | epoch 21 |  iter 101 / 351 | time 6[s] | loss 0.02
    | epoch 21 |  iter 121 / 351 | time 7[s] | loss 0.02
    | epoch 21 |  iter 141 / 351 | time 8[s] | loss 0.02
    | epoch 21 |  iter 161 / 351 | time 10[s] | loss 0.02
    | epoch 21 |  iter 181 / 351 | time 11[s] | loss 0.02
    | epoch 21 |  iter 201 / 351 | time 12[s] | loss 0.02
    | epoch 21 |  iter 221 / 351 | time 14[s] | loss 0.01
    | epoch 21 |  iter 241 / 351 | time 15[s] | loss 0.01
    | epoch 21 |  iter 261 / 351 | time 16[s] | loss 0.01
    | epoch 21 |  iter 281 / 351 | time 17[s] | loss 0.02
    | epoch 21 |  iter 301 / 351 | time 19[s] | loss 0.02
    | epoch 21 |  iter 321 / 351 | time 20[s] | loss 0.02
    | epoch 21 |  iter 341 / 351 | time 21[s] | loss 0.02
    Q   58+77
    T 162 
    ☑ 162 
    ---
    Q 461+579
    T 1139
    ☑ 1139
    ---
    Q  48+285
    T 666 
    ☑ 666 
    ---
    Q   551+8
    T 163 
    ☑ 163 
    ---
    Q  55+763
    T 422 
    ☑ 422 
    ---
    Q 752+006
    T 857 
    ☑ 857 
    ---
    Q 292+167
    T 1053
    ☑ 1053
    ---
    Q 795+038
    T 1427
    ☑ 1427
    ---
    Q  838+62
    T 864 
    ☑ 864 
    ---
    Q  39+341
    T 236 
    ☑ 236 
    ---
    val acc 97.480%
    | epoch 22 |  iter 1 / 351 | time 0[s] | loss 0.03
    | epoch 22 |  iter 21 / 351 | time 1[s] | loss 0.03
    | epoch 22 |  iter 41 / 351 | time 2[s] | loss 0.02
    | epoch 22 |  iter 61 / 351 | time 3[s] | loss 0.02
    | epoch 22 |  iter 81 / 351 | time 5[s] | loss 0.02
    | epoch 22 |  iter 101 / 351 | time 6[s] | loss 0.02
    | epoch 22 |  iter 121 / 351 | time 7[s] | loss 0.02
    | epoch 22 |  iter 141 / 351 | time 8[s] | loss 0.02
    | epoch 22 |  iter 161 / 351 | time 10[s] | loss 0.02
    | epoch 22 |  iter 181 / 351 | time 11[s] | loss 0.02
    | epoch 22 |  iter 201 / 351 | time 12[s] | loss 0.02
    | epoch 22 |  iter 221 / 351 | time 13[s] | loss 0.02
    | epoch 22 |  iter 241 / 351 | time 15[s] | loss 0.02
    | epoch 22 |  iter 261 / 351 | time 16[s] | loss 0.03
    | epoch 22 |  iter 281 / 351 | time 17[s] | loss 0.04
    | epoch 22 |  iter 301 / 351 | time 18[s] | loss 0.03
    | epoch 22 |  iter 321 / 351 | time 20[s] | loss 0.03
    | epoch 22 |  iter 341 / 351 | time 21[s] | loss 0.02
    Q   58+77
    T 162 
    ☑ 162 
    ---
    Q 461+579
    T 1139
    ☑ 1139
    ---
    Q  48+285
    T 666 
    ☑ 666 
    ---
    Q   551+8
    T 163 
    ☑ 163 
    ---
    Q  55+763
    T 422 
    ☑ 422 
    ---
    Q 752+006
    T 857 
    ☑ 857 
    ---
    Q 292+167
    T 1053
    ☑ 1053
    ---
    Q 795+038
    T 1427
    ☑ 1427
    ---
    Q  838+62
    T 864 
    ☑ 864 
    ---
    Q  39+341
    T 236 
    ☑ 236 
    ---
    val acc 95.020%
    | epoch 23 |  iter 1 / 351 | time 0[s] | loss 0.04
    | epoch 23 |  iter 21 / 351 | time 1[s] | loss 0.03
    | epoch 23 |  iter 41 / 351 | time 2[s] | loss 0.03
    | epoch 23 |  iter 61 / 351 | time 3[s] | loss 0.03
    | epoch 23 |  iter 81 / 351 | time 5[s] | loss 0.02
    | epoch 23 |  iter 101 / 351 | time 6[s] | loss 0.02
    | epoch 23 |  iter 121 / 351 | time 7[s] | loss 0.01
    | epoch 23 |  iter 141 / 351 | time 9[s] | loss 0.02
    | epoch 23 |  iter 161 / 351 | time 10[s] | loss 0.01
    | epoch 23 |  iter 181 / 351 | time 11[s] | loss 0.02
    | epoch 23 |  iter 201 / 351 | time 12[s] | loss 0.02
    | epoch 23 |  iter 221 / 351 | time 14[s] | loss 0.02
    | epoch 23 |  iter 241 / 351 | time 15[s] | loss 0.02
    | epoch 23 |  iter 261 / 351 | time 16[s] | loss 0.03
    | epoch 23 |  iter 281 / 351 | time 17[s] | loss 0.02
    | epoch 23 |  iter 301 / 351 | time 19[s] | loss 0.02
    | epoch 23 |  iter 321 / 351 | time 20[s] | loss 0.03
    | epoch 23 |  iter 341 / 351 | time 21[s] | loss 0.04
    Q   58+77
    T 162 
    ☑ 162 
    ---
    Q 461+579
    T 1139
    ☑ 1139
    ---
    Q  48+285
    T 666 
    ☑ 666 
    ---
    Q   551+8
    T 163 
    ☑ 163 
    ---
    Q  55+763
    T 422 
    ☑ 422 
    ---
    Q 752+006
    T 857 
    ☑ 857 
    ---
    Q 292+167
    T 1053
    ☑ 1053
    ---
    Q 795+038
    T 1427
    ☑ 1427
    ---
    Q  838+62
    T 864 
    ☒ 854 
    ---
    Q  39+341
    T 236 
    ☑ 236 
    ---
    val acc 93.260%
    | epoch 24 |  iter 1 / 351 | time 0[s] | loss 0.04
    | epoch 24 |  iter 21 / 351 | time 1[s] | loss 0.03
    | epoch 24 |  iter 41 / 351 | time 2[s] | loss 0.03
    | epoch 24 |  iter 61 / 351 | time 3[s] | loss 0.03
    | epoch 24 |  iter 81 / 351 | time 5[s] | loss 0.02
    | epoch 24 |  iter 101 / 351 | time 6[s] | loss 0.01
    | epoch 24 |  iter 121 / 351 | time 7[s] | loss 0.02
    | epoch 24 |  iter 141 / 351 | time 8[s] | loss 0.01
    | epoch 24 |  iter 161 / 351 | time 10[s] | loss 0.01
    | epoch 24 |  iter 181 / 351 | time 11[s] | loss 0.01
    | epoch 24 |  iter 201 / 351 | time 12[s] | loss 0.01
    | epoch 24 |  iter 221 / 351 | time 13[s] | loss 0.02
    | epoch 24 |  iter 241 / 351 | time 15[s] | loss 0.03
    | epoch 24 |  iter 261 / 351 | time 16[s] | loss 0.03
    | epoch 24 |  iter 281 / 351 | time 17[s] | loss 0.03
    | epoch 24 |  iter 301 / 351 | time 18[s] | loss 0.03
    | epoch 24 |  iter 321 / 351 | time 20[s] | loss 0.02
    | epoch 24 |  iter 341 / 351 | time 21[s] | loss 0.02
    Q   58+77
    T 162 
    ☑ 162 
    ---
    Q 461+579
    T 1139
    ☑ 1139
    ---
    Q  48+285
    T 666 
    ☑ 666 
    ---
    Q   551+8
    T 163 
    ☑ 163 
    ---
    Q  55+763
    T 422 
    ☑ 422 
    ---
    Q 752+006
    T 857 
    ☑ 857 
    ---
    Q 292+167
    T 1053
    ☑ 1053
    ---
    Q 795+038
    T 1427
    ☑ 1427
    ---
    Q  838+62
    T 864 
    ☑ 864 
    ---
    Q  39+341
    T 236 
    ☑ 236 
    ---
    val acc 97.800%
    | epoch 25 |  iter 1 / 351 | time 0[s] | loss 0.02
    | epoch 25 |  iter 21 / 351 | time 1[s] | loss 0.01
    | epoch 25 |  iter 41 / 351 | time 2[s] | loss 0.01
    | epoch 25 |  iter 61 / 351 | time 3[s] | loss 0.01
    | epoch 25 |  iter 81 / 351 | time 5[s] | loss 0.01
    | epoch 25 |  iter 101 / 351 | time 6[s] | loss 0.01
    | epoch 25 |  iter 121 / 351 | time 7[s] | loss 0.01
    | epoch 25 |  iter 141 / 351 | time 9[s] | loss 0.01
    | epoch 25 |  iter 161 / 351 | time 10[s] | loss 0.01
    | epoch 25 |  iter 181 / 351 | time 11[s] | loss 0.01
    | epoch 25 |  iter 201 / 351 | time 12[s] | loss 0.01
    | epoch 25 |  iter 221 / 351 | time 14[s] | loss 0.01
    | epoch 25 |  iter 241 / 351 | time 15[s] | loss 0.01
    | epoch 25 |  iter 261 / 351 | time 16[s] | loss 0.01
    | epoch 25 |  iter 281 / 351 | time 17[s] | loss 0.01
    | epoch 25 |  iter 301 / 351 | time 19[s] | loss 0.01
    | epoch 25 |  iter 321 / 351 | time 20[s] | loss 0.01
    | epoch 25 |  iter 341 / 351 | time 21[s] | loss 0.01
    Q   58+77
    T 162 
    ☑ 162 
    ---
    Q 461+579
    T 1139
    ☑ 1139
    ---
    Q  48+285
    T 666 
    ☑ 666 
    ---
    Q   551+8
    T 163 
    ☑ 163 
    ---
    Q  55+763
    T 422 
    ☑ 422 
    ---
    Q 752+006
    T 857 
    ☑ 857 
    ---
    Q 292+167
    T 1053
    ☑ 1053
    ---
    Q 795+038
    T 1427
    ☑ 1427
    ---
    Q  838+62
    T 864 
    ☑ 864 
    ---
    Q  39+341
    T 236 
    ☑ 236 
    ---
    val acc 97.760%
    

    %python3
    # Overlay the three recorded accuracy lists on one chart for comparison.
    # The curves are drawn in the same order as the legend labels, so each
    # label is matched to its curve by position.
    accuracy_curves = (acc_list_baseline, acc_list_reversed, acc_list_peeky)
    curve_labels = ['baseline', 'reversed input', 'peeky']
    plt.ylim(0, 1)  # pin the y-axis range before any curve is drawn
    for curve in accuracy_curves:
        plt.plot(curve)
    plt.legend(labels=curve_labels)
    plt.show()

    • 最初の正解が出るまでには時間がかかる気がする
    • しかし、一度正解し始めると一気に精度が上がる