Table of Contents
1. Numerical Stability and Initialization
1.1. Vanishing and Exploding Gradients
1.1.1. Vanishing Gradients
(ns clj-d2l.numerical-stability (:require [clojure.spec.alpha :as s] [clj-djl.ndarray :as nd] [clj-djl.training :as t] [clj-djl.training.dataset :as ds] [clj-djl.training.loss :as loss] [clj-djl.training.optimizer :as optimizer] [clj-djl.training.tracker :as tracker] [clj-djl.training.listener :as listener] [clj-djl.model :as m] [clj-djl.nn :as nn] [clj-djl.device :as dev] [clj-d2l.core :as d2l]))
;; NDArray manager used to create every array in this walkthrough.
(def ndm (nd/base-manager))

;; Sample points for the sigmoid curve: nd/arange called with -8.0, 8.0 and 0.1
;; (presumably start, stop and step -- confirm against the clj-djl docs).
(def x (nd/arange ndm -8.0 8.0 0.1))

;; Ask the engine to track a gradient for x before any computation is recorded.
(t/attach-gradient x)

;; Evaluate sigmoid(x) inside a gradient collector, back-propagate from the
;; result, then hand both the activation values and the gradient read back
;; from x to d2l/plot-lines. with-open closes the collector on exit.
(with-open [collector (t/gradient-collector)]
  (let [activation (nn/sigmoid x)
        _          (t/backward collector activation)
        xs         (nd/to-vec x)
        curves     [(nd/to-vec activation)
                    (nd/to-vec (t/get-gradient x))]]
    (d2l/plot-lines "figure/numerical_stability_1.svg"
                    ["sigmoid" "gradient"]
                    xs
                    curves)))
;; Starting matrix for the repeated-product experiment: a 4x4 array drawn with
;; nd/random-normal (distribution parameters are whatever the library defaults
;; to -- presumably a standard normal; confirm).
(def M
  (let [shape [4 4]]
    (nd/random-normal ndm shape)))

;; Evaluate M so its contents are echoed.
M
ND: (4, 4) cpu() float32 [[ 2.2122, 1.1631, 0.774 , 0.4838], [ 1.0434, 0.2996, 1.1839, 0.153 ], [ 1.8917, -1.1688, -1.2347, 1.5581], [-1.771 , -0.5459, -0.4514, -2.3556], ]
;; Starting from M, take the nd/dot product with 100 freshly drawn 4x4 random
;; matrices, one after another (M on the left, each new matrix on the right).
;; The result is not bound to a var; the form evaluates to the final product.
(reduce (fn [product _step]
          ;; _step is only a counter; a new random matrix is drawn each round.
          (nd/dot product (nd/random-normal ndm [4 4])))
        M
        (range 100))
ND: (4, 4) cpu() float32 [[ 2.23112318e+24, -4.42056910e+23, 6.86103472e+24, 6.96760271e+24], [-7.82124006e+23, 1.54969296e+23, -2.40508260e+24, -2.44240483e+24], [ 5.04485362e+24, -9.99554626e+23, 1.55135976e+25, 1.57545294e+25], [-9.53739183e+23, 1.88953602e+23, -2.93306315e+24, -2.97871740e+24], ]