-- smbot/nn.lua
--
-- 897 lines
-- 22 KiB
-- Lua

local assert = assert
local ceil = math.ceil
local cos = math.cos
local exp = math.exp
local floor = math.floor
local huge = math.huge
local insert = table.insert
local ipairs = ipairs
local log = math.log
local max = math.max
local min = math.min
local open = io.open
local pairs = pairs
local pi = math.pi
local print = print
local remove = table.remove
local sin = math.sin
local sqrt = math.sqrt
local tanh = math.tanh
local tostring = tostring
local uniform = math.random
local unpack = table.unpack or unpack
local Base = require("Base")
local util = require("util")
-- hacks
-- Debug aid: prints a traceback labelled 'helpme', starting at the caller,
-- with every "\n" rewritten to "\r\n".
local function helpme()
    -- gsub returns (string, count); the extra parentheses keep only the
    -- string so the substitution count is not printed after the traceback.
    print((debug.traceback('helpme', 2):gsub("\n", "\r\n")))
end
-- math utilities
-- Product of the given numbers.
-- Accepts either a list of numbers (prod(2, 3, 4)) or a single table whose
-- array part holds the numbers (prod({2, 3, 4})).
-- Returns 1 for an empty product (no arguments, or an empty table).
local function prod(x, ...)
    if type(x) == "table" then
        return prod(unpack(x))
    end
    local extra = select("#", ...)
    -- the empty product is the multiplicative identity, not nil.
    if x == nil and extra == 0 then return 1 end
    local ret = x
    for i = 1, extra do
        -- parentheses truncate select's results to the i-th argument only.
        ret = ret * (select(i, ...))
    end
    return ret
end
-- Draws one sample from a standard normal distribution (Box-Muller).
local function normal()
    -- uniform() lies in [0, 1), so 1 - uniform() lies in (0, 1]: the log is
    -- always finite and non-positive, and the sqrt argument can never be
    -- negative. (Adding small epsilons instead can push the argument below
    -- zero when the draw is very close to 1, which yields NaN.)
    local radius = sqrt(-2 * log(1 - uniform()))
    return radius * cos(2 * pi * uniform())
end
-- Fills a flat array with zeros and returns it.
-- n:   element count, or a shape table (the count is then prod(shape) and
--      the shape is stored in the result's `shape` field).
-- out: optional table to fill in place; a new table is made when omitted.
local function zeros(n, out)
    local result = out or {}
    local count = n
    if type(n) == 'table' then
        count = prod(n)
        result.shape = n
    end
    local idx = 1
    while idx <= count do
        result[idx] = 0
        idx = idx + 1
    end
    return result
end
-- Fills a flat array with 0, 1, ..., count - 1 and returns it.
-- n:   element count, or a shape table (the count is then prod(shape) and
--      the shape is stored in the result's `shape` field).
-- out: optional table to fill in place; a new table is made when omitted.
local function arange(n, out)
    local result = out or {}
    local count = n
    if type(n) == 'table' then
        count = prod(n)
        result.shape = n
    end
    local idx = 1
    while idx <= count do
        result[idx] = idx - 1
        idx = idx + 1
    end
    return result
end
-- Zero-fills an array of the given size (see zeros) and, when an `init`
-- function is supplied, passes the array through it and returns whatever
-- it returns.
-- size: element count or shape table.
-- out:  optional table to fill in place.
-- init: optional function(t) applied to the zeroed array.
local function allocate(size, out, init)
    local target = zeros(size, out or {})
    if init == nil then
        return target
    end
    return init(target)
end
-- Initializer: overwrites every element of t with 0 and returns t.
-- fan_in and fan_out are accepted for signature parity and are unused.
local function init_zeros(t, fan_in, fan_out)
    local count = #t
    local idx = 1
    while idx <= count do
        t[idx] = 0
        idx = idx + 1
    end
    return t
end
-- Initializer: overwrites every element of t with uniform() * 2 - 1
-- (a draw from [-1, 1)) and returns t.
-- fan_in and fan_out are accepted for signature parity and are unused.
local function init_uniform(t, fan_in, fan_out)
    local count = #t
    local idx = 1
    while idx <= count do
        t[idx] = uniform() * 2 - 1
        idx = idx + 1
    end
    return t
end
-- Initializer: overwrites every element of t with a fresh normal() sample
-- and returns t.
-- fan_in and fan_out are accepted for signature parity and are unused.
local function init_normal(t, fan_in, fan_out)
    local count = #t
    local idx = 1
    while idx <= count do
        t[idx] = normal()
        idx = idx + 1
    end
    return t
end
-- Initializer: overwrites every element of t with a uniform draw from
-- [-s, s) where s = sqrt(6 / fan_in), and returns t.
-- fan_out is accepted for signature parity and is unused.
local function init_he_uniform(t, fan_in, fan_out)
    local scale = sqrt(6 / fan_in)
    local count = #t
    local idx = 1
    while idx <= count do
        local centered = uniform() * 2 - 1
        t[idx] = centered * scale
        idx = idx + 1
    end
    return t
end
-- Initializer: overwrites every element of t with normal() scaled by
-- sqrt(2 / fan_in), and returns t.
-- fan_out is accepted for signature parity and is unused.
local function init_he_normal(t, fan_in, fan_out)
    local scale = sqrt(2 / fan_in)
    local count = #t
    local idx = 1
    while idx <= count do
        t[idx] = normal() * scale
        idx = idx + 1
    end
    return t
end
-- ndarray-ish stuff and more involved math
-- Formats a flat array with an optional `shape` field as nested brackets
-- and returns the resulting string (nothing is printed here).
-- t:       flat array; t.shape, when present, lists the dimension sizes.
-- fmt:     string.format pattern applied per element (default '%10.7f,').
-- sep:     separator appended after elements and closing brackets (default ',').
-- ti:      0-based offset into t where the current slice starts (default 0).
-- di:      index into t.shape of the dimension being printed (default 1).
-- depth:   nesting depth, used to build the indent string (default 0).
-- isfirst: true when this slice is the first child of its parent.
-- islast:  true when this slice is the last child of its parent.
-- The last five parameters are supplied by the recursive calls below.
local function pp(t, fmt, sep, ti, di, depth, isfirst, islast)
-- pretty-prints an nd-array.
fmt = fmt or '%10.7f,'
sep = sep or ','
ti = ti or 0
di = di or 1
depth = depth or 0
-- without a shape, every element goes into a single bracket pair.
if t.shape == nil then
local s = '['
for i = 1, #t do s = s..fmt:format(t[i]) end
return s..']'..sep..'\n'
end
local dim = t.shape[di]
-- number of flat elements covered by one step along dimension di:
-- the product of all later dimension sizes.
local ti_step = 1
for dj = di + 1, #t.shape do ti_step = ti_step * t.shape[dj] end
local indent = ''
for i = 1, depth do indent = indent..' ' end
local s = ''
if di ~= #t.shape then
-- not the innermost dimension: recurse once per slice, advancing the
-- flat offset by ti_step each time.
if isfirst then s = s..indent..'[\n' else s = s..'[\n' end
for i = 1, dim do
s = s..pp(t, fmt, sep, ti, di + 1, depth + 1, i == 1, i == dim)
ti = ti + ti_step
end
if islast then s = s..indent..']'..sep..'\n' else s = s..indent..']'..sep end
else
-- innermost dimension: emit the dim elements starting after offset ti.
-- note that sep is appended after each formatted element here, in
-- addition to anything fmt itself contains.
s = s..indent..'['
for i = ti + 1, ti + dim do s = s..fmt:format(t[i])..sep end
s = s..']'..sep..'\n'
end
return s
end
-- Integer flavour of pp: formats each element with '%<n>i' and uses a
-- single space as the separator. Extra arguments are forwarded to pp.
-- n: field width for each integer (default 1).
local function ppi(t, n, ...)
    -- TODO: determine maximum number of digits if n is omitted.
    local width = n or 1
    local int_fmt = '%'..tostring(width)..'i'
    return pp(t, int_fmt, ' ', ...)
end
-- Renders a shape as text for checkshape's error message.
-- When isbatch is false the listing is prefixed with 'n, ' and every
-- entry of shape is shown; when isbatch is true the first entry of shape
-- is skipped and no 'n' is printed.
local function checkshape_helper(shape, isbatch)
    local parts = {'{ '}
    if not isbatch then
        parts[#parts + 1] = 'n, '
    end
    local last = #shape
    for i, v in ipairs(shape) do
        local skipped = isbatch and i <= 1
        if not skipped then
            parts[#parts + 1] = tostring(v)
            -- the final entry is followed by a space, all others by ', '.
            parts[#parts + 1] = (i == last) and ' ' or ', '
        end
    end
    parts[#parts + 1] = '}'
    return table.concat(parts)
end
-- Verifies that batch.shape, after its leading entry, starts with the
-- entries of `shape`, and returns that leading entry (batch.shape[1]).
-- Raises (blamed on the caller) when batch.shape has exactly one entry
-- or when an entry differs; asserts that batch is a table with a shape.
local function checkshape(batch, shape)
    assert(type(batch) == 'table', "batch is not an array")
    assert(batch.shape ~= nil, "batch is missing a shape")
    local actual = batch.shape
    if #actual == 1 then
        error("batch shape is incomplete", 2)
    end
    local n, count = 1, #shape
    while n <= count do
        if actual[n + 1] ~= shape[n] then
            local got = checkshape_helper(actual, true)
            local want = checkshape_helper(shape, false)
            error("shapes do not match: "..got.." ~= "..want, 2)
        end
        n = n + 1
    end
    return actual[1]
end
-- Assigns a new shape (given as separate dimension arguments) to the flat
-- array a, in place, and returns a. Asserts that the product of the new
-- dimensions equals #a.
local function reshape(a, ...)
    local dims = {...}
    local fits = (#a == prod(dims))
    assert(fits, "new shape does not fit size")
    a.shape = dims
    return a
end
-- Allocates a zero-filled output buffer of shape {bs, unpack(shape)}.
-- bs:    batch size; when nil, no buffer is made and nil is returned.
-- shape: per-sample shape (copied via util.copy, so presumably left
--        untouched -- confirm against util).
local function cache(bs, shape)
    if bs == nil then return nil end
    local fullshape = util.copy(shape)
    -- table.insert takes (list, pos, value): put bs at position 1.
    -- Passing (list, bs, 1) would instead insert the value 1 at index bs,
    -- which only happens to be right when bs == 1.
    insert(fullshape, 1, bs)
    return zeros(fullshape)
end
-- Matrix-vector product.
-- mat: flat row-major array with a two-entry shape {rows, cols}.
-- vec: flat array treated as a column vector of length cols.
-- out: optional flat array of length rows to write into; when omitted a
--      new array of shape {rows} is allocated with zeros.
-- Returns the output array.
local function dot_mv(mat, vec, out)
    assert(#mat.shape == 2)
    local rows, cols = unpack(mat.shape)
    assert(cols == #vec)
    if out ~= nil then
        assert(rows == #out, "given output is the wrong size")
    else
        out = zeros({rows})
    end
    local row_start = 0
    for r = 1, rows do
        local acc = 0
        for c = 1, cols do
            acc = acc + mat[row_start + c] * vec[c]
        end
        out[r] = acc
        row_start = row_start + cols
    end
    return out
end
-- Contracts one axis of a with one axis of b (both flat, row-major arrays
-- carrying a `shape` field) and returns the result.
-- ax_a: axis of a to sum over (default: last axis of a).
-- ax_b: axis of b to sum over (default: second-to-last axis of b, or its
--       only axis when b is 1-D).
-- out:  optional flat array to write into; its length must equal the
--       product of the output shape. Its `shape` field is overwritten.
-- The output shape is a's remaining axes followed by b's remaining axes.
local function dot(a, b, ax_a, ax_b, out)
    ax_a = ax_a or #a.shape - 0
    -- a 1-D b has no second-to-last axis; fall back to its only axis.
    ax_b = ax_b or max(#b.shape - 1, 1)
    assert(a.shape[ax_a] == b.shape[ax_b], "dotted axes do not match")
    local dim = a.shape[ax_a]
    local out_shape = {}
    for di = 1, #a.shape do if di ~= ax_a then insert(out_shape, a.shape[di]) end end
    for di = 1, #b.shape do if di ~= ax_b then insert(out_shape, b.shape[di]) end end
    if out == nil then
        out = zeros(prod(out_shape))
    else
        assert(prod(out_shape) == #out, "given output is the wrong size")
    end
    out.shape = out_shape
    -- view each operand as (leading, dim, trailing):
    -- a0/b0 are the products of the axes before the contracted one,
    -- a1/b1 the products of the axes after it.
    local a0 = 1
    local a1 = 1
    local b0 = 1
    local b1 = 1
    for di = 1, ax_a - 1 do a0 = a0 * a.shape[di] end
    for di = 1, ax_b - 1 do b0 = b0 * b.shape[di] end
    for di = ax_a + 1, #a.shape do a1 = a1 * a.shape[di] end
    for di = ax_b + 1, #b.shape do b1 = b1 * b.shape[di] end
    -- one step of the leading index spans dim * trailing flat elements.
    -- (Stepping by dim alone is only right when trailing == 1.)
    local a_block = dim * a1
    local b_block = dim * b1
    local o = 1
    for ai = 0, a0 - 1 do
        local a_base = ai * a_block
        for j = 1, a1 do
            for bi = 0, b0 - 1 do
                local b_base = bi * b_block
                for m = 1, b1 do
                    local res = 0
                    local x = a_base + j
                    local y = b_base + m
                    -- walk along the contracted axis; consecutive entries
                    -- are `trailing` elements apart in each operand.
                    for _ = 1, dim do
                        res = res + a[x] * b[y]
                        x = x + a1
                        y = y + b1
                    end
                    out[o] = res
                    o = o + 1
                end
            end
        end
    end
    return out
end
-- nodal
-- Returns the nodes lying between node_in and node_out, ordered so that a
-- node only appears after all of its parents.
-- node_in, node_out: graph nodes with `parents` and `children` arrays.
-- nodes:      accepted for signature compatibility; the result is always
--             built in a fresh table and this argument is not used.
-- dummy_mode: when true, node_in is treated as a placeholder root: it is
--             force-marked as reachable and removed from the result.
local function traverse(node_in, node_out, nodes, dummy_mode)
    -- i have no idea if this is any algorithm in particular.
    -- pass 1: walk upward from node_out and mark every ancestor.
    local seen_up = {}
    local q = {node_out}
    while #q > 0 do
        local node = remove(q, 1)
        -- expand each node once only; re-expanding already-marked nodes
        -- repeats work on every shared ancestor and never ends on a cycle.
        if not seen_up[node] then
            seen_up[node] = true
            for _, parent in ipairs(node.parents) do insert(q, parent) end
        end
    end
    if dummy_mode then seen_up[node_in] = true end
    -- pass 2: walk downward from node_in, keeping only marked nodes and
    -- adding a node once all of its parents have been added.
    nodes = {}
    q = {node_in}
    while #q > 0 do
        local node = remove(q, 1)
        if seen_up[node] then
            local all_parents_added = true
            for _, parent in ipairs(node.parents) do
                if not util.contains(nodes, parent) then
                    all_parents_added = false
                    break
                end
            end
            if not util.contains(nodes, node) and all_parents_added then
                insert(nodes, node)
            end
            -- children are always re-queued so that a node skipped earlier
            -- is retried after its remaining parents have been added.
            for _, child in ipairs(node.children) do insert(q, child) end
        end
    end
    if dummy_mode then remove(nodes, util.indexof(nodes, node_in)) end
    return nodes
end
-- Multi-input, multi-output wrapper around traverse: builds one
-- placeholder node whose children are nodes_in and another whose parents
-- are nodes_out, then traverses between them in dummy mode.
local function traverse_all(nodes_in, nodes_out, nodes)
    local source = {children = {}, parents = {}}
    local sink = {children = {}, parents = {}}
    for _, node in ipairs(nodes_in) do
        source.children[#source.children + 1] = node
    end
    for _, node in ipairs(nodes_out) do
        sink.parents[#sink.parents + 1] = node
    end
    return traverse(source, sink, nodes or {}, true)
end
-- classes
-- Class tables, all created through extend().
-- Weights, Layer and Model extend Base directly.
local Weights = Base:extend()
local Layer = Base:extend()
local Model = Base:extend()
-- Every class below extends Layer; their methods are defined further down.
local Input = Layer:extend()
local Merge = Layer:extend()
local Reshape = Layer:extend()
local Relu = Layer:extend()
local Gelu = Layer:extend()
local Cos = Layer:extend()
local Tanh = Layer:extend()
local Dense = Layer:extend()
local DenseBroadcast = Layer:extend()
local Softmax = Layer:extend()
local Embed = Layer:extend()
local LayerNorm = Layer:extend()
-- Stores the initializer function used later by allocate.
function Weights:init(weight_init)
    self.weight_init = weight_init
end
-- Sizes this object from self.shape, zero-fills its array part and then
-- runs the stored initializer over it with the given fans.
-- Returns whatever the initializer returns.
function Weights:allocate(fan_in, fan_out)
    local total = prod(self.shape)
    self.size = total
    local function initialize(t)
        --print('initializing weights of size', self.size, 'with fans', fan_in, fan_out)
        return self.weight_init(t, fan_in, fan_out)
    end
    return allocate(total, self, initialize)
end
-- Per-name instance counters, used to give each layer a unique name.
local counter = {}
-- Sets up the fields shared by all layers.
-- name: base name (string); the layer's name becomes "name[k]" where k
--       counts how many layers were created with that base name.
function Layer:init(name)
    assert(type(name) == "string")
    local nth = (counter[name] or 0) + 1
    counter[name] = nth
    self.name = ("%s[%s]"):format(name, tostring(nth))
    self.parents, self.children, self.weights = {}, {}, {}
    -- shape_in and shape_out are deliberately left unset here:
    --self.shape_in = nil
    --self.shape_out = nil
end
-- Default shape inference: an unset shape_in is taken from the parent's
-- shape_out, and an unset shape_out is taken from shape_in.
function Layer:make_shape(parent)
    local shape_in = self.shape_in
    if shape_in == nil then
        shape_in = parent.shape_out
        self.shape_in = shape_in
    end
    if self.shape_out == nil then
        self.shape_out = shape_in
    end
end
-- Connects this layer's output to `child`: lets the child derive its
-- shapes from this layer, then records the parent/child link both ways.
-- Returns child so that calls can be chained.
function Layer:feed(child)
    assert(self.shape_out ~= nil, "missing output shape: "..self.name)
    child:make_shape(self)
    self.children[#self.children + 1] = child
    child.parents[#child.parents + 1] = self
    return child
end
-- Abstract: layers that compute something override this.
function Layer:forward()
    error("Unimplemented.")
end

-- Deterministic variant of forward; by default it simply forwards all
-- arguments to forward and returns its results.
function Layer:forward_deterministic(...)
    return self:forward(...)
end
-- Creates a Weights object with the given initializer, registers it in
-- self.weights and returns it.
function Layer:_new_weights(init)
    local created = Weights(init)
    self.weights[#self.weights + 1] = created
    return created
end
-- Returns the total number of parameters of this layer: the sum of
-- prod(shape) over all registered weights.
function Layer:get_size()
    local total = 0
    for _, w in ipairs(self.weights) do
        total = total + prod(w.shape)
    end
    return total
end
-- (Re)initializes every registered weight array, using the products of
-- shape_in and shape_out as fan-in and fan-out, then drops the cached
-- output buffer by resetting the cache with no batch size.
function Layer:init_weights()
    for _, weight in ipairs(self.weights) do
        --print("allocating weights", i, "of", self.name)
        -- FIXME: HACK -- wipe the previous array contents first.
        for slot in ipairs(weight) do weight[slot] = nil end
        weight:allocate(prod(self.shape_in), prod(self.shape_out))
    end
    self:reset_cache()
end
-- Remembers the batch size and replaces self.cache with the result of
-- cache(bs, self.shape_out), which is nil when bs is nil.
function Layer:reset_cache(bs)
    self.bs = bs
    local buffer = cache(bs, self.shape_out)
    self.cache = buffer
end
-- Runs this layer on its incoming values. The default implementation
-- requires exactly one incoming edge.
-- override this if you need multiple parents.
-- edges:         array of input values, one per parent.
-- deterministic: when truthy, forward_deterministic is used instead of
--                forward.
function Layer:_propagate(edges, deterministic)
    assert(#edges == 1, ("%s edges for node %s (expected 1)"):format(#edges, self.name))
    local X = edges[1]
    if not deterministic then
        return self:forward(X)
    end
    return self:forward_deterministic(X)
end
-- Gathers the already-computed outputs of this layer's parents from
-- `values` (a table keyed by node), in parent order, and runs the layer on
-- them. Asserts that at least one parent has a value.
-- Returns the first result of _propagate.
function Layer:propagate(values, deterministic)
    local edges = {}
    for _, parent in ipairs(self.parents) do
        local X = values[parent]
        if X ~= nil then
            edges[#edges + 1] = X
        end
    end
    assert(#edges > 0, ("%s edges for node %s (expected >0)"):format(#edges, self.name))
    -- parentheses keep this to a single return value.
    return (self:_propagate(edges, deterministic))
end
-- Input layer: its input and output shapes are both the given per-sample
-- shape table.
function Input:init(shape)
    Layer.init(self, "Input")
    assert(type(shape) == 'table')
    self.shape_in, self.shape_out = shape, shape
end

-- Validates X against the declared shape and passes it through unchanged.
function Input:forward(X)
    local expected = self.shape_in
    checkshape(X, expected)
    return X
end
-- Merge layer: concatenates the flattened outputs of several parents.
-- Note that for this class shape_in is a number (the parent count), not a
-- shape table.
function Merge:init()
    Layer.init(self, "Merge")
    self.size = 0
    self.shape_in = 0
end

-- Called once per parent fed into this layer: grows the output size by
-- the parent's flattened size and bumps the expected parent count.
function Merge:make_shape(parent)
    self.size = self.size + prod(parent.shape_out)
    self.shape_in = self.shape_in + 1 -- TODO: more robust.
    self.shape_out = {self.size}
end

-- Concatenates the edges sample by sample, so that row b of the output is
-- row b of every edge laid end to end.
-- edges: one batch per parent, each with the batch size as shape[1].
--        All edges are assumed to share the batch size of the first.
function Merge:_propagate(edges, deterministic)
    assert(#edges == self.shape_in)
    local bs = edges[1].shape[1]
    if bs ~= self.bs then self:reset_cache(bs) end
    local Y = self.cache
    local yi = 1
    -- Copying each edge wholesale would interleave samples incorrectly as
    -- soon as bs > 1; iterate over samples first, edges second.
    for b = 1, bs do
        for _, X in ipairs(edges) do
            local row = floor(#X / bs) -- flattened per-sample length
            local base = (b - 1) * row
            for j = 1, row do
                Y[yi] = X[base + j]
                yi = yi + 1
            end
        end
    end
    checkshape(Y, self.shape_out)
    return Y
end
-- Reshape layer: reinterprets each sample with the given output shape.
function Reshape:init(shape)
    Layer.init(self, "Reshape")
    self.size = 0
    self.shape_out = shape
end

-- Takes the input shape from the parent and asserts that it holds exactly
-- as many elements as the requested output shape.
function Reshape:make_shape(parent)
    local incoming = parent.shape_out
    self.shape_in = incoming
    -- TODO: allow a single dummy dimension like numpy.
    local same_size = prod(self.shape_in) == prod(self.shape_out)
    assert(same_size, "input shape does not fit into given shape.")
end

-- Copies X element by element into the cached buffer, which carries the
-- output shape, and returns that buffer.
function Reshape:forward(X)
    local bs = checkshape(X, self.shape_in)
    if bs ~= self.bs then
        self:reset_cache(bs)
    end
    local Y = self.cache
    for idx, value in ipairs(X) do
        Y[idx] = value
    end
    return Y
end
-- Relu layer: element-wise max-with-zero activation.
function Relu:init()
    Layer.init(self, "Relu")
end

-- Writes x for x >= 0 and 0 otherwise into the cached buffer.
function Relu:forward(X)
    local bs = checkshape(X, self.shape_in)
    if bs ~= self.bs then
        self:reset_cache(bs)
    end
    local Y = self.cache
    for idx = 1, #X do
        local x = X[idx]
        if x >= 0 then
            Y[idx] = x
        else
            Y[idx] = 0
        end
    end
    checkshape(Y, self.shape_out)
    return Y
end
-- Gelu layer: element-wise sigmoid-based GELU approximation.
function Gelu:init()
    Layer.init(self, "Gelu")
end

-- Writes x / (1 + exp(-1.704 * x)) for every element into the cached
-- buffer.
function Gelu:forward(X)
    local bs = checkshape(X, self.shape_in)
    if bs ~= self.bs then
        self:reset_cache(bs)
    end
    local Y = self.cache
    -- NOTE: approximate form of GELU exploiting similarities to sigmoid curve.
    for idx = 1, #X do
        local x = X[idx]
        local gate = 1 + exp(-1.704 * x)
        Y[idx] = x / gate
    end
    checkshape(Y, self.shape_out)
    return Y
end
-- Cos layer: element-wise cosine activation.
function Cos:init()
    Layer.init(self, "Cos")
end

-- Writes cos(x) for every element into the cached buffer.
function Cos:forward(X)
    local bs = checkshape(X, self.shape_in)
    if bs ~= self.bs then
        self:reset_cache(bs)
    end
    local Y = self.cache
    local count = #X
    local idx = 1
    while idx <= count do
        Y[idx] = cos(X[idx])
        idx = idx + 1
    end
    checkshape(Y, self.shape_out)
    return Y
end
-- math.tanh does not exist in newer Lua releases, in which case the
-- file-level `tanh` local is nil. Fall back to an exp-based version so the
-- layer keeps working there; the native function is used when available.
local tanh_compat = tanh or function(x)
    -- tanh rounds to +/-1 in double precision well before |x| = 20, and
    -- cutting off here also avoids inf/inf once exp(2x) overflows.
    if x > 20 then return 1 end
    if x < -20 then return -1 end
    local e = exp(2 * x)
    return (e - 1) / (e + 1)
end

-- Tanh layer: element-wise hyperbolic tangent activation.
function Tanh:init()
    Layer.init(self, "Tanh")
end

-- Writes tanh(x) for every element into the cached buffer.
function Tanh:forward(X)
    local bs = checkshape(X, self.shape_in)
    if bs ~= self.bs then self:reset_cache(bs) end
    local Y = self.cache
    for i = 1, #X do Y[i] = tanh_compat(X[i]) end
    checkshape(Y, self.shape_out)
    return Y
end
-- Dense (fully connected) layer.
-- dim:     number of output units.
-- norm_in: when truthy, coefficients are drawn with init_normal and the
--          output is scaled by 1 / sqrt(prod(shape_in)); otherwise
--          init_he_normal is used and the scale stays 1.
-- biasing: whether to add a bias vector (default true when omitted).
function Dense:init(dim, norm_in, biasing)
    Layer.init(self, "Dense")
    assert(type(dim) == "number")
    self.dim = dim
    self.shape_out = {dim}
    self.norm_in = norm_in and true or false
    self.biasing = biasing == nil or biasing
    if self.norm_in then
        self.coeffs = self:_new_weights(init_normal)
    else
        self.coeffs = self:_new_weights(init_he_normal)
    end
    if self.biasing then
        self.biases = self:_new_weights(init_zeros)
    end
    self.c = 1.0
end

-- Derives the weight shapes from the parent's output shape.
function Dense:make_shape(parent)
    self.shape_in = parent.shape_out
    self.coeffs.shape = {self.shape_in[#self.shape_in], self.dim}
    if self.biasing then
        self.biases.shape = {1, self.dim}
    end
    if self.norm_in then
        self.c = 1 / sqrt(prod(self.shape_in))
    end
end

-- Computes Y = c * (X . coeffs) + biases for a batch X and returns the
-- cached output buffer.
function Dense:forward(X)
    local bs = checkshape(X, self.shape_in)
    if self.bs ~= bs then self:reset_cache(bs) end
    local Y = self.cache
    dot(X, self.coeffs, 2, 1, Y)
    if self.biasing then
        local dim = self.dim
        -- Y holds bs rows of dim values but there are only dim biases, so
        -- wrap the flat index into the bias vector. Indexing biases with
        -- the flat index directly reads nil past the first row.
        for i, v in ipairs(Y) do
            Y[i] = self.c * v + self.biases[(i - 1) % dim + 1]
        end
    elseif self.norm_in then
        for i, v in ipairs(Y) do Y[i] = self.c * v end
    end
    checkshape(Y, self.shape_out)
    return Y
end
-- DenseBroadcast layer.
-- dim:     number of output units per row.
-- norm_in: when truthy, coefficients are drawn with init_normal and the
--          output is scaled by 1 / sqrt(prod(shape_in)); otherwise
--          init_he_normal is used and the scale stays 1.
-- biasing: whether to add a bias vector (default true when omitted).
function DenseBroadcast:init(dim, norm_in, biasing)
    -- same as Dense but applies the same to every m of (m, n).
    Layer.init(self, "DenseBroadcast")
    assert(type(dim) == "number")
    self.dim = dim
    self.norm_in = norm_in and true or false
    -- Record the flag before it is consulted below and in make_shape and
    -- forward; without this assignment the argument is ignored and no
    -- bias weights are ever created.
    -- NOTE(review): with biases enabled the layer gains `dim` parameters,
    -- so weight files saved without them no longer line up; pass
    -- biasing = false to keep a bias-free layer.
    self.biasing = biasing == nil or biasing
    if self.norm_in then
        self.coeffs = self:_new_weights(init_normal)
    else
        self.coeffs = self:_new_weights(init_he_normal)
    end
    if self.biasing then
        self.biases = self:_new_weights(init_zeros)
    end
    self.c = 1.0
end

-- Derives the output and weight shapes from the parent's two-dimensional
-- output shape (m, n): the output is (m, dim).
function DenseBroadcast:make_shape(parent)
    self.shape_in = parent.shape_out
    assert(#self.shape_in == 2)
    self.shape_out = {self.shape_in[1], self.dim}
    self.coeffs.shape = {self.shape_in[#self.shape_in], self.dim}
    if self.biasing then
        self.biases.shape = {1, self.dim}
    end
    if self.norm_in then
        self.c = 1 / sqrt(prod(self.shape_in))
    end
end

-- Contracts the last axis of the batch X with the coefficients, then
-- applies the scale and (optionally) the bias shared by every row.
function DenseBroadcast:forward(X)
    local bs = checkshape(X, self.shape_in)
    if self.bs ~= bs then self:reset_cache(bs) end
    local Y = self.cache
    dot(X, self.coeffs, 3, 1, Y)
    if self.biasing then
        -- wrap the flat index so each row reuses the same dim biases.
        for i, v in ipairs(Y) do Y[i] = self.c * v + self.biases[(i - 1) % self.dim + 1] end
    elseif self.norm_in then
        for i, v in ipairs(Y) do Y[i] = self.c * v end
    end
    checkshape(Y, self.shape_out)
    return Y
end
-- Softmax layer: normalizes each row of a (batch, length) input into a
-- probability distribution.
function Softmax:init()
    Layer.init(self, "Softmax")
end

-- Computes exp(x - max) / sum(exp(x - max)) independently for every row
-- and returns the cached output buffer.
function Softmax:forward(X)
    local bs = checkshape(X, self.shape_in)
    if self.bs ~= bs then self:reset_cache(bs) end
    local Y = self.cache
    local num = {} -- TODO: cache
    local l = X.shape[2]
    for b = 1, X.shape[1] do
        local j = (b - 1) * l
        -- the running maximum and the denominator belong to this row
        -- only; carrying them over would make later rows sum to less
        -- than 1.
        local alpha = -huge
        local den = 0
        for i = j+1, j+l do alpha = max(alpha, X[i]) end
        for i = j+1, j+l do num[i] = exp(X[i] - alpha) end
        for i = j+1, j+l do den = den + num[i] end
        for i = j+1, j+l do Y[i] = num[i] / den end
    end
    checkshape(Y, self.shape_out)
    return Y
end
-- Embed layer: replaces every input value with a row of a lookup table.
-- vocab: number of rows in the table.
-- dim:   length of each row.
function Embed:init(vocab, dim)
    Layer.init(self, "Embed")
    assert(type(vocab) == "number")
    assert(type(dim) == "number")
    self.vocab = vocab
    self.dim = dim
    local table_weights = self:_new_weights(init_normal)
    table_weights.shape = {self.vocab, self.dim}
    self.lut = table_weights
end

-- The output holds dim values for each of the parent's shape_out[1]
-- entries.
function Embed:make_shape(parent)
    local incoming = parent.shape_out
    self.shape_in = incoming
    self.shape_out = {incoming[1] * self.dim}
end

-- For every input value x, copies lut[x * dim + 1 .. x * dim + dim] into
-- the next dim slots of the cached output buffer (so x is a 0-based row
-- number).
function Embed:forward(X)
    local bs = checkshape(X, self.shape_in)
    if self.bs ~= bs then
        self:reset_cache(bs)
    end
    local Y = self.cache
    local out_base = 0
    for _, token in ipairs(X) do
        local row_start = token * self.dim
        for j = 1, self.dim do
            Y[out_base + j] = self.lut[row_start + j]
        end
        out_base = out_base + self.dim
    end
    checkshape(Y, self.shape_out)
    return Y
end
-- LayerNorm layer: shifts and scales each sample to zero mean and unit
-- variance (no learned parameters).
-- eps: small constant added to the variance before the square root
--      (default 1e-5).
function LayerNorm:init(eps)
    Layer.init(self, "LayerNorm")
    if eps == nil then eps = 1e-5 end
    assert(type(eps) == "number")
    self.eps = eps
end

-- Normalizes every sample of the batch independently and returns the
-- cached output buffer.
function LayerNorm:forward(X)
    local bs = checkshape(X, self.shape_in)
    if self.bs ~= bs then self:reset_cache(bs) end
    local Y = self.cache
    -- flattened per-sample length. Statistics must not be pooled over the
    -- whole batch, or samples would influence each other when bs > 1.
    local n = floor(#X / bs)
    for b = 1, bs do
        local first = (b - 1) * n + 1
        local last = first + n - 1
        local mean = 0
        for i = first, last do
            mean = mean + X[i] / n
        end
        local var = 0
        for i = first, last do
            local delta = X[i] - mean
            Y[i] = delta -- stash the centered value; scaled below.
            var = var + delta * delta / n
        end
        local std = sqrt(var + self.eps)
        for i = first, last do
            Y[i] = Y[i] / std
        end
    end
    return Y
end
-- Builds a model from non-empty arrays of input and output nodes and
-- records the ordered list of nodes found between them.
function Model:init(nodes_in, nodes_out)
    local n_in, n_out = #nodes_in, #nodes_out
    assert(n_in > 0, n_in)
    assert(n_out > 0, n_out)
    -- (disabled) wrapping of a single node into an array:
    --if #nodes_in == 0 and type(nodes_in) == "table" then nodes_in = {nodes_in} end
    --if #nodes_out == 0 and type(nodes_out) == "table" then nodes_out = {nodes_out} end
    self.nodes_in = nodes_in
    self.nodes_out = nodes_out
    -- find all the used (inbetween) nodes in the graph.
    self.nodes = traverse_all(nodes_in, nodes_out)
end
-- (Re)initializes the weights of every node, printing each node's name
-- and parameter count, and stores the total count in self.n_param.
function Model:reset()
    self.n_param = 0
    for _, layer in ipairs(self.nodes) do
        print(layer.name, layer:get_size())
        layer:init_weights()
        local added = layer:get_size()
        self.n_param = self.n_param + added
    end
end
-- Evaluates the graph in node order.
-- inputs: table mapping each input node to its batch (which must carry a
--         `shape` field).
-- Returns a table mapping each output node to its computed value.
function Model:forward(inputs)
    local values, outputs = {}, {}
    for _, node in ipairs(self.nodes) do
        --print(i, node.name)
        local result
        if util.contains(self.nodes_in, node) then
            -- input nodes are fed directly from the caller's table.
            local X = inputs[node]
            assert(X ~= nil, ("missing input for node %s"):format(node.name))
            assert(X.shape, ("missing shape for node %s"):format(node.name))
            result = node:_propagate({X})
        else
            -- every other node pulls from its parents' stored results.
            result = node:propagate(values)
        end
        values[node] = result
        if util.contains(self.nodes_out, node) then
            outputs[node] = result
        end
    end
    return outputs
end
-- Prints the graph's edges in Graphviz DOT syntax, one
-- "parent->child;" line per connection.
function Model:print()
    print("digraph G {")
    for _, parent in ipairs(self.nodes) do
        -- no emptiness test needed: iterating an empty children array does
        -- nothing (and `if #children then` is always true in Lua anyway,
        -- since 0 is truthy).
        for _, child in ipairs(parent.children) do
            print('\t'..parent.name..'->'..child.name..';')
        end
    end
    print('}')
end
-- Returns a new flat array holding every weight value of every node, in
-- node order and, within a node, in weight registration order.
function Model:collect()
    -- return a flat array of all the weights in the graph.
    -- if Lua had slices, we wouldn't need this. future library idea?
    assert(self.n_param >= 0, self.n_param)
    local flat = zeros(self.n_param)
    local offset = 0
    for _, node in ipairs(self.nodes) do
        for _, weight in ipairs(node.weights) do
            for j, value in ipairs(weight) do
                flat[offset + j] = value
            end
            offset = offset + #weight
        end
    end
    return flat
end
-- Copies values from the flat array W back into the nodes' weight arrays,
-- walking them in the same order as collect().
function Model:distribute(W)
    -- inverse operation of collect().
    assert(W ~= nil)
    local offset = 0
    for _, node in ipairs(self.nodes) do
        for _, weight in ipairs(node.weights) do
            for j in ipairs(weight) do
                weight[j] = W[offset + j]
            end
            offset = offset + #weight
        end
    end
end
-- Returns the default weight file name, which embeds the parameter count
-- zero-padded to seven digits.
function Model:default_filename()
    local count = self.n_param
    return string.format('network%07i.txt', count)
end
-- Writes every weight value to a text file, one number per line.
-- fn: target path (default: self:default_filename()).
-- Raises an error when the file cannot be opened for writing.
function Model:save(fn)
    fn = fn or self:default_filename()
    -- gather the weights before touching the file: opening with 'w'
    -- truncates it, so a failure in collect() must not be able to wipe an
    -- existing save or leave an open handle behind.
    local W = self:collect()
    local f = open(fn, 'w')
    if f == nil then error("Failed to save network to file "..fn) end
    for i = 1, #W do
        f:write(W[i], '\n')
    end
    f:close()
end
-- Reads weight values from a text file (one number per line, as written
-- by save) and distributes them into the model.
-- fn: source path (default: self:default_filename()).
-- Values missing from a short file stay at zero.
-- Raises an error when the file cannot be opened or a line is not a
-- number.
function Model:load(fn)
    fn = fn or self:default_filename()
    -- allocate first so that a failure here cannot leak an open handle.
    local W = zeros(self.n_param)
    local f = open(fn, 'r')
    if f == nil then
        error("Failed to load network from file "..fn)
    end
    local i = 0
    for line in f:lines() do
        i = i + 1
        local n = tonumber(line)
        if n == nil then
            -- release the handle before bailing out.
            f:close()
            error("Failed reading line "..tostring(i).." of file "..fn)
        end
        W[i] = n
    end
    f:close()
    self:distribute(W)
end
-- Public interface of the module.
return {
    -- math helpers
    prod = prod,
    uniform = uniform,
    normal = normal,
    zeros = zeros,
    arange = arange,
    allocate = allocate,
    -- weight initializers (init_uniform and init_normal are exported
    -- alongside the others so every initializer defined here is reachable)
    init_zeros = init_zeros,
    init_uniform = init_uniform,
    init_normal = init_normal,
    init_he_uniform = init_he_uniform,
    init_he_normal = init_he_normal,
    -- array helpers
    reshape = reshape,
    pp = pp,
    ppi = ppi,
    dot_mv = dot_mv,
    dot = dot,
    -- graph helpers
    traverse = traverse,
    traverse_all = traverse_all,
    -- classes
    Weights = Weights,
    Layer = Layer,
    Model = Model,
    Input = Input,
    Merge = Merge,
    Reshape = Reshape,
    Relu = Relu,
    Gelu = Gelu,
    Cos = Cos,
    Tanh = Tanh,
    Dense = Dense,
    DenseBroadcast = DenseBroadcast,
    Softmax = Softmax,
    Embed = Embed,
    LayerNorm = LayerNorm,
}