I wrote the code below line by line, by hand. Although I have two coding agents in my VS Code, I only used them for advice; they did help and teach me a lot, and they got me hands-on quickly. Gemini helped add some annotations before posting.
In the test code below, ExpoLU (with a=1, b=1, p=2) converged to a loss of about 1e-9 in each of roughly 10 test runs, while ReLU failed to converge at all in 4 or 5 runs. You can run it and see for yourself.
Below is the test code for ExpoLU and ReLU; the training data of 21 (x, y) pairs was generated by hand. Grok helped me find a bug in the training-data normalization, though ExpoLU can still work well even with the wrong normalization, haha.
By the way, I removed the norma0 call (the initial normalization of the input data) from the training loop, since it didn't help.
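For reference, ExpoLU as defined in the code below (with parameters a, b, p) is the piecewise function: ExpoLU(x) = 0 for x < -b; (1/a)(x+b)^p for -b <= x < b; and x + (1/a)(2b)^p - b for x >= b. It is continuous at both breakpoints: the polynomial piece meets 0 at x = -b and meets the shifted linear piece at x = b, where both sides equal (1/a)(2b)^p.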
import torch
import torch.nn as nn

# Device configuration
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

# Hyperparameters
powerlist = [1]   # powers used by each layer: the linear base (1) plus any polynomial expansions (>1)
acttype = 1       # activation type: 1 is ExpoLU, 9 is ReLU
n = 512           # feature number (hidden width)
layers = 30       # number of ffn layers, at least 2
lr = 0.001 / n    # learning rate
rounds = 400      # training rounds
# Dataset: 21 (x, y) pairs, made by hand. Note the target is even-symmetric: y(-x) = y(x).
tdata1 = torch.tensor(
    [
        [-10, 6], [-9, 3], [-8, 1], [-7, -2], [-6, -5], [-5, -2], [-4, 1], [-3, 3], [-2, 5], [-1, 2], [0, 0],
        [10, 6], [9, 3], [8, 1], [7, -2], [6, -5], [5, -2], [4, 1], [3, 3], [2, 5], [1, 2],
    ],
    dtype=torch.float32,
    device=device,
)
# Global statistics for dataset-level normalization (used by norma0 below)
xmean = tdata1[:, :-1].mean()
xstd = tdata1[:, :-1].std()
ymean = tdata1[:, -1:].mean()
ystd = tdata1[:, -1:].std()
class activation:
    def __init__(self, act_type):
        def ExpoLU(x, a=1, b=1, p=2):  # acttype=1
            # 0 below -b, a polynomial ramp on [-b, b), then linear with slope 1
            return torch.where(x < -b, torch.tensor(0.0, device=x.device),
                   torch.where(x < b, (1/a)*(x+b)**p, x + (1/a)*(2*b)**p - b))

        def PowerLU(x, xp=4):  # acttype=2
            return torch.where(x < -1, torch.tensor(0.0, device=x.device),
                   torch.where(x < 1, (x+1)**xp/(2**xp), x))

        def ParaLU(x, scope=0.1):  # acttype=3
            return torch.where(x < -scope, torch.tensor(0.0, device=x.device),
                   torch.where(x < scope, (x+scope)**2/(4*scope), x))

        def QuartLU(x, scope=0.1):  # acttype=4
            return torch.where(x < -scope, torch.tensor(0.0, device=x.device),
                   torch.where(x < (scope/3), (x+scope)**4*27/(256*scope**3), x))

        funcs = {
            0: lambda x: x,
            1: ExpoLU,
            2: PowerLU,
            3: ParaLU,
            4: QuartLU,
            8: torch.sigmoid,
            9: torch.relu,
        }
        self.act = funcs[act_type]

# Initialize the global activation reference
act = activation(acttype).act
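# Optional sanity check (safe to delete): each piecewise activation above should
# be continuous at its breakpoints. For ExpoLU with the defaults a=1, b=1, p=2,
# both one-sided limits at x = 1 equal (1/a)*(2*b)**p = 4.
for edge in (-1.0, 1.0):
    eps = 1e-4
    left = act(torch.tensor(edge - eps, device=device))
    right = act(torch.tensor(edge + eps, device=device))
    assert torch.allclose(left, right, atol=1e-2), f"activation jumps at x={edge}"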
class ffn(nn.Module):
    def __init__(self, power, layer, tdata):
        super().__init__()
        self.powerlist = power
        self.layernum = layer
        self.tdata = tdata
        self.act = activation(acttype).act
        maxpower = max(self.powerlist)
        # Input projection: one (features x n) matrix per power term
        self.W0 = torch.randn(maxpower, len(tdata[0])-1, n, dtype=torch.float32, device=tdata.device)  # tdata.shape[1]-1 input features
        self.W0.requires_grad = True
        self.B0 = torch.randn(1, n, dtype=torch.float32, device=tdata.device)
        self.B0.requires_grad = True
        # Hidden stack: (layernum-1) layers of (n x n) matrices per power term
        self.Wh = torch.randn(maxpower, self.layernum-1, n, n, dtype=torch.float32, device=tdata.device)
        self.Wh.requires_grad = True
        self.Bh = torch.randn(self.layernum-1, 1, n, dtype=torch.float32, device=tdata.device)
        self.Bh.requires_grad = True
        # Output projection down to one scalar
        self.Wo = torch.randn(maxpower, n, 1, dtype=torch.float32, device=tdata.device)
        self.Wo.requires_grad = True
        self.Bo = torch.randn(1, 1, dtype=torch.float32, device=tdata.device)
        self.Bo.requires_grad = True
    # Internal layer normalization: resets the signal to mean 0, std 1
    def norma(self, x):
        x = (x - x.mean()) / x.std()
        return x

    # Global dataset normalization (its calls are commented out in training_1; it didn't help)
    def norma0(self, x, typeid):
        if typeid == "x":
            x = (x - xmean) / xstd
        if typeid == "y":
            x = (x - ymean) / ystd
        return x
    # Input layer: power transformation + double normalization
    def ffn0(self, x):
        firstpower = 0
        for idx, power in enumerate(self.powerlist):
            if firstpower == 0:
                xh = torch.sign(x)*abs(x**power) @ self.W0[idx] + self.B0
                firstpower = power
            else:
                xh = xh + torch.sign(x)*abs(x**power) @ self.W0[idx]
        xh = self.norma(xh)
        xh = self.act(xh)
        xh = self.norma(xh)
        return xh

    # Hidden layers: power transformation + double normalization
    def ffnh(self, xh):
        for laynum in range(self.layernum-1):
            firstpower = 0
            for idx, power in enumerate(self.powerlist):
                if firstpower == 0:
                    x = torch.sign(xh)*abs(xh**power) @ self.Wh[idx, laynum] + self.Bh[laynum]
                    firstpower = power
                else:
                    # accumulate the remaining power terms into x
                    x = x + torch.sign(xh)*abs(xh**power) @ self.Wh[idx, laynum]
            xh = self.norma(x)
            xh = self.act(xh)
            xh = self.norma(xh)
        return xh

    # Output layer: final projection
    def ffno(self, x):
        firstpower = 0
        for idx, power in enumerate(self.powerlist):
            if firstpower == 0:
                xh = torch.sign(x)*abs(x**power) @ self.Wo[idx] + self.Bo
                firstpower = power
            else:
                xh = xh + torch.sign(x)*abs(x**power) @ self.Wo[idx]
        return xh

    def forward(self, xx):
        xh = self.ffn0(xx)   # (features,) -> (1, n)
        xh = self.ffnh(xh)   # (1, n) -> (1, n)
        ho = self.ffno(xh)   # (1, n) -> (1, 1)
        return ho
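# Small aside: the signed power transform used above, sign(x)*abs(x**p), keeps
# the sign of x while raising its magnitude to the power p, so even powers stay
# odd-symmetric (a plain x**2 would erase the sign). With powerlist=[1] it is
# simply x itself.
_demo = torch.tensor([-2.0, -0.5, 0.5, 2.0])
print(torch.sign(_demo)*abs(_demo**2))  # tensor([-4.0000, -0.2500, 0.2500, 4.0000])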
def training_1(inputdata):
    doffn = ffn(powerlist, layers, inputdata)
    for i in range(rounds):
        printloss = 0.0
        for item in inputdata:
            x = item[:-1]
            y = item[-1:]
            # Dataset-level normalization (disabled; it didn't help)
            # x = doffn.norma0(x, "x")
            # y = doffn.norma0(y, "y")
            # Forward pass & MSE loss
            ho = doffn.forward(x)
            loss = ((ho - y)**2).mean()
            printloss += loss.item()  # .item() detaches the value for logging
            loss.backward()
            # Manual gradient descent (no optimizer), one update per sample
            with torch.no_grad():
                for weight in [doffn.W0, doffn.B0, doffn.Wh, doffn.Bh, doffn.Wo, doffn.Bo]:
                    if weight.grad is not None:
                        weight -= lr * weight.grad
                        weight.grad.zero_()
        print(f"round {i}, loss {printloss/len(inputdata):.12f}")
    return doffn  # return the trained model so it can be inspected afterwards

# Run training
model = training_1(tdata1)
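To reproduce the ReLU comparison, set acttype=9 at the top and rerun. And here is a quick way to eyeball the fit after training (a minimal sketch; model is the trained network returned by training_1):

with torch.no_grad():
    for item in tdata1:
        pred = model.forward(item[:-1])
        print(f"x={item[0].item():+.0f}  y={item[-1].item():+.0f}  pred={pred.item():+.4f}")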