Last updated on May 12, 2026
I wrote the code below manually, line by line. Although I have two coding agents in VS Code, I used them only for advice; they helped and taught me a lot and got me hands-on quickly. Gemini also helped add some annotations for me before posting.
Below is the test code for ExpoLU, PowerLU, ParaLU, QuartLU and ReLU; the training data of 21 (x, y) pairs was generated by me manually. Grok helped me find a bug in the training-data normalization, but ExpoLU works well even with the wrong normalization, haha.
Initially I trained row by row, as in the code below, where ExpoLU works well with around 512 features and 30 layers but ReLU can't work; later I changed to batch training with 1024 features and 100 layers, where ReLU IS KING and my LUs can't work.
import torch
import torch.nn as nn
# --- Device -----------------------------------------------------------------
# Use the GPU when CUDA is available, otherwise stay on the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
print(f"Using device: {device}")

# --- Hyperparameters --------------------------------------------------------
# Powers applied to each layer input: 1 is the linear base, values above 1
# add nonlinear polynomial expansion terms.
powerlist = [1]
# Activation id looked up by the activation selector (1 is ExpoLU, 9 is ReLU).
acttype = 1
# Number of features (width) of every hidden layer.
n = 512
# Number of feed-forward layers, at least 2.
layers = 30
# Learning rate, scaled down by the layer width.
lr = 0.001 / n
# Number of passes over the training data.
rounds = 400

# --- Dataset: 21 (x, y) pairs, one pair per row -----------------------------
tdata1 = torch.tensor(
    [
        [-10, 6], [-9, 3], [-8, 1], [-7, -2], [-6, -5], [-5, -2],
        [-4, 1], [-3, 3], [-2, 5], [-1, 2], [0, 0],
        [10, 6], [9, 3], [8, 1], [7, -2], [6, -5], [5, -2],
        [4, 1], [3, 3], [2, 5], [1, 2],
    ],
    dtype=torch.float32,
    device=device,
)

# --- Global statistics used for dataset normalization -----------------------
# Every column except the last holds inputs; the last column holds targets.
xmean = torch.mean(tdata1[:, :-1])
xstd = torch.std(tdata1[:, :-1])
ymean = torch.mean(tdata1[:, -1:])
ystd = torch.std(tdata1[:, -1:])
class activation:
    """Pick an activation function by integer id and expose it as ``self.act``.

    Ids: 0 identity, 1 ExpoLU, 2 PowerLU, 3 ParaLU, 4 QuartLU,
    8 ``torch.sigmoid``, 9 ``torch.relu``. Any other id raises ``KeyError``.
    """

    def __init__(self, type):
        # Every custom unit has the same three-piece shape: zero below `lo`,
        # a polynomial ramp on [lo, hi), and a linear tail from `hi` upward.
        def _three_piece(x, lo, hi, ramp, tail):
            zero = torch.tensor(0.0, device=x.device)
            upper = torch.where(x < hi, ramp, tail)
            return torch.where(x < lo, zero, upper)

        def ExpoLU(x, a=1, b=1, p=2):  # acttype=1
            ramp = (1 / a) * (x + b) ** p
            # The tail offset makes the line meet the ramp at x == b.
            tail = x + (1 / a) * (2 * b) ** p - b
            return _three_piece(x, -b, b, ramp, tail)

        def PowerLU(x, xp=4):  # acttype=2
            ramp = (x + 1) ** xp / (2 ** xp)
            return _three_piece(x, -1, 1, ramp, x)

        def ParaLU(x, scope=0.1):  # acttype=3
            ramp = (x + scope) ** 2 / (4 * scope)
            return _three_piece(x, -scope, scope, ramp, x)

        def QuartLU(x, scope=0.1):  # acttype=4
            ramp = (x + scope) ** 4 * 27 / (256 * scope ** 3)
            return _three_piece(x, -scope, scope / 3, ramp, x)

        table = {
            0: lambda x: x,
            1: ExpoLU,
            2: PowerLU,
            3: ParaLU,
            4: QuartLU,
            8: torch.sigmoid,
            9: torch.relu,
        }
        self.act = table[type]
# Module-level handle to the activation function selected by `acttype`.
# NOTE(review): nothing in the visible code reads this global; `ffn` looks up
# its own copy through activation(acttype).act -- confirm before relying on it.
act=activation(acttype).act
class ffn(nn.Module):
    """Feed-forward network whose layers sum signed-power expansions of their input.

    Every layer computes ``sum_k sign(x) * |x ** p_k| @ W[k] + bias`` over the
    powers ``p_k`` in ``powerlist``. The input layer and each hidden layer are
    followed by whole-tensor standardisation, the activation, and a second
    standardisation; the output layer is a plain projection to a single value.

    Args:
        power: Sequence of exponents; the entry at position ``k`` uses weight
            slice ``k`` of every layer.
        layer: Layer count: one input layer plus ``layer - 1`` hidden layers,
            followed by the output projection.
        tdata: 2-D training tensor whose rows hold the inputs followed by one
            target column. Only its row width and device are used here; the
            tensor itself is stored on ``self.tdata``.
        features: Width of the hidden representation. ``None`` (default) uses
            the module-level ``n``.
        act: Activation callable. ``None`` (default) uses
            ``activation(acttype).act`` from the module-level configuration.
    """

    def __init__(self, power, layer, tdata, features=None, act=None):
        super().__init__()
        self.powerlist = power
        self.layernum = layer
        self.tdata = tdata
        # Fall back to the module-level configuration unless overridden.
        self.act = activation(acttype).act if act is None else act
        width = n if features is None else features
        # The leading weight dimension is max(powerlist), although slices are
        # addressed by position in powerlist (kept for shape compatibility).
        maxpower = max(self.powerlist)
        inputs = len(tdata[0]) - 1
        opts = {"dtype": torch.float32, "device": tdata.device}
        # nn.Parameter registers the weights with the module, so parameters(),
        # state_dict() and optimizers can see them. The creation order
        # W0, B0, Wh, Bh, Wo, Bo is kept so that a fixed seed draws the same
        # initial values as before.
        self.W0 = nn.Parameter(torch.randn(maxpower, inputs, width, **opts))
        self.B0 = nn.Parameter(torch.randn(1, width, **opts))
        self.Wh = nn.Parameter(
            torch.randn(maxpower, self.layernum - 1, width, width, **opts)
        )
        self.Bh = nn.Parameter(torch.randn(self.layernum - 1, 1, width, **opts))
        self.Wo = nn.Parameter(torch.randn(maxpower, width, 1, **opts))
        self.Bo = nn.Parameter(torch.randn(1, 1, **opts))

    @staticmethod
    def _signed_power(x, power):
        """Return ``|x ** power|`` carrying the sign of ``x``."""
        return torch.sign(x) * abs(x ** power)

    def _poly_affine(self, x, weights, bias):
        """Sum the signed-power terms of ``x`` through ``weights`` and add ``bias``.

        ``weights[k]`` is the matrix for the k-th entry of ``powerlist``. The
        bias is added together with the first term and every later term is
        accumulated onto that running sum, identically for all layers.
        """
        out = None
        for idx, power in enumerate(self.powerlist):
            term = self._signed_power(x, power) @ weights[idx]
            out = term + bias if out is None else out + term
        return out

    # Internal layer normalization: resets the signal to mean 0, std 1,
    # with both statistics taken over every element of x.
    def norma(self, x):
        x = (x - x.mean()) / x.std()
        return x

    # Global dataset normalization using the module-level statistics
    # xmean/xstd (typeid "x") or ymean/ystd (typeid "y"); any other typeid
    # returns x unchanged.
    def norma0(self, x, typeid):
        if typeid == "x":
            x = (x - xmean) / xstd
        elif typeid == "y":
            x = (x - ymean) / ystd
        return x

    # Input layer: power expansion, then normalize -> activate -> normalize.
    def ffn0(self, x):
        xh = self._poly_affine(x, self.W0, self.B0)
        xh = self.norma(xh)
        xh = self.act(xh)
        xh = self.norma(xh)
        return xh

    # Hidden layers: the same expansion and double normalization, applied
    # layernum - 1 times with each layer's own weight slice and bias.
    def ffnh(self, xh):
        for laynum in range(self.layernum - 1):
            x = self._poly_affine(xh, self.Wh[:, laynum], self.Bh[laynum])
            xh = self.norma(x)
            xh = self.act(xh)
            xh = self.norma(xh)
        return xh

    # Output layer: final projection, no normalization or activation.
    def ffno(self, x):
        return self._poly_affine(x, self.Wo, self.Bo)

    def forward(self, xx):
        """Run input layer, hidden layers and output projection on ``xx``."""
        xh = self.ffn0(xx)
        xh = self.ffnh(xh)
        ho = self.ffno(xh)
        return ho
def training_1(inputdata):
    """Train a fresh ``ffn`` on ``inputdata`` one row at a time with manual SGD.

    The network is built from the module-level ``powerlist`` and ``layers``
    and trained for ``rounds`` passes with step size ``lr``. After every row
    the squared error is back-propagated and each weight tensor is updated in
    place; the mean loss of the pass is printed once per round.

    Args:
        inputdata: 2-D tensor whose rows hold the inputs followed by one
            target value in the last column.

    Returns:
        The trained ``ffn`` instance.
    """
    doffn = ffn(powerlist, layers, inputdata)
    # The updates below are in place, so these references stay valid for the
    # whole run and the list can be built once.
    weights = [doffn.W0, doffn.B0, doffn.Wh, doffn.Bh, doffn.Wo, doffn.Bo]
    for i in range(rounds):
        printloss = 0
        for item in inputdata:
            x = item[:-1]
            y = item[-1:]
            # Dataset-level normalization is left disabled here; to enable it:
            #   x = doffn.norma0(x, "x")
            #   y = doffn.norma0(y, "y")
            # Forward pass and MSE loss.
            ho = doffn.forward(x)
            loss = ((ho - y) ** 2).mean()
            # Accumulate a detached copy so the running total does not keep
            # every row's autograd history alive until the end of the round.
            printloss += loss.detach()
            loss.backward()
            # Manual gradient descent (no optimizer).
            with torch.no_grad():
                for weight in weights:
                    if weight.grad is not None:
                        weight -= lr * weight.grad
                        weight.grad.zero_()
        print(f"round {i}, loss {printloss/len(inputdata):.12f}")
    return doffn
# Run training on the tdata1 dataset. This is a bare module-level call, so it
# executes whenever the file is run or imported.
training_1(tdata1)
Be First to Comment