Skip to content

Testing code for ExpoLU, PowerLU, ParaLU, and QuartLU, but ReLU is still BEST

Last updated on May 12, 2026

I wrote the code below manually, line by line. Although I have 2 coding agents in VS Code, I used them only for advice; they helped and taught me a lot and got me hands-on quickly. Gemini also helped add some annotations for me before posting.

Below is the test code for ExpoLU, PowerLU, ParaLU, QuartLU, and ReLU. The training data, 21 pairs of x and y, was generated by me manually. Grok helped me find a bug in the training-data normalization, but even with the wrong normalization ExpoLU can work well too, haha.

Initially I trained row by row, as in the code below; in that setup ExpoLU works well with around 512 features and 30 layers, but ReLU can't work. Later I changed to batch training with 1024 features and 100 layers, in which ReLU IS KING and my LUs can't work.

import torch
import torch.nn as nn

# Runtime device: use the GPU when CUDA is available, otherwise the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
print(f"Using device: {device}")

# Hyperparameters
powerlist = [1]   # powers applied to each layer input: 1 is the linear base, >1 adds polynomial terms
acttype = 1       # activation id: 1 is ExpoLU and 9 is ReLU
n = 512           # features per layer
layers = 30       # number of ffn layers, at least 2
rounds = 400      # passes over the training data
lr = 0.001 / n    # learning rate, scaled down by the layer width

# Dataset: 21 (x, y) pairs. The hand-made samples are mirror-symmetric in x,
# so only the x > 0 half is spelled out; the x < 0 half reuses the same y.
_positive_half = [
    (10, 6), (9, 3), (8, 1), (7, -2), (6, -5),
    (5, -2), (4, 1), (3, 3), (2, 5), (1, 2),
]
_rows = (
    [[-x_val, y_val] for x_val, y_val in _positive_half]
    + [[0, 0]]
    + [[x_val, y_val] for x_val, y_val in _positive_half]
)
tdata1 = torch.tensor(_rows, dtype=torch.float32, device=device)

# Global statistics for normalization: every column but the last is input,
# the last column is the target.
_inputs = tdata1[:, :-1]
_targets = tdata1[:, -1:]
xmean, xstd = _inputs.mean(), _inputs.std()
ymean, ystd = _targets.mean(), _targets.std()

class activation:
    """Resolve an integer activation id to a callable stored on ``self.act``.

    Supported ids: 0 identity, 1 ExpoLU, 2 PowerLU, 3 ParaLU, 4 QuartLU,
    8 sigmoid, 9 ReLU.  Any other id raises ``KeyError``.

    The four custom "LU" functions share one shape: zero on the left, a
    polynomial ramp in the middle, and a straight line on the right.
    """

    def __init__(self, type):

        def _zero(ref):
            # Scalar zero living on the same device as the input tensor.
            return torch.tensor(0.0, device=ref.device)

        def _identity(x):
            return x

        def ExpoLU(x, a=1, b=1, p=2):  # acttype=1
            """0 below -b, (x+b)**p / a on [-b, b), then a slope-1 line."""
            ramp = (1/a)*(x+b)**p
            # Offset chosen so the line meets the ramp's value at x == b.
            line = x+(1/a)*(2*b)**p-b
            middle_or_right = torch.where(x < b, ramp, line)
            return torch.where(x < -b, _zero(x), middle_or_right)

        def PowerLU(x, xp=4):  # acttype=2
            """0 below -1, ((x+1)/2)**xp on [-1, 1), identity from 1 on."""
            ramp = (x+1)**xp/(2**xp)
            middle_or_right = torch.where(x < 1, ramp, x)
            return torch.where(x < -1, _zero(x), middle_or_right)

        def ParaLU(x, scope=0.1):  # acttype=3
            """0 below -scope, a parabola on [-scope, scope), identity after."""
            ramp = (x+scope)**2/(4*scope)
            middle_or_right = torch.where(x < scope, ramp, x)
            return torch.where(x < -scope, _zero(x), middle_or_right)

        def QuartLU(x, scope=0.1):  # acttype=4
            """0 below -scope, a quartic on [-scope, scope/3), identity after."""
            ramp = (x+scope)**4*27/(256*scope**3)
            middle_or_right = torch.where(x < (scope/3), ramp, x)
            return torch.where(x < -scope, _zero(x), middle_or_right)

        # Dispatch table from activation id to implementation.
        registry = {
            0: _identity,
            1: ExpoLU,
            2: PowerLU,
            3: ParaLU,
            4: QuartLU,
            8: torch.sigmoid,
            9: torch.relu,
        }
        self.act = registry[type]

# Module-level handle to the activation function selected by `acttype`.
# NOTE(review): nothing in the visible code reads this name; the ffn class
# resolves its own copy in __init__. Confirm before relying on or removing it.
act=activation(acttype).act

class ffn(nn.Module):
    """Fully connected network whose layers mix signed powers of their input.

    Every layer computes ``sum_k sign(x) * |x ** p_k| @ W[k] + bias`` over the
    powers ``p_k`` in ``powerlist``.  The input layer and each hidden layer
    then apply normalise -> activation -> normalise; the output layer is a
    plain projection to one value.

    Reads the module-level names ``n`` (layer width), ``acttype`` and
    ``activation`` at construction time, and ``xmean``/``xstd``/``ymean``/
    ``ystd`` in :meth:`norma0`.

    Args:
        power: list of powers applied to each layer input (e.g. ``[1]``).
        layer: total layer count; ``layer - 1`` hidden layers are created.
        tdata: 2-D training tensor; all columns but the last are inputs.  Only
            its column count and device are used here.
    """

    def __init__(self, power, layer, tdata):
        super().__init__()
        self.powerlist = power
        self.layernum = layer
        self.tdata = tdata
        self.act = activation(acttype).act

        # Weights are indexed by *position* in powerlist, so there must be at
        # least len(powerlist) banks.  Taking the max keeps the historical
        # shape (max(powerlist) banks) whenever that was already large enough.
        banks = max(max(self.powerlist), len(self.powerlist))
        in_features = len(tdata[0]) - 1

        def _param(*shape):
            # Registered as nn.Parameter so parameters(), state_dict() and
            # .to() see the weights; requires_grad is True by default.
            return nn.Parameter(
                torch.randn(*shape, dtype=torch.float32, device=tdata.device)
            )

        # Creation order is kept (W0, B0, Wh, Bh, Wo, Bo) so a fixed random
        # seed yields the same initial values as before.
        self.W0 = _param(banks, in_features, n)
        self.B0 = _param(1, n)
        self.Wh = _param(banks, self.layernum - 1, n, n)
        self.Bh = _param(self.layernum - 1, 1, n)
        self.Wo = _param(banks, n, 1)
        self.Bo = _param(1, 1)

    @staticmethod
    def _signed_power(x, power):
        """Return ``sign(x) * |x ** power|`` so even powers keep the sign."""
        return torch.sign(x) * abs(x ** power)

    def _mix(self, x, weights, bias):
        """Sum the signed-power projections of ``x`` and add ``bias`` once.

        ``weights[k]`` is the matrix paired with ``self.powerlist[k]``.
        """
        mixed = None
        for idx, power in enumerate(self.powerlist):
            term = self._signed_power(x, power) @ weights[idx]
            # The first term carries the bias; later terms accumulate on top.
            mixed = term + bias if mixed is None else mixed + term
        return mixed

    # Internal layer normalization: resets signal to mean 0, std 1
    def norma(self, x):
        """Standardise ``x`` using the mean and std over all its elements."""
        x = (x - x.mean()) / x.std()
        return x

    # Global dataset normalization
    def norma0(self, x, typeid):
        """Standardise with the module-level dataset statistics.

        ``typeid == "x"`` uses ``xmean``/``xstd``; ``"y"`` uses
        ``ymean``/``ystd``; any other value returns ``x`` unchanged.
        """
        if typeid == "x":
            x = (x - xmean) / xstd
        elif typeid == "y":
            x = (x - ymean) / ystd
        return x

    # Input layer: power transformation + double normalization
    def ffn0(self, x):
        """Project the raw input to ``n`` features.

        All power terms are summed first, and the normalise/activate/
        normalise sequence runs once on the sum, the same way the hidden
        layers do it.
        """
        xh = self._mix(x, self.W0, self.B0)
        xh = self.norma(xh)
        xh = self.act(xh)
        return self.norma(xh)

    # Hidden layers: power transformation + double normalization
    def ffnh(self, xh):
        """Run the ``layernum - 1`` hidden layers in sequence.

        Every power term is accumulated onto the running pre-activation, so
        the first-power term and the bias are kept when ``powerlist`` has
        more than one entry.
        """
        for laynum in range(self.layernum - 1):
            pre = self._mix(xh, self.Wh[:, laynum], self.Bh[laynum])
            xh = self.norma(pre)
            xh = self.act(xh)
            xh = self.norma(xh)
        return xh

    # Output layer: final projection
    def ffno(self, x):
        """Project the last hidden state to one value, with no activation."""
        return self._mix(x, self.Wo, self.Bo)

    def forward(self, xx):
        """Full pass: input layer, hidden stack, output projection."""
        xh = self.ffn0(xx)
        xh = self.ffnh(xh)
        ho = self.ffno(xh)
        return ho




def training_1(inputdata, normalize=False):
    """Train a fresh ``ffn`` one sample at a time with plain gradient descent.

    Builds the network from the module-level ``powerlist`` and ``layers``,
    then runs ``rounds`` passes over ``inputdata``.  After every sample the
    squared error is back-propagated and each weight tensor is updated in
    place with step size ``lr``; no optimizer object is used.  The mean loss
    of each pass is printed.

    Args:
        inputdata: 2-D tensor; each row is one sample whose last column is
            the target and whose remaining columns are the inputs.
        normalize: when True, standardise x and y with ``ffn.norma0`` before
            the forward pass.  Note that ``norma0`` uses the module-level
            dataset statistics, not statistics of ``inputdata``.  Defaults to
            False, which trains on the raw values.

    Returns:
        The trained ``ffn`` instance.
    """
    doffn = ffn(powerlist, layers, inputdata)
    # Same tensor objects throughout training: the updates below are in place.
    weights = [doffn.W0, doffn.B0, doffn.Wh, doffn.Bh, doffn.Wo, doffn.Bo]

    for i in range(rounds):
        printloss = 0
        for item in inputdata:
            x = item[:-1]
            y = item[-1:]

            # Optional dataset-level normalization
            if normalize:
                x = doffn.norma0(x, "x")
                y = doffn.norma0(y, "y")

            # Forward pass & MSE loss
            ho = doffn(x)
            loss = ((ho - y) ** 2).mean()
            loss.backward()

            # Detach before accumulating: the running total is only for
            # reporting and must not keep each sample's autograd graph alive.
            printloss += loss.detach()

            # Manual gradient descent (no optimizer)
            with torch.no_grad():
                for weight in weights:
                    if weight.grad is not None:
                        weight -= lr * weight.grad
                        weight.grad.zero_()

        print(f"round {i}, loss {printloss/len(inputdata):.12f}")

    return doffn

# Run training on the 21-sample dataset defined above. This call sits at
# module level, so it executes whenever the file is run or imported.
training_1(tdata1)
Published in Uncategorized

Be First to Comment

Leave a Reply

Your email address will not be published. Required fields are marked *