Testing code – ExpoLU keeps converging to 1e-9 and ReLU can't converge

I wrote the code below line by line, manually. Although I have two coding agents in my VS Code, I only used them for advice; they did help and teach me a lot and got me hands-on quickly. Gemini helped add some annotations for me before posting.

In the test code below, ExpoLU (a=1, b=1, p=2) converged to about 1e-9 in roughly 10 test runs, while ReLU failed to converge at all in 4 or 5 runs. You can run it and see for yourself.

Below is the test code for ExpoLU and ReLU; the training data of 21 (x, y) pairs was generated by me manually. Grok helped me find a bug in the training-data normalization, though ExpoLU works well even with the wrong normalization, haha.

By the way, I removed norma0, the initial normalization of the input data, since it doesn't help.
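
For reference, with the default parameters used in the code below (a=1, b=1, p=2), ExpoLU is the piecewise function

ExpoLU(x) = 0              for x < -1
ExpoLU(x) = (x + 1)^2      for -1 <= x < 1
ExpoLU(x) = x + 3          for x >= 1

so it is zero for large negative inputs, a parabola around zero, and linear (and continuous at x = 1) above that.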

import torch
import torch.nn as nn

# Device Configuration
device=torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

# Hyperparameters
powerlist=[1] # power list: linear base (1) plus optional nonlinear polynomial expansions (>1)
acttype=1  # activation type: 1 is ExpoLU, 9 is ReLU
n=512  # hidden feature width
layers=30  # number of FFN layers, at least 2
lr=0.001/n  # learning rate
rounds=400  # training rounds

# Dataset: 21 pairs of x and y
tdata1=torch.tensor([
    [-10,6],[-9,3],[-8,1],[-7,-2],[-6,-5],[-5,-2],[-4,1],[-3,3],[-2,5],[-1,2],[0,0],
    [10,6],[9,3],[8,1],[7,-2],[6,-5],[5,-2],[4,1],[3,3],[2,5],[1,2],
    ],
    dtype=torch.float32,
    device=device,
    )

# Global Statistics for Normalization
xmean=tdata1[:,:-1].mean()
xstd=tdata1[:,:-1].std()
ymean=tdata1[:,-1:].mean()
ystd=tdata1[:,-1:].std()
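# Note: these global stats feed norma0 below; the norma0 calls in the
# training loop are commented out (see the note above the code).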

class activation:
    def __init__(self, type):

        def ExpoLU(x,a=1,b=1,p=2):  # acttype=1
            return torch.where(x<-b,torch.tensor(0.0,device=x.device),
                               torch.where(x<b,(1/a)*(x+b)**p,x+(1/a)*(2*b)**p-b))
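        # ExpoLU is 0 for x < -b, (1/a)*(x+b)**p on [-b, b), then linear above b;
        # the offset (1/a)*(2*b)**p - b makes the middle and linear pieces meet at x = b.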
        def PowerLU(x,xp=4): # acttype=2
            return torch.where(x<-1,torch.tensor(0.0,device=x.device),
                           torch.where(x<1,(x+1)**xp/(2**xp),x))
        def ParaLU(x, scope=0.1): # acttype=3
            return torch.where(x<-scope,torch.tensor(0.0,device=x.device),
                           torch.where(x<scope,(x+scope)**2/(4*scope),x))
        def QuartLU(x, scope=0.1): # acttype=4
            return torch.where(x<-scope,torch.tensor(0.0,device=x.device),
                           torch.where(x<(scope/3),(x+scope)**4*27/(256*scope**3),x))
        
        funcs={
            0:lambda x:x,
            1:ExpoLU,
            2:PowerLU,
            3:ParaLU,
            4:QuartLU,
            8:torch.sigmoid,
            9:torch.relu,            
        }
        self.act=funcs[type]

# Initialize global activation reference
act=activation(acttype).act       

class ffn(nn.Module):
    def __init__(self, power, layer, tdata):
        super().__init__()
        self.powerlist=power
        self.layernum=layer
        self.tdata=tdata
        self.act=activation(acttype).act
        
        maxpower=max(self.powerlist)
        self.W0=torch.randn(maxpower,len(tdata[0])-1,n,dtype=torch.float32,device=tdata.device) # input width = tdata.shape[1]-1 (number of x columns)
        self.W0.requires_grad=True
        self.B0=torch.randn(1,n,dtype=torch.float32,device=tdata.device)
        self.B0.requires_grad=True
        self.Wh=torch.randn(maxpower,self.layernum-1,n,n,dtype=torch.float32,device=tdata.device)
        self.Wh.requires_grad=True
        self.Bh=torch.randn(self.layernum-1,1,n,dtype=torch.float32,device=tdata.device)
        self.Bh.requires_grad=True
        self.Wo=torch.randn(maxpower,n,1,dtype=torch.float32,device=tdata.device)
        self.Wo.requires_grad=True
        self.Bo=torch.randn(1,1,dtype=torch.float32,device=tdata.device)
        self.Bo.requires_grad=True
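        # Note: these are plain tensors with requires_grad=True rather than
        # nn.Parameter, so they are updated by the manual gradient-descent
        # step in training_1 instead of a torch optimizer.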
    
    

    #Internal Layer Normalization: Resets signal to Mean 0, Std 1
    def norma(self,x):
        x=(x-x.mean())/x.std()
        return x

    #Global Dataset Normalization    
    def norma0(self,x,typeid):
        if typeid=="x":
            x=(x-xmean)/xstd
        if typeid=="y":
            x=(x-ymean)/ystd
        return x
      
    #Input Layer: Power Transformation + Double Normalization    
    def ffn0(self,x):
        firstpower=0
        
        for idx,power in enumerate(self.powerlist):
            if firstpower==0:
                xh=torch.sign(x)*abs(x**power)@self.W0[idx]+self.B0  
                firstpower=power
            else:
                xh=xh+torch.sign(x)*abs(x**power)@self.W0[idx]
            xh=self.norma(xh)
            xh=self.act(xh)
            xh=self.norma(xh)    
        return xh
    
    #Hidden Layers: Power Transformation + Double Normalization
    def ffnh(self,xh): #linep=1
        for laynum in range(self.layernum-1):
            firstpower=0
            for idx,power in enumerate(self.powerlist):
                if firstpower==0:
                    x=torch.sign(xh)*abs(xh**power)@self.Wh[idx,laynum]+self.Bh[laynum]  
                    firstpower=power
                else:
                    x=x+torch.sign(xh)*abs(xh**power)@self.Wh[idx,laynum]  # accumulate higher powers, as in ffn0/ffno
            xh=self.norma(x)
            xh=self.act(xh)
            xh=self.norma(xh)
        return xh
    
    #Output Layer: Final Projection
    def ffno(self,x):
        firstpower=0
        for idx,power in enumerate(self.powerlist):
            if firstpower==0:
                xh=torch.sign(x)*abs(x**power)@self.Wo[idx]+self.Bo  
                firstpower=power
            else:
                xh=xh+torch.sign(x)*abs(x**power)@self.Wo[idx]
        return xh
    
    def forward(self,xx):
        xh=self.ffn0(xx)
        xh=self.ffnh(xh)
        ho=self.ffno(xh)
        return ho




def training_1(inputdata): 
    doffn=ffn(powerlist, layers, inputdata)

    for i in range(rounds):
        printloss=0
        for item in inputdata:
            x=item[:-1]
            y=item[-1:]

            # Dataset-level Normalization
            # x=doffn.norma0(x,"x")
            # y=doffn.norma0(y,"y")

            # Forward Pass & MSE Loss
            ho=doffn.forward(x)
            loss=(ho-y)**2
            loss=loss.mean()

            printloss+=loss.item()  # accumulate as a plain float so no autograd graph is kept
            loss.backward()

            # Manual Gradient Descent (No Optimizer)
            with torch.no_grad():        
                for weight in [doffn.W0, doffn.B0, doffn.Wh, doffn.Bh, doffn.Wo, doffn.Bo]:
                    if weight.grad is not None:
                        weight -= lr * weight.grad
                        weight.grad.zero_()     

        print(f"round {i}, loss {printloss/len(inputdata):.12f}")

# run training
training_1(tdata1)
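
To reproduce the ReLU run, you should only need to flip the acttype flag at the top (ffn reads it when it builds its activation), or append something like this after the script:

# switch the activation and train again (9 selects torch.relu in the funcs dict)
acttype=9
training_1(tdata1)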