# Stochastic Mirror Descent with projection on the l1 ball SMDproj <- function(a, b, init, iters = length(b), cost, instgrad, lambda,z =Inf) { ind<-sample(1:length(b),iters,replace=TRUE) a<-a[ind,] b<-b[ind] param <- data.frame(matrix(nrow = iters, ncol = length(init) + 1)) colnames(param) <- c(colnames(a), "Loss") x <- c(init) m <- x param[1, ] <- c(m, cost( m, a, b,lambda)) theta <- c(init) for (i in 2:iters) { eta <- 1/(lambda*i) # 1/i play with the learning rate theta <- theta - eta * instgrad(x, a[i,], b[i], lambda) x <- pib1(theta,z) m <- ((i-1)*m + x)/i param[i, ] <- c(m ,cost( m, a, b,lambda) ) } cat("Final cost: ", sprintf("%10.07f", param[nrow(param), ncol(param)]), "\n", sep = "") cat("Parameters:", as.numeric(param[nrow(param), 1:length(init) ]), sep = " ") param <- cbind(Iteration = 1:nrow(param), param) return(param) } # Stochastic ONS diagonal version with projection ONSproj <- function(a, b, init, iters = length(b), cost, instgrad, lambda,z ) { ind<-sample(1:length(b),iters,replace=TRUE) a<-a[ind,] b<-b[ind] param <- data.frame(matrix(nrow = iters, ncol = length(init) + 1)) colnames(param) <- c(colnames(a), "Loss") x <- c(init) m <- x gamm <- 1/2 *min(1/(8*z),1) A <- diag(rep(1/(gamm^2*(2*z)^2),length(init))) Ainv <- diag(1/rep(1/(gamm^2*(2*z)^2),length(init))) param[1, ] <- c(m, cost( m, a, b,lambda)) for (i in 2:iters) { instg <- as.numeric(instgrad(x, a[i,], b[i], lambda)) A <- A + instg%*%t(instg) Ainv <- Ainv - as.numeric(1/(1+t(instg)%*%Ainv%*%instg)) * Ainv%*%instg%*%t(instg)%*%Ainv y <- x - 1/gamm * Ainv %*% instg x <- sqrtm(Ainv * i) %*% pib1(sqrtm(A / i)%*%y,z) m <- ((i-1)*m + x)/i param[i, ] <- c(m,cost(m, a, b,lambda)) } cat("Final cost: ", sprintf("%10.07f", param[nrow(param), ncol(param)]), "\n", sep = "") cat("Parameters:", as.numeric(param[nrow(param), 1:length(init) ]), sep = " ") param <- cbind(Iteration = 1:nrow(param), param) return(param) } # Stochastic Exponentiated Gradient +_ SEGpm <- function(a, b, iters = length(b), cost, instgrad, lambda,z) { ind<-sample(1:length(b),iters,replace=TRUE) a<-a[ind,] b<-b[ind] param <- data.frame(matrix(nrow = iters, ncol = ncol(a) + 1)) colnames(param) <- c(colnames(a), "Loss") W <- rep(1,2*ncol(a)) w <- W/sum(W) x <- z*c(w[1:ncol(a)]-w[ncol(a)+1:ncol(a)]) m = x param[1, ] <- c(m, cost(m, a, b,lambda)) for (i in 2:iters) { eta <- 2*sqrt(log(2* ncol(a))/i) # Not the same learning rate than SGD or SMD instg <- instgrad( x, a[i,], b[i], lambda) W <- c(exp( - eta * instg ),exp( eta *instg))*W w <- W/sum(W) x <- z*c(w[1:ncol(a)]-w[ncol(a)+1:ncol(a)]) m <- ((i-1)*m + x)/i param[i, ] <- c( m ,cost(m, a, b,lambda)) } cat("Final cost: ", sprintf("%10.07f", param[nrow(a), ncol(a)]), "\n", sep = "") cat("Parameters:", as.numeric(param[nrow(a), 1:ncol(a) ]), sep = " ") param <- cbind(Iteration = 1:iters, param) return(param) } #Generate the train sample atrain <- data.frame(intercept= rep(1,n = 100), a = rnorm(n = 100) * 5, b = rnorm(n = 100) * 3 + 1, c = rnorm(n = 100) * 2 + 2, d= rnorm(n = 100) * 5) # Add column names and intercept colnames(atrain) <- c("intercept", "a", "b", "c" ,"d") atrain <- as.matrix(atrain) # Setting up a perfect relationship (linear problem) with +1 or -1 for classes xstar <- c(-20,2,3,4,0) btrain <- 2 * as.integer(atrain%*%xstar > 0) - 1 # Comparaison between ONS and Ada projected start_time <- Sys.time() set.seed(100) paramONSproj <- ONSproj(a = atrain, b = btrain, init = rep(0, 5), iters = 1000, # Play with the number of iterations cost = hingereg, instgrad = instgradreg, lambda = 1, # Play with the regularization parameter z=.5) set.seed(100) # Same seed! start_time <- Sys.time() set.seed(100) paramSEGpm <- SEGpm(a = atrain, b = btrain, iters = 1000, # Play with the number of iterations cost = hingereg, instgrad = instgradreg, lambda = 1, # Play with the regularization parameter z=.5) end_time <- Sys.time() paramSMDproj <- SMDproj(a = atrain, b = btrain, init = rep(0, 5), iters = 1000, # Play with the number of iterations cost = hingereg, instgrad = instgradreg, lambda = 1, # Play with the regularization parameter z=.5) end_time <- Sys.time() # Different trajectories plot.ts(paramONSproj[, 2:(ncol(paramONSproj) - 1)]) plot.ts(paramSMDproj[, 2:(ncol(paramSMDproj) - 1)]) # test # Explanatory variables and intercept in the test set atest <- data.frame(intercept= rep(1,n = 1000), a = rnorm(n = 1000) * 5, b = rnorm(n = 1000) * 3 + 1, c = rnorm(n = 1000) * 2 + 2, d = rnorm(n = 1000) * 5) atest <- as.matrix(atest) # test labels with +1 or -1 for classes btest <- 2 * as.integer(atest %*% xstar > 0) - 1 # How to analyse the convergence using a test set rateONSproj <- colMeans(btest*( atest %*% t(as.matrix(paramONSproj[, 2:(ncol(paramONSproj) - 1)])))>0) rateSEGpm <- colMeans(btest*( atest %*% t(as.matrix(paramSEGpm[, 2:(ncol(paramSEGpm) - 1)])))>0) rateSMDproj <- colMeans(btest*( atest %*% t(as.matrix(paramSMDproj[, 2:(ncol(paramSMDproj) - 1)])))>0) plot(log(paramONSproj$Iteration),log(1-rateONSproj),type ="l",col="blue") lines(log(paramSMDproj$Iteration),log(1-rateSMDproj)) lines(log(paramSEGpm$Iteration),log(1-rateSEGpm),col=2)