From 2331f40071a0bbb2382436c20b74f8973b143cae Mon Sep 17 00:00:00 2001 From: Bo Tang Date: Fri, 14 Nov 2025 18:33:51 -0500 Subject: [PATCH 01/20] New feat: GPU precondition --- .gitignore | 4 + internal/internal_types.h | 7 +- internal/preconditioner.h | 2 +- internal/utils.h | 4 + src/preconditioner.cu | 489 ++++++++++++++++++++++++++++++++++++++ src/solver.cu | 370 ++++++++++------------------ src/utils.cu | 34 +-- 7 files changed, 648 insertions(+), 262 deletions(-) create mode 100644 src/preconditioner.cu diff --git a/.gitignore b/.gitignore index 866c33f..6794dcf 100644 --- a/.gitignore +++ b/.gitignore @@ -63,3 +63,7 @@ test/* /_b *.whl *.pyc + + +*.txt +*.sh \ No newline at end of file diff --git a/internal/internal_types.h b/internal/internal_types.h index 1f42582..2022c33 100644 --- a/internal/internal_types.h +++ b/internal/internal_types.h @@ -28,6 +28,7 @@ typedef struct int num_nonzeros; int *row_ptr; int *col_ind; + int *row_ind; double *val; } cu_sparse_matrix_csr_t; @@ -46,6 +47,7 @@ typedef struct int num_blocks_primal; int num_blocks_dual; int num_blocks_primal_dual; + int num_blocks_nnz; double objective_vector_norm; double constraint_bound_norm; double *constraint_lower_bound_finite_val; @@ -74,8 +76,6 @@ typedef struct double *constraint_rescaling; double *variable_rescaling; - double constraint_bound_rescaling; - double objective_vector_rescaling; double *primal_slack; double *dual_slack; double rescaling_time_sec; @@ -129,10 +129,7 @@ typedef struct typedef struct { - lp_problem_t *scaled_problem; double *con_rescale; double *var_rescale; - double con_bound_rescale; - double obj_vec_rescale; double rescaling_time_sec; } rescale_info_t; diff --git a/internal/preconditioner.h b/internal/preconditioner.h index 8feb664..71a5c2b 100644 --- a/internal/preconditioner.h +++ b/internal/preconditioner.h @@ -25,7 +25,7 @@ extern "C" rescale_info_t *rescale_problem( const pdhg_parameters_t *params, - const lp_problem_t *original_problem); + pdhg_solver_state_t *state); #ifdef __cplusplus } diff --git a/internal/utils.h b/internal/utils.h index 1b6b7c9..775459f 100644 --- a/internal/utils.h +++ b/internal/utils.h @@ -122,6 +122,7 @@ extern "C" int coo_to_csr(const matrix_desc_t *desc, int **row_ptr, int **col_ind, double **vals, int *nnz_out); +<<<<<<< HEAD void check_feas_polishing_termination_criteria( pdhg_solver_state_t *solver_state, const termination_criteria_t *criteria, @@ -138,6 +139,9 @@ extern "C" void compute_dual_feas_polish_residual(pdhg_solver_state_t *state, const pdhg_solver_state_t *ori_state, norm_type_t optimality_norm); void set_default_parameters(pdhg_parameters_t *params); +======= + int* build_row_ind_from_row_ptr(int* row_ptr, int num_rows, int nnz); +>>>>>>> 2a31c99 (New feat: GPU precondition) #ifdef __cplusplus } diff --git a/src/preconditioner.cu b/src/preconditioner.cu new file mode 100644 index 0000000..e197905 --- /dev/null +++ b/src/preconditioner.cu @@ -0,0 +1,489 @@ +/* +Copyright 2025 Haihao Lu + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+*/ + +#include "preconditioner.h" +#include "utils.h" +#include +#include +#include +#include +#include +#include + +#define SCALING_EPSILON 1e-12 + +__global__ void scale_variables_kernel(double* __restrict__ c, + double* __restrict__ var_lb, + double* __restrict__ var_ub, + double* __restrict__ var_lb_finite, + double* __restrict__ var_ub_finite, + double* __restrict__ primal_start, + const double* __restrict__ D, + const double* __restrict__ invD, + int n); +__global__ void scale_constraints_kernel(double* __restrict__ con_lb, + double* __restrict__ con_ub, + double* __restrict__ con_lb_finite, + double* __restrict__ con_ub_finite, + double* __restrict__ dual_start, + const double* __restrict__ E, + const double* __restrict__ invE, + int m); +__global__ void csr_scale_nnz_kernel(const int* __restrict__ row_ids, + const int* __restrict__ col_ind, + double* __restrict__ vals, + const double* __restrict__ invD, + const double* __restrict__ invE, + int nnz); +__global__ void csr_row_absmax_kernel(const int* __restrict__ row_ptr, + const double* __restrict__ vals, + int num_rows, + double* __restrict__ out_max); +__global__ void csr_col_absmax_atomic_kernel(const int* __restrict__ col_ind, + const double* __restrict__ vals, + int nnz, + unsigned long long* __restrict__ out_max_bits); +__global__ void u64bits_to_double(const unsigned long long* __restrict__ in_bits, + double* __restrict__ out_val, + int n); +__global__ void csr_row_powsum_kernel(const int* __restrict__ row_ptr, + const double* __restrict__ vals, + int num_rows, + double degree, + double* __restrict__ out_sum); +__global__ void csr_col_powsum_atomic_kernel(const int* __restrict__ col_ind, + const double* __restrict__ vals, + int nnz, + double degree, + double* __restrict__ out_sum); +__global__ void clamp_sqrt_and_accum(double* __restrict__ x, + double* __restrict__ inv_x, + double* __restrict__ cum, + int n); +static void scale_problem(pdhg_solver_state_t *state, double *E, double *D, double *invE, double *invD); +static void ruiz_rescaling(pdhg_solver_state_t *state, int num_iters, rescale_info_t *rescale_info, + double *E, double *D, double *invE, double *invD); +static void pock_chambolle_rescaling(pdhg_solver_state_t *state, double alpha, rescale_info_t *rescale_info, + double *E, double *D, double *invE, double *invD); +static void bound_objective_rescaling(pdhg_solver_state_t *state, rescale_info_t *rescale_info, + double *E, double *D, double *invE, double *invD); + +__global__ void scale_variables_kernel(double* __restrict__ c, + double* __restrict__ var_lb, + double* __restrict__ var_ub, + double* __restrict__ var_lb_finite, + double* __restrict__ var_ub_finite, + double* __restrict__ primal_start, + const double* __restrict__ D, + const double* __restrict__ invD, + int n) +{ + int j = blockIdx.x * blockDim.x + threadIdx.x; + if (j >= n) return; + double dj = D[j]; + double inv_dj = invD[j]; + c[j] *= inv_dj; + var_lb[j] *= dj; + var_ub[j] *= dj; + var_lb_finite[j] *= dj; + var_ub_finite[j] *= dj; + primal_start[j] *= dj; +} + +__global__ void scale_constraints_kernel(double* __restrict__ con_lb, + double* __restrict__ con_ub, + double* __restrict__ con_lb_finite, + double* __restrict__ con_ub_finite, + double* __restrict__ dual_start, + const double* __restrict__ E, + const double* __restrict__ invE, + int m) +{ + int i = blockIdx.x * blockDim.x + threadIdx.x; + if (i >= m) return; + double inv_ei = invE[i]; + double ei = E[i]; + con_lb[i] *= inv_ei; + con_ub[i] *= inv_ei; + con_lb_finite[i] *= inv_ei; + 
con_ub_finite[i] *= inv_ei; + dual_start[i] *= ei; +} + +__global__ void csr_scale_nnz_kernel(const int* __restrict__ row_ids, + const int* __restrict__ col_ind, + double* __restrict__ vals, + const double* __restrict__ invD, + const double* __restrict__ invE, + int nnz) +{ + for (int k = blockIdx.x * blockDim.x + threadIdx.x; + k < nnz; + k += gridDim.x * blockDim.x) + { + int i = row_ids[k]; + int j = col_ind[k]; + vals[k] *= invD[j] * invE[i]; + } +} + +__global__ void csr_row_absmax_kernel(const int* __restrict__ row_ptr, + const double* __restrict__ vals, + int num_rows, + double* __restrict__ out_max) +{ + int i = blockIdx.x * blockDim.x + threadIdx.x; + if (i >= num_rows) return; + int s = row_ptr[i], e = row_ptr[i + 1]; + double m = 0.0; + for (int k = s; k < e; ++k) { + double v = fabs(vals[k]); + if (!isfinite(v)) v = 0.0; + if (v > m) m = v; + } + out_max[i] = m; +} + +__global__ void csr_col_absmax_atomic_kernel(const int* __restrict__ col_ind, + const double* __restrict__ vals, + int nnz, + unsigned long long* __restrict__ out_max_bits) +{ + for (int k = blockIdx.x * blockDim.x + threadIdx.x; k < nnz; k += gridDim.x * blockDim.x) { + int j = col_ind[k]; + double v = fabs(vals[k]); + if (!isfinite(v)) v = 0.0; + unsigned long long bits = __double_as_longlong(v); + atomicMax(&out_max_bits[j], bits); + } +} + +__global__ void u64bits_to_double(const unsigned long long* __restrict__ in_bits, + double* __restrict__ out_val, + int n) +{ + for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < n; i += gridDim.x * blockDim.x) { + out_val[i] = __longlong_as_double(in_bits[i]); + } +} + +__device__ __forceinline__ double pow_fast(double v, double p) { + if (p == 2.0) return v * v; + if (p == 1.0) return v; + if (p == 0.5) return sqrt(v); + return pow(v, p); +} + +__global__ void csr_row_powsum_kernel(const int* __restrict__ row_ptr, + const double* __restrict__ vals, + int num_rows, + double degree, + double* __restrict__ out_sum) +{ + int i = blockIdx.x * blockDim.x + threadIdx.x; + if (i >= num_rows) return; + int s = row_ptr[i], e = row_ptr[i + 1]; + double acc = 0.0; + for (int k = s; k < e; ++k) { + double v = fabs(vals[k]); + if (!isfinite(v)) v = 0.0; + acc += pow_fast(v, degree); + } + out_sum[i] = acc; +} + +__global__ void csr_col_powsum_atomic_kernel(const int* __restrict__ col_ind, + const double* __restrict__ vals, + int nnz, + double degree, + double* __restrict__ out_sum) +{ + for (int k = blockIdx.x * blockDim.x + threadIdx.x; k < nnz; k += gridDim.x * blockDim.x) { + int j = col_ind[k]; + double v = fabs(vals[k]); + if (!isfinite(v)) v = 0.0; + double t = pow_fast(v, degree); + atomicAdd(&out_sum[j], t); + } +} + +__global__ void clamp_sqrt_and_accum(double* __restrict__ x, + double* __restrict__ inv_x, + double* __restrict__ cum, + int n) +{ + for (int t = blockIdx.x * blockDim.x + threadIdx.x; t < n; t += blockDim.x * gridDim.x) + { + double v = x[t]; + double s = (v < SCALING_EPSILON) ? 
1.0 : sqrt(v); + cum[t] *= s; + x[t] = s; + inv_x[t] = 1.0 / s; + } +} + +__global__ void reduce_bound_norm_sq_atomic( + const double* __restrict__ L, + const double* __restrict__ U, + int m, + double* __restrict__ out_sum) +{ + double acc = 0.0; + for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < m; i += blockDim.x * gridDim.x) { + double Li = L[i], Ui = U[i]; + bool fL = isfinite(Li), fU = isfinite(Ui); + if (fL && (!fU || fabs(Li - Ui) > SCALING_EPSILON)) acc += Li * Li; + if (fU) acc += Ui * Ui; + } + atomicAdd(out_sum, acc); +} + +static void scale_problem( + pdhg_solver_state_t *state, + double *E, + double *D, + double *invE, + double *invD) +{ + int n_vars = state->num_variables; + int n_cons = state->num_constraints; + + scale_variables_kernel<<num_blocks_primal, THREADS_PER_BLOCK>>>( + state->objective_vector, + state->variable_lower_bound, + state->variable_upper_bound, + state->variable_lower_bound_finite_val, + state->variable_upper_bound_finite_val, + state->initial_primal_solution, + D, + invD, + n_vars); + + scale_constraints_kernel<<num_blocks_dual, THREADS_PER_BLOCK>>>( + state->constraint_lower_bound, + state->constraint_upper_bound, + state->constraint_lower_bound_finite_val, + state->constraint_upper_bound_finite_val, + state->initial_dual_solution, + E, + invE, + n_cons); + + csr_scale_nnz_kernel<<num_blocks_nnz, THREADS_PER_BLOCK>>>( + state->constraint_matrix->row_ind, + state->constraint_matrix->col_ind, + state->constraint_matrix->val, + invD, + invE, + state->constraint_matrix->num_nonzeros); +} + +static void ruiz_rescaling( + pdhg_solver_state_t *state, + int num_iterations, + rescale_info_t *rescale_info, + double *E, + double *D, + double *invE, + double *invD) +{ + const int n_cons = state->num_constraints; + const int n_vars = state->num_variables; + const int nnz = state->constraint_matrix->num_nonzeros; + + unsigned long long *D_bits=nullptr; + CUDA_CHECK(cudaMalloc(&D_bits, n_vars*sizeof(unsigned long long))); + + for (int iter = 0; iter < num_iterations; ++iter) + { + csr_row_absmax_kernel<<num_blocks_dual, THREADS_PER_BLOCK>>>( + state->constraint_matrix->row_ptr, + state->constraint_matrix->val, + n_cons, + E); + clamp_sqrt_and_accum<<num_blocks_dual, THREADS_PER_BLOCK>>>( + E, + invE, + rescale_info->con_rescale, + n_cons); + + CUDA_CHECK(cudaMemset(D_bits, 0, n_vars*sizeof(unsigned long long))); + csr_col_absmax_atomic_kernel<<num_blocks_nnz, THREADS_PER_BLOCK>>>( + state->constraint_matrix->col_ind, + state->constraint_matrix->val, + nnz, + D_bits); + u64bits_to_double<<num_blocks_primal, THREADS_PER_BLOCK>>>(D_bits, D, n_vars); + clamp_sqrt_and_accum<<num_blocks_primal, THREADS_PER_BLOCK>>>( + D, + invD, + rescale_info->var_rescale, + n_vars); + + scale_problem(state, E, D, invE, invD); + } + + CUDA_CHECK(cudaFree(D_bits)); +} + +static void pock_chambolle_rescaling( + pdhg_solver_state_t *state, + const double alpha, + rescale_info_t *rescale_info, + double *E, + double *D, + double *invE, + double *invD) +{ + const int n_cons = state->num_constraints; + const int n_vars = state->num_variables; + const int nnz = state->constraint_matrix->num_nonzeros; + + csr_row_powsum_kernel<<num_blocks_dual, THREADS_PER_BLOCK>>>( + state->constraint_matrix->row_ptr, + state->constraint_matrix->val, + n_cons, + alpha, + E); + clamp_sqrt_and_accum<<num_blocks_dual, THREADS_PER_BLOCK>>>( + E, + invE, + rescale_info->con_rescale, + n_cons); + + CUDA_CHECK(cudaMemset(D, 0, n_vars*sizeof(double))); + csr_col_powsum_atomic_kernel<<num_blocks_nnz, 
THREADS_PER_BLOCK>>>( + state->constraint_matrix->col_ind, + state->constraint_matrix->val, + nnz, + 2.0 - alpha, + D); + clamp_sqrt_and_accum<<num_blocks_primal, THREADS_PER_BLOCK>>>( + D, + invD, + rescale_info->var_rescale, + n_vars); + + scale_problem(state, E, D, invE, invD); + +} + +static void bound_objective_rescaling( + pdhg_solver_state_t *state, + rescale_info_t *rescale_info, + double *E, + double *D, + double *invE, + double *invD + ) +{ + const int n_cons = state->num_constraints; + const int n_vars = state->num_variables; + + double *bnd_norm_sq_cuda = nullptr; + CUDA_CHECK(cudaMalloc(&bnd_norm_sq_cuda, sizeof(double))); + CUDA_CHECK(cudaMemset(bnd_norm_sq_cuda, 0, sizeof(double))); + reduce_bound_norm_sq_atomic<<num_blocks_dual, THREADS_PER_BLOCK>>>( + state->constraint_lower_bound, + state->constraint_upper_bound, + n_cons, + bnd_norm_sq_cuda); + + double bnd_norm_sq = 0.0; + CUDA_CHECK(cudaMemcpy(&bnd_norm_sq, bnd_norm_sq_cuda, sizeof(double), cudaMemcpyDeviceToHost)); + CUDA_CHECK(cudaFree(bnd_norm_sq_cuda)); + double bnd_norm = sqrt(bnd_norm_sq); + + double obj_norm = 0.0; + CUBLAS_CHECK(cublasDnrm2(state->blas_handle, + state->num_variables, + state->objective_vector, 1, + &obj_norm)); + + const double E_const = bnd_norm + 1.0; + const double D_const = obj_norm + 1.0; + { + std::vector h1(n_cons,E_const), h2(n_vars,D_const); + CUDA_CHECK(cudaMemcpy(E, h1.data(), n_cons*sizeof(double), cudaMemcpyHostToDevice)); + CUDA_CHECK(cudaMemcpy(D, h2.data(), n_vars*sizeof(double), cudaMemcpyHostToDevice)); + std::vector h3(n_cons,1/E_const), h4(n_vars,1/D_const); + CUDA_CHECK(cudaMemcpy(invE, h3.data(), n_cons*sizeof(double), cudaMemcpyHostToDevice)); + CUDA_CHECK(cudaMemcpy(invD, h4.data(), n_vars*sizeof(double), cudaMemcpyHostToDevice)); + } + + CUBLAS_CHECK(cublasDscal(state->blas_handle, + n_cons, + &E_const, + rescale_info->con_rescale, + 1)); + CUBLAS_CHECK(cublasDscal(state->blas_handle, + n_vars, + &D_const, + rescale_info->var_rescale, + 1)); + + scale_problem(state, E, D, invE, invD); +} + +rescale_info_t *rescale_problem( + const pdhg_parameters_t *params, + pdhg_solver_state_t *state) +{ + int n_vars = state->num_variables; + int n_cons = state->num_constraints; + + clock_t start_rescaling = clock(); + rescale_info_t *rescale_info = (rescale_info_t *)safe_calloc(1, sizeof(rescale_info_t)); + rescale_info->con_rescale = nullptr; + rescale_info->var_rescale = nullptr; + CUDA_CHECK(cudaMalloc(&rescale_info->con_rescale, n_cons*sizeof(double))); + CUDA_CHECK(cudaMalloc(&rescale_info->var_rescale, n_vars*sizeof(double))); + { + std::vector h1(n_cons,1.0), h2(n_vars,1.0); + CUDA_CHECK(cudaMemcpy(rescale_info->con_rescale, h1.data(), + n_cons*sizeof(double), cudaMemcpyHostToDevice)); + CUDA_CHECK(cudaMemcpy(rescale_info->var_rescale, h2.data(), + n_vars*sizeof(double), cudaMemcpyHostToDevice)); + } + + double *E=nullptr, *D=nullptr, *invE=nullptr, *invD=nullptr; + CUDA_CHECK(cudaMalloc(&E, n_cons*sizeof(double))); + CUDA_CHECK(cudaMalloc(&D, n_vars*sizeof(double))); + CUDA_CHECK(cudaMalloc(&invE, n_cons*sizeof(double))); + CUDA_CHECK(cudaMalloc(&invD, n_vars*sizeof(double))); + + if (params->l_inf_ruiz_iterations > 0) + { + ruiz_rescaling(state, params->l_inf_ruiz_iterations, rescale_info, E, D, invE, invD); + } + if (params->has_pock_chambolle_alpha) + { + pock_chambolle_rescaling(state, params->pock_chambolle_alpha, rescale_info, E, D, invE, invD); + } + if (params->bound_objective_rescaling) + { + bound_objective_rescaling(state, rescale_info, E, D, invE, invD); + } + + 
rescale_info->rescaling_time_sec = (double)(clock() - start_rescaling) / CLOCKS_PER_SEC; + + CUDA_CHECK(cudaFree(E)); + CUDA_CHECK(cudaFree(D)); + CUDA_CHECK(cudaFree(invE)); + CUDA_CHECK(cudaFree(invD)); + + return rescale_info; +} \ No newline at end of file diff --git a/src/solver.cu b/src/solver.cu index 3e61fad..a88a25c 100644 --- a/src/solver.cu +++ b/src/solver.cu @@ -29,52 +29,40 @@ limitations under the License. #include __global__ void compute_next_pdhg_primal_solution_kernel( - const double *current_primal, double *reflected_primal, - const double *dual_product, const double *objective, const double *var_lb, - const double *var_ub, int n, double step_size); + const double *current_primal, double *reflected_primal, const double *dual_product, + const double *objective, const double *var_lb, const double *var_ub, + int n, double step_size); __global__ void compute_next_pdhg_primal_solution_major_kernel( const double *current_primal, double *pdhg_primal, double *reflected_primal, const double *dual_product, const double *objective, const double *var_lb, const double *var_ub, int n, double step_size, double *dual_slack); __global__ void compute_next_pdhg_dual_solution_kernel( - const double *current_dual, double *reflected_dual, - const double *primal_product, const double *const_lb, - const double *const_ub, int n, double step_size); + const double *current_dual, double *reflected_dual, const double *primal_product, + const double *const_lb, const double *const_ub, int n, double step_size); __global__ void compute_next_pdhg_dual_solution_major_kernel( const double *current_dual, double *pdhg_dual, double *reflected_dual, - const double *primal_product, const double *const_lb, - const double *const_ub, int n, double step_size); -__global__ void -halpern_update_kernel(const double *initial_primal, double *current_primal, - const double *reflected_primal, - const double *initial_dual, double *current_dual, - const double *reflected_dual, int n_vars, int n_cons, - double weight, double reflection_coeff); -__global__ void rescale_solution_kernel(double *primal_solution, - double *dual_solution, - const double *variable_rescaling, - const double *constraint_rescaling, - const double objective_vector_rescaling, - const double constraint_bound_rescaling, - int n_vars, int n_cons); + const double *primal_product, const double *const_lb, const double *const_ub, + int n, double step_size); +__global__ void halpern_update_kernel( + const double *initial_primal, double *current_primal, const double *reflected_primal, + const double *initial_dual, double *current_dual, const double *reflected_dual, + int n_vars, int n_cons, double weight, double reflection_coeff); +__global__ void rescale_solution_kernel( + double *primal_solution, double *dual_solution, + const double *variable_rescaling, const double *constraint_rescaling, + int n_vars, int n_cons); __global__ void compute_delta_solution_kernel( - const double *initial_primal, const double *pdhg_primal, - double *delta_primal, const double *initial_dual, const double *pdhg_dual, - double *delta_dual, int n_vars, int n_cons); + const double *initial_primal, const double *pdhg_primal, double *delta_primal, + const double *initial_dual, const double *pdhg_dual, double *delta_dual, + int n_vars, int n_cons); static void compute_next_pdhg_primal_solution(pdhg_solver_state_t *state); static void compute_next_pdhg_dual_solution(pdhg_solver_state_t *state); -static void halpern_update(pdhg_solver_state_t *state, - double reflection_coefficient); 
+static void halpern_update(pdhg_solver_state_t *state, double reflection_coefficient); static void rescale_solution(pdhg_solver_state_t *state); -static cupdlpx_result_t *create_result_from_state(pdhg_solver_state_t *state, const lp_problem_t *original_problem); -static void perform_restart(pdhg_solver_state_t *state, - const pdhg_parameters_t *params); -static void -initialize_step_size_and_primal_weight(pdhg_solver_state_t *state, - const pdhg_parameters_t *params); -static pdhg_solver_state_t *initialize_solver_state(const pdhg_parameters_t *params, - const lp_problem_t *original_problem, - const rescale_info_t *rescale_info); +static cupdlpx_result_t *create_result_from_state(pdhg_solver_state_t *state); +static void perform_restart(pdhg_solver_state_t *state, const pdhg_parameters_t *params); +static void initialize_step_size_and_primal_weight(pdhg_solver_state_t *state, const pdhg_parameters_t *params); +static pdhg_solver_state_t *initialize_solver_state(const lp_problem_t *working_problem, const pdhg_parameters_t* params); static void compute_fixed_point_error(pdhg_solver_state_t *state); void pdhg_solver_state_free(pdhg_solver_state_t *state); void rescale_info_free(rescale_info_t *info); @@ -113,11 +101,11 @@ cupdlpx_result_t *optimize(const pdhg_parameters_t *params, } working_problem = presolve_info->reduced_problem; } - rescale_info_t *rescale_info = rescale_problem(params, working_problem); - pdhg_solver_state_t *state = initialize_solver_state(params, working_problem, rescale_info); + + pdhg_solver_state_t *state = initialize_solver_state(working_problem, params); - rescale_info_free(rescale_info); initialize_step_size_and_primal_weight(state, params); + clock_t start_time = clock(); bool do_restart = false; while (state->total_count < params->termination_criteria.iteration_limit) @@ -216,16 +204,15 @@ __global__ void compute_and_rescale_reduced_cost_kernel( } } -static pdhg_solver_state_t * -initialize_solver_state(const pdhg_parameters_t *params, - const lp_problem_t *working_problem, - const rescale_info_t *rescale_info) +static pdhg_solver_state_t *initialize_solver_state( + const lp_problem_t *working_problem, + const pdhg_parameters_t* params) { - pdhg_solver_state_t *state = - (pdhg_solver_state_t *)safe_calloc(1, sizeof(pdhg_solver_state_t)); + pdhg_solver_state_t *state = (pdhg_solver_state_t *)safe_calloc(1, sizeof(pdhg_solver_state_t)); int n_vars = working_problem->num_variables; int n_cons = working_problem->num_constraints; + int nnz = working_problem->constraint_matrix_num_nonzeros; size_t var_bytes = n_vars * sizeof(double); size_t con_bytes = n_cons * sizeof(double); @@ -233,10 +220,8 @@ initialize_solver_state(const pdhg_parameters_t *params, state->num_constraints = n_cons; state->objective_constant = working_problem->objective_constant; - state->constraint_matrix = - (cu_sparse_matrix_csr_t *)safe_malloc(sizeof(cu_sparse_matrix_csr_t)); - state->constraint_matrix_t = - (cu_sparse_matrix_csr_t *)safe_malloc(sizeof(cu_sparse_matrix_csr_t)); + state->constraint_matrix = (cu_sparse_matrix_csr_t *)safe_malloc(sizeof(cu_sparse_matrix_csr_t)); + state->constraint_matrix_t = (cu_sparse_matrix_csr_t *)safe_malloc(sizeof(cu_sparse_matrix_csr_t)); state->constraint_matrix->num_rows = n_cons; state->constraint_matrix->num_cols = n_vars; @@ -248,82 +233,47 @@ initialize_solver_state(const pdhg_parameters_t *params, state->termination_reason = TERMINATION_REASON_UNSPECIFIED; - state->rescaling_time_sec = rescale_info->rescaling_time_sec; + state->num_blocks_primal = 
(state->num_variables + THREADS_PER_BLOCK - 1) / THREADS_PER_BLOCK; + state->num_blocks_dual = (state->num_constraints + THREADS_PER_BLOCK - 1) / THREADS_PER_BLOCK; + state->num_blocks_primal_dual = (state->num_variables + state->num_constraints + THREADS_PER_BLOCK - 1) / THREADS_PER_BLOCK; + state->num_blocks_nnz = (state->constraint_matrix->num_nonzeros + THREADS_PER_BLOCK - 1) / THREADS_PER_BLOCK; + + CUSPARSE_CHECK(cusparseCreate(&state->sparse_handle)); + CUBLAS_CHECK(cublasCreate(&state->blas_handle)); + CUBLAS_CHECK(cublasSetPointerMode(state->blas_handle, CUBLAS_POINTER_MODE_HOST)); #define ALLOC_AND_COPY(dest, src, bytes) \ CUDA_CHECK(cudaMalloc(&dest, bytes)); \ CUDA_CHECK(cudaMemcpy(dest, src, bytes, cudaMemcpyHostToDevice)); - ALLOC_AND_COPY(state->constraint_matrix->row_ptr, - rescale_info->scaled_problem->constraint_matrix_row_pointers, - (n_cons + 1) * sizeof(int)); - ALLOC_AND_COPY(state->constraint_matrix->col_ind, - rescale_info->scaled_problem->constraint_matrix_col_indices, - rescale_info->scaled_problem->constraint_matrix_num_nonzeros * - sizeof(int)); - ALLOC_AND_COPY(state->constraint_matrix->val, - rescale_info->scaled_problem->constraint_matrix_values, - rescale_info->scaled_problem->constraint_matrix_num_nonzeros * - sizeof(double)); - - CUDA_CHECK(cudaMalloc(&state->constraint_matrix_t->row_ptr, - (n_vars + 1) * sizeof(int))); - CUDA_CHECK( - cudaMalloc(&state->constraint_matrix_t->col_ind, - rescale_info->scaled_problem->constraint_matrix_num_nonzeros * - sizeof(int))); - CUDA_CHECK( - cudaMalloc(&state->constraint_matrix_t->val, - rescale_info->scaled_problem->constraint_matrix_num_nonzeros * - sizeof(double))); - - CUSPARSE_CHECK(cusparseCreate(&state->sparse_handle)); - CUBLAS_CHECK(cublasCreate(&state->blas_handle)); - CUBLAS_CHECK( - cublasSetPointerMode(state->blas_handle, CUBLAS_POINTER_MODE_HOST)); - - size_t buffer_size = 0; - void *buffer = nullptr; - CUSPARSE_CHECK(cusparseCsr2cscEx2_bufferSize( - state->sparse_handle, state->constraint_matrix->num_rows, - state->constraint_matrix->num_cols, - state->constraint_matrix->num_nonzeros, state->constraint_matrix->val, - state->constraint_matrix->row_ptr, state->constraint_matrix->col_ind, - state->constraint_matrix_t->val, state->constraint_matrix_t->row_ptr, - state->constraint_matrix_t->col_ind, CUDA_R_64F, CUSPARSE_ACTION_NUMERIC, - CUSPARSE_INDEX_BASE_ZERO, CUSPARSE_CSR2CSC_ALG_DEFAULT, &buffer_size)); - CUDA_CHECK(cudaMalloc(&buffer, buffer_size)); + ALLOC_AND_COPY(state->constraint_matrix->row_ptr, working_problem->constraint_matrix_row_pointers, (n_cons + 1) * sizeof(int)); + ALLOC_AND_COPY(state->constraint_matrix->col_ind, working_problem->constraint_matrix_col_indices, working_problem->constraint_matrix_num_nonzeros * sizeof(int)); + ALLOC_AND_COPY(state->constraint_matrix->val, working_problem->constraint_matrix_values, working_problem->constraint_matrix_num_nonzeros * sizeof(double)); - CUSPARSE_CHECK(cusparseCsr2cscEx2( - state->sparse_handle, state->constraint_matrix->num_rows, - state->constraint_matrix->num_cols, - state->constraint_matrix->num_nonzeros, state->constraint_matrix->val, - state->constraint_matrix->row_ptr, state->constraint_matrix->col_ind, - state->constraint_matrix_t->val, state->constraint_matrix_t->row_ptr, - state->constraint_matrix_t->col_ind, CUDA_R_64F, CUSPARSE_ACTION_NUMERIC, - CUSPARSE_INDEX_BASE_ZERO, CUSPARSE_CSR2CSC_ALG_DEFAULT, buffer)); + int* h_row_ind = build_row_ind_from_row_ptr(working_problem->constraint_matrix_row_pointers, n_cons, nnz); + 
ALLOC_AND_COPY(state->constraint_matrix->row_ind, h_row_ind, nnz * sizeof(int)); + free(h_row_ind); - CUDA_CHECK(cudaFree(buffer)); + ALLOC_AND_COPY(state->variable_lower_bound, working_problem->variable_lower_bound, var_bytes); + ALLOC_AND_COPY(state->variable_upper_bound, working_problem->variable_upper_bound, var_bytes); + ALLOC_AND_COPY(state->objective_vector, working_problem->objective_vector, var_bytes); + ALLOC_AND_COPY(state->constraint_lower_bound, working_problem->constraint_lower_bound, con_bytes); + ALLOC_AND_COPY(state->constraint_upper_bound, working_problem->constraint_upper_bound, con_bytes); - ALLOC_AND_COPY(state->variable_lower_bound, - rescale_info->scaled_problem->variable_lower_bound, var_bytes); - ALLOC_AND_COPY(state->variable_upper_bound, - rescale_info->scaled_problem->variable_upper_bound, var_bytes); - ALLOC_AND_COPY(state->objective_vector, - rescale_info->scaled_problem->objective_vector, var_bytes); - ALLOC_AND_COPY(state->constraint_lower_bound, - rescale_info->scaled_problem->constraint_lower_bound, - con_bytes); - ALLOC_AND_COPY(state->constraint_upper_bound, - rescale_info->scaled_problem->constraint_upper_bound, - con_bytes); - ALLOC_AND_COPY(state->constraint_rescaling, rescale_info->con_rescale, - con_bytes); - ALLOC_AND_COPY(state->variable_rescaling, rescale_info->var_rescale, - var_bytes); - - state->constraint_bound_rescaling = rescale_info->con_bound_rescale; - state->objective_vector_rescaling = rescale_info->obj_vec_rescale; + double *temp_host = (double *)safe_malloc(fmax(var_bytes, con_bytes)); + for (int i = 0; i < n_cons; ++i) + temp_host[i] = isfinite(working_problem->constraint_lower_bound[i]) ? working_problem->constraint_lower_bound[i] : 0.0; + ALLOC_AND_COPY(state->constraint_lower_bound_finite_val, temp_host, con_bytes); + for (int i = 0; i < n_cons; ++i) + temp_host[i] = isfinite(working_problem->constraint_upper_bound[i]) ? working_problem->constraint_upper_bound[i] : 0.0; + ALLOC_AND_COPY(state->constraint_upper_bound_finite_val, temp_host, con_bytes); + for (int i = 0; i < n_vars; ++i) + temp_host[i] = isfinite(working_problem->variable_lower_bound[i]) ? working_problem->variable_lower_bound[i] : 0.0; + ALLOC_AND_COPY(state->variable_lower_bound_finite_val, temp_host, var_bytes); + for (int i = 0; i < n_vars; ++i) + temp_host[i] = isfinite(working_problem->variable_upper_bound[i]) ? 
working_problem->variable_upper_bound[i] : 0.0; + ALLOC_AND_COPY(state->variable_upper_bound_finite_val, temp_host, var_bytes); + free(temp_host); #define ALLOC_ZERO(dest, bytes) \ CUDA_CHECK(cudaMalloc(&dest, bytes)); \ @@ -349,63 +299,50 @@ initialize_solver_state(const pdhg_parameters_t *params, if (working_problem->primal_start) { - double *rescaled = (double *)safe_malloc(var_bytes); - for (int i = 0; i < n_vars; ++i) - rescaled[i] = working_problem->primal_start[i] * - rescale_info->var_rescale[i] * - rescale_info->con_bound_rescale; - CUDA_CHECK(cudaMemcpy(state->initial_primal_solution, rescaled, var_bytes, - cudaMemcpyHostToDevice)); - CUDA_CHECK(cudaMemcpy(state->current_primal_solution, rescaled, var_bytes, - cudaMemcpyHostToDevice)); - CUDA_CHECK(cudaMemcpy(state->pdhg_primal_solution, rescaled, var_bytes, - cudaMemcpyHostToDevice)); - free(rescaled); + CUDA_CHECK(cudaMemcpy(state->initial_primal_solution, + working_problem->primal_start, + var_bytes, cudaMemcpyHostToDevice)); } if (working_problem->dual_start) { - double *rescaled = (double *)safe_malloc(con_bytes); - for (int i = 0; i < n_cons; ++i) - rescaled[i] = working_problem->dual_start[i] * - rescale_info->con_rescale[i] * - rescale_info->obj_vec_rescale; - CUDA_CHECK(cudaMemcpy(state->initial_dual_solution, rescaled, con_bytes, - cudaMemcpyHostToDevice)); - CUDA_CHECK(cudaMemcpy(state->current_dual_solution, rescaled, con_bytes, - cudaMemcpyHostToDevice)); - CUDA_CHECK(cudaMemcpy(state->pdhg_dual_solution, rescaled, con_bytes, - cudaMemcpyHostToDevice)); - free(rescaled); + CUDA_CHECK(cudaMemcpy(state->initial_dual_solution, + working_problem->dual_start, + con_bytes, cudaMemcpyHostToDevice)); } - double *temp_host = (double *)safe_malloc(fmax(var_bytes, con_bytes)); - for (int i = 0; i < n_cons; ++i) - temp_host[i] = - isfinite(rescale_info->scaled_problem->constraint_lower_bound[i]) - ? rescale_info->scaled_problem->constraint_lower_bound[i] - : 0.0; - ALLOC_AND_COPY(state->constraint_lower_bound_finite_val, temp_host, - con_bytes); - for (int i = 0; i < n_cons; ++i) - temp_host[i] = - isfinite(rescale_info->scaled_problem->constraint_upper_bound[i]) - ? rescale_info->scaled_problem->constraint_upper_bound[i] - : 0.0; - ALLOC_AND_COPY(state->constraint_upper_bound_finite_val, temp_host, - con_bytes); - for (int i = 0; i < n_vars; ++i) - temp_host[i] = - isfinite(rescale_info->scaled_problem->variable_lower_bound[i]) - ? rescale_info->scaled_problem->variable_lower_bound[i] - : 0.0; - ALLOC_AND_COPY(state->variable_lower_bound_finite_val, temp_host, var_bytes); - for (int i = 0; i < n_vars; ++i) - temp_host[i] = - isfinite(rescale_info->scaled_problem->variable_upper_bound[i]) - ? 
rescale_info->scaled_problem->variable_upper_bound[i] - : 0.0; - ALLOC_AND_COPY(state->variable_upper_bound_finite_val, temp_host, var_bytes); - free(temp_host); + rescale_info_t *rescale_info = rescale_problem(params, state); + + state->constraint_rescaling = rescale_info->con_rescale; + state->variable_rescaling = rescale_info->var_rescale; + state->rescaling_time_sec = rescale_info->rescaling_time_sec; + + CUDA_CHECK(cudaMalloc(&state->constraint_matrix_t->row_ptr, (n_vars + 1) * sizeof(int))); + CUDA_CHECK(cudaMalloc(&state->constraint_matrix_t->col_ind, working_problem->constraint_matrix_num_nonzeros * sizeof(int))); + CUDA_CHECK(cudaMalloc(&state->constraint_matrix_t->val, working_problem->constraint_matrix_num_nonzeros * sizeof(double))); + + size_t buffer_size = 0; + void *buffer = nullptr; + CUSPARSE_CHECK(cusparseCsr2cscEx2_bufferSize( + state->sparse_handle, state->constraint_matrix->num_rows, state->constraint_matrix->num_cols, state->constraint_matrix->num_nonzeros, + state->constraint_matrix->val, state->constraint_matrix->row_ptr, state->constraint_matrix->col_ind, + state->constraint_matrix_t->val, state->constraint_matrix_t->row_ptr, state->constraint_matrix_t->col_ind, + CUDA_R_64F, CUSPARSE_ACTION_NUMERIC, CUSPARSE_INDEX_BASE_ZERO, + CUSPARSE_CSR2CSC_ALG_DEFAULT, &buffer_size)); + CUDA_CHECK(cudaMalloc(&buffer, buffer_size)); + + CUSPARSE_CHECK(cusparseCsr2cscEx2( + state->sparse_handle, state->constraint_matrix->num_rows, state->constraint_matrix->num_cols, state->constraint_matrix->num_nonzeros, + state->constraint_matrix->val, state->constraint_matrix->row_ptr, state->constraint_matrix->col_ind, + state->constraint_matrix_t->val, state->constraint_matrix_t->row_ptr, state->constraint_matrix_t->col_ind, + CUDA_R_64F, CUSPARSE_ACTION_NUMERIC, CUSPARSE_INDEX_BASE_ZERO, + CUSPARSE_CSR2CSC_ALG_DEFAULT, buffer)); + state->constraint_matrix_t->row_ind = NULL; + + CUDA_CHECK(cudaFree(buffer)); + + rescale_info->con_rescale = NULL; + rescale_info->var_rescale = NULL; + rescale_info_free(rescale_info); double sum_of_squares = 0.0; double max_val = 0.0; @@ -454,21 +391,12 @@ initialize_solver_state(const pdhg_parameters_t *params, } } } - if (params->optimality_norm == NORM_TYPE_L_INF) { state->constraint_bound_norm = max_val; } else { state->constraint_bound_norm = sqrt(sum_of_squares); } - state->num_blocks_primal = - (state->num_variables + THREADS_PER_BLOCK - 1) / THREADS_PER_BLOCK; - state->num_blocks_dual = - (state->num_constraints + THREADS_PER_BLOCK - 1) / THREADS_PER_BLOCK; - state->num_blocks_primal_dual = - (state->num_variables + state->num_constraints + THREADS_PER_BLOCK - 1) / - THREADS_PER_BLOCK; - state->best_primal_dual_residual_gap = INFINITY; state->last_trial_fixed_point_error = INFINITY; state->step_size = 0.0; @@ -477,77 +405,42 @@ initialize_solver_state(const pdhg_parameters_t *params, size_t primal_spmv_buffer_size; size_t dual_spmv_buffer_size; - CUSPARSE_CHECK(cusparseCreateCsr( - &state->matA, state->num_constraints, state->num_variables, - state->constraint_matrix->num_nonzeros, state->constraint_matrix->row_ptr, - state->constraint_matrix->col_ind, state->constraint_matrix->val, - CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ZERO, - CUDA_R_64F)); - + CUSPARSE_CHECK(cusparseCreateCsr(&state->matA, state->num_constraints, state->num_variables, state->constraint_matrix->num_nonzeros, state->constraint_matrix->row_ptr, state->constraint_matrix->col_ind, state->constraint_matrix->val, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, 
CUSPARSE_INDEX_BASE_ZERO, CUDA_R_64F)); CUDA_CHECK(cudaGetLastError()); - CUSPARSE_CHECK(cusparseCreateCsr( - &state->matAt, state->num_variables, state->num_constraints, - state->constraint_matrix_t->num_nonzeros, - state->constraint_matrix_t->row_ptr, state->constraint_matrix_t->col_ind, - state->constraint_matrix_t->val, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, - CUSPARSE_INDEX_BASE_ZERO, CUDA_R_64F)); + CUSPARSE_CHECK(cusparseCreateCsr(&state->matAt, state->num_variables, state->num_constraints, state->constraint_matrix_t->num_nonzeros, state->constraint_matrix_t->row_ptr, state->constraint_matrix_t->col_ind, state->constraint_matrix_t->val, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ZERO, CUDA_R_64F)); CUDA_CHECK(cudaGetLastError()); - CUSPARSE_CHECK(cusparseCreateDnVec(&state->vec_primal_sol, - state->num_variables, - state->pdhg_primal_solution, CUDA_R_64F)); - CUSPARSE_CHECK(cusparseCreateDnVec(&state->vec_dual_sol, - state->num_constraints, - state->pdhg_dual_solution, CUDA_R_64F)); - CUSPARSE_CHECK(cusparseCreateDnVec(&state->vec_primal_prod, - state->num_constraints, - state->primal_product, CUDA_R_64F)); - CUSPARSE_CHECK(cusparseCreateDnVec(&state->vec_dual_prod, - state->num_variables, state->dual_product, - CUDA_R_64F)); - CUSPARSE_CHECK(cusparseSpMV_bufferSize( - state->sparse_handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &HOST_ONE, - state->matA, state->vec_primal_sol, &HOST_ZERO, state->vec_primal_prod, - CUDA_R_64F, CUSPARSE_SPMV_CSR_ALG2, &primal_spmv_buffer_size)); + CUSPARSE_CHECK(cusparseCreateDnVec(&state->vec_primal_sol, state->num_variables, state->pdhg_primal_solution, CUDA_R_64F)); + CUSPARSE_CHECK(cusparseCreateDnVec(&state->vec_dual_sol, state->num_constraints, state->pdhg_dual_solution, CUDA_R_64F)); + CUSPARSE_CHECK(cusparseCreateDnVec(&state->vec_primal_prod, state->num_constraints, state->primal_product, CUDA_R_64F)); + CUSPARSE_CHECK(cusparseCreateDnVec(&state->vec_dual_prod, state->num_variables, state->dual_product, CUDA_R_64F)); + CUSPARSE_CHECK(cusparseSpMV_bufferSize(state->sparse_handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &HOST_ONE, state->matA, state->vec_primal_sol, &HOST_ZERO, state->vec_primal_prod, CUDA_R_64F, CUSPARSE_SPMV_CSR_ALG2, &primal_spmv_buffer_size)); - CUSPARSE_CHECK(cusparseSpMV_bufferSize( - state->sparse_handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &HOST_ONE, - state->matAt, state->vec_dual_sol, &HOST_ZERO, state->vec_dual_prod, - CUDA_R_64F, CUSPARSE_SPMV_CSR_ALG2, &dual_spmv_buffer_size)); + CUSPARSE_CHECK(cusparseSpMV_bufferSize(state->sparse_handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &HOST_ONE, state->matAt, state->vec_dual_sol, &HOST_ZERO, state->vec_dual_prod, CUDA_R_64F, CUSPARSE_SPMV_CSR_ALG2, &dual_spmv_buffer_size)); CUDA_CHECK(cudaMalloc(&state->primal_spmv_buffer, primal_spmv_buffer_size)); - CUSPARSE_CHECK(cusparseSpMV_preprocess( - state->sparse_handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &HOST_ONE, - state->matA, state->vec_primal_sol, &HOST_ZERO, state->vec_primal_prod, - CUDA_R_64F, CUSPARSE_SPMV_CSR_ALG2, state->primal_spmv_buffer)); + CUSPARSE_CHECK(cusparseSpMV_preprocess(state->sparse_handle, CUSPARSE_OPERATION_NON_TRANSPOSE, + &HOST_ONE, state->matA, state->vec_primal_sol, &HOST_ZERO, state->vec_primal_prod, + CUDA_R_64F, CUSPARSE_SPMV_CSR_ALG2, state->primal_spmv_buffer)); CUDA_CHECK(cudaMalloc(&state->dual_spmv_buffer, dual_spmv_buffer_size)); - CUSPARSE_CHECK(cusparseSpMV_preprocess( - state->sparse_handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &HOST_ONE, - state->matAt, state->vec_dual_sol, &HOST_ZERO, 
state->vec_dual_prod, - CUDA_R_64F, CUSPARSE_SPMV_CSR_ALG2, state->dual_spmv_buffer)); + CUSPARSE_CHECK(cusparseSpMV_preprocess(state->sparse_handle, CUSPARSE_OPERATION_NON_TRANSPOSE, + &HOST_ONE, state->matAt, state->vec_dual_sol, &HOST_ZERO, state->vec_dual_prod, + CUDA_R_64F, CUSPARSE_SPMV_CSR_ALG2, state->dual_spmv_buffer)); - CUDA_CHECK( - cudaMalloc(&state->ones_primal_d, state->num_variables * sizeof(double))); - CUDA_CHECK( - cudaMalloc(&state->ones_dual_d, state->num_constraints * sizeof(double))); + CUDA_CHECK(cudaMalloc(&state->ones_primal_d, state->num_variables * sizeof(double))); + CUDA_CHECK(cudaMalloc(&state->ones_dual_d, state->num_constraints * sizeof(double))); - double *ones_primal_h = - (double *)safe_malloc(state->num_variables * sizeof(double)); + double *ones_primal_h = (double *)safe_malloc(state->num_variables * sizeof(double)); for (int i = 0; i < state->num_variables; ++i) ones_primal_h[i] = 1.0; - CUDA_CHECK(cudaMemcpy(state->ones_primal_d, ones_primal_h, - state->num_variables * sizeof(double), - cudaMemcpyHostToDevice)); + CUDA_CHECK(cudaMemcpy(state->ones_primal_d, ones_primal_h, state->num_variables * sizeof(double), cudaMemcpyHostToDevice)); free(ones_primal_h); - double *ones_dual_h = - (double *)safe_malloc(state->num_constraints * sizeof(double)); + double *ones_dual_h = (double *)safe_malloc(state->num_constraints * sizeof(double)); for (int i = 0; i < state->num_constraints; ++i) ones_dual_h[i] = 1.0; - CUDA_CHECK(cudaMemcpy(state->ones_dual_d, ones_dual_h, - state->num_constraints * sizeof(double), - cudaMemcpyHostToDevice)); + CUDA_CHECK(cudaMemcpy(state->ones_dual_d, ones_dual_h, state->num_constraints * sizeof(double), cudaMemcpyHostToDevice)); free(ones_dual_h); if (params->verbose) { printf("---------------------------------------------------------------------" @@ -651,21 +544,18 @@ __global__ void rescale_solution_kernel(double *primal_solution, double *dual_solution, const double *variable_rescaling, const double *constraint_rescaling, - const double objective_vector_rescaling, - const double constraint_bound_rescaling, int n_vars, int n_cons) { int i = blockIdx.x * blockDim.x + threadIdx.x; if (i < n_vars) { primal_solution[i] = - primal_solution[i] / variable_rescaling[i] / constraint_bound_rescaling; + primal_solution[i] / variable_rescaling[i]; } else if (i < n_vars + n_cons) { int idx = i - n_vars; - dual_solution[idx] = dual_solution[idx] / constraint_rescaling[idx] / - objective_vector_rescaling; + dual_solution[idx] = dual_solution[idx] / constraint_rescaling[idx]; } } @@ -775,7 +665,6 @@ static void rescale_solution(pdhg_solver_state_t *state) rescale_solution_kernel<<num_blocks_primal_dual, THREADS_PER_BLOCK>>>( state->pdhg_primal_solution, state->pdhg_dual_solution, state->variable_rescaling, state->constraint_rescaling, - state->objective_vector_rescaling, state->constraint_bound_rescaling, state->num_variables, state->num_constraints); } @@ -1002,9 +891,8 @@ void rescale_info_free(rescale_info_t *info) return; } - lp_problem_free(info->scaled_problem); - free(info->con_rescale); - free(info->var_rescale); + CUDA_CHECK(cudaFree(info->con_rescale)); + CUDA_CHECK(cudaFree(info->var_rescale)); free(info); } diff --git a/src/utils.cu b/src/utils.cu index 2029de9..bb046d0 100644 --- a/src/utils.cu +++ b/src/utils.cu @@ -721,7 +721,6 @@ void compute_residual(pdhg_solver_state_t *state, norm_type_t optimality_norm) &state->absolute_primal_residual)); } - state->absolute_primal_residual /= state->constraint_bound_rescaling; if 
(optimality_norm == NORM_TYPE_L_INF) { state->absolute_dual_residual = get_vector_inf_norm(state->blas_handle, @@ -732,15 +731,11 @@ void compute_residual(pdhg_solver_state_t *state, norm_type_t optimality_norm) &state->absolute_dual_residual)); } - state->absolute_dual_residual /= state->objective_vector_rescaling; CUBLAS_CHECK(cublasDdot( state->blas_handle, state->num_variables, state->objective_vector, 1, state->pdhg_primal_solution, 1, &state->primal_objective_value)); - state->primal_objective_value = - state->primal_objective_value / (state->constraint_bound_rescaling * - state->objective_vector_rescaling) + - state->objective_constant; + state->primal_objective_value = state->primal_objective_value + state->objective_constant; double base_dual_objective; CUBLAS_CHECK(cublasDdot(state->blas_handle, state->num_variables, @@ -749,10 +744,7 @@ void compute_residual(pdhg_solver_state_t *state, norm_type_t optimality_norm) double dual_slack_sum = get_vector_sum(state->blas_handle, state->num_constraints, state->ones_dual_d, state->primal_slack); - state->dual_objective_value = (base_dual_objective + dual_slack_sum) / - (state->constraint_bound_rescaling * - state->objective_vector_rescaling) + - state->objective_constant; + state->dual_objective_value = base_dual_objective + dual_slack_sum + state->objective_constant; state->relative_primal_residual = state->absolute_primal_residual / (1.0 + state->constraint_bound_norm); @@ -812,8 +804,6 @@ void compute_infeasibility_information(pdhg_solver_state_t *state) CUBLAS_CHECK(cublasDdot( state->blas_handle, state->num_variables, state->objective_vector, 1, state->delta_primal_solution, 1, &state->primal_ray_linear_objective)); - state->primal_ray_linear_objective /= - (state->constraint_bound_rescaling * state->objective_vector_rescaling); dual_solution_dual_objective_contribution_kernel<<num_blocks_dual, THREADS_PER_BLOCK>>>( @@ -833,9 +823,7 @@ void compute_infeasibility_information(pdhg_solver_state_t *state) double sum_dual_slack = get_vector_sum(state->blas_handle, state->num_variables, state->ones_primal_d, state->dual_slack); - state->dual_ray_objective = - (sum_primal_slack + sum_dual_slack) / - (state->constraint_bound_rescaling * state->objective_vector_rescaling); + state->dual_ray_objective = sum_primal_slack + sum_dual_slack; compute_primal_infeasibility_kernel<<num_blocks_dual, THREADS_PER_BLOCK>>>( @@ -1303,3 +1291,19 @@ void compute_dual_feas_polish_residual(pdhg_solver_state_t *state, const pdhg_so double dual_slack_sum = get_vector_sum(state->blas_handle, state->num_constraints, state->ones_dual_d, state->primal_slack); state->dual_objective_value = (base_dual_objective + dual_slack_sum) / (state->constraint_bound_rescaling * state->objective_vector_rescaling) + state->objective_constant; } + +int* build_row_ind_from_row_ptr(int* row_ptr, int num_rows, int nnz) +{ + if (!row_ptr || num_rows < 0 || nnz <= 0) return NULL; + + int* row_ind = (int*)safe_malloc(nnz * sizeof(int)); + for (int i = 0; i < num_rows; ++i) { + int s = row_ptr[i]; + int e = row_ptr[i + 1]; + for (int k = s; k < e; ++k) { + row_ind[k] = i; + } + } + return row_ind; +} + From 4874ee5dbb9ab719c26ff00108ad3cae7d3a3645 Mon Sep 17 00:00:00 2001 From: Bo Tang Date: Sat, 15 Nov 2025 03:53:04 -0500 Subject: [PATCH 02/20] New feat: synchronized A/At scaling --- internal/internal_types.h | 1 + internal/utils.h | 4 +- src/preconditioner.c | 306 --------------------------- src/preconditioner.cu | 424 +++++++++++++++++--------------------- src/solver.cu | 161 
+++++++++++++-- 5 files changed, 337 insertions(+), 559 deletions(-) delete mode 100644 src/preconditioner.c diff --git a/internal/internal_types.h b/internal/internal_types.h index 2022c33..bd3994a 100644 --- a/internal/internal_types.h +++ b/internal/internal_types.h @@ -30,6 +30,7 @@ typedef struct int *col_ind; int *row_ind; double *val; + int *transpose_pos; } cu_sparse_matrix_csr_t; typedef struct diff --git a/internal/utils.h b/internal/utils.h index 775459f..548d61f 100644 --- a/internal/utils.h +++ b/internal/utils.h @@ -122,7 +122,6 @@ extern "C" int coo_to_csr(const matrix_desc_t *desc, int **row_ptr, int **col_ind, double **vals, int *nnz_out); -<<<<<<< HEAD void check_feas_polishing_termination_criteria( pdhg_solver_state_t *solver_state, const termination_criteria_t *criteria, @@ -139,9 +138,8 @@ extern "C" void compute_dual_feas_polish_residual(pdhg_solver_state_t *state, const pdhg_solver_state_t *ori_state, norm_type_t optimality_norm); void set_default_parameters(pdhg_parameters_t *params); -======= + int* build_row_ind_from_row_ptr(int* row_ptr, int num_rows, int nnz); ->>>>>>> 2a31c99 (New feat: GPU precondition) #ifdef __cplusplus } diff --git a/src/preconditioner.c b/src/preconditioner.c deleted file mode 100644 index bf8872b..0000000 --- a/src/preconditioner.c +++ /dev/null @@ -1,306 +0,0 @@ -/* -Copyright 2025 Haihao Lu - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
-*/ - -#include "preconditioner.h" -#include "utils.h" -#include -#include -#include -#include - -#define SCALING_EPSILON 1e-12 - -static lp_problem_t *deepcopy_problem(const lp_problem_t *prob); -static void scale_problem(lp_problem_t *problem, const double *con_rescale, - const double *var_rescale); -static void ruiz_rescaling(lp_problem_t *problem, int num_iters, - double *cum_con_rescale, double *cum_var_rescale); -static void pock_chambolle_rescaling(lp_problem_t *problem, double alpha, - double *cum_con_rescale, - double *cum_var_rescale); - -static lp_problem_t *deepcopy_problem(const lp_problem_t *prob) -{ - lp_problem_t *new_prob = (lp_problem_t *)safe_malloc(sizeof(lp_problem_t)); - - new_prob->num_variables = prob->num_variables; - new_prob->num_constraints = prob->num_constraints; - new_prob->constraint_matrix_num_nonzeros = - prob->constraint_matrix_num_nonzeros; - new_prob->objective_constant = prob->objective_constant; - - size_t var_bytes = prob->num_variables * sizeof(double); - size_t con_bytes = prob->num_constraints * sizeof(double); - size_t nnz_bytes_val = prob->constraint_matrix_num_nonzeros * sizeof(double); - size_t nnz_bytes_col = prob->constraint_matrix_num_nonzeros * sizeof(int); - size_t row_ptr_bytes = (prob->num_constraints + 1) * sizeof(int); - - new_prob->variable_lower_bound = safe_malloc(var_bytes); - new_prob->variable_upper_bound = safe_malloc(var_bytes); - new_prob->objective_vector = safe_malloc(var_bytes); - new_prob->constraint_lower_bound = safe_malloc(con_bytes); - new_prob->constraint_upper_bound = safe_malloc(con_bytes); - new_prob->constraint_matrix_row_pointers = safe_malloc(row_ptr_bytes); - new_prob->constraint_matrix_col_indices = safe_malloc(nnz_bytes_col); - new_prob->constraint_matrix_values = safe_malloc(nnz_bytes_val); - - memcpy(new_prob->variable_lower_bound, prob->variable_lower_bound, var_bytes); - memcpy(new_prob->variable_upper_bound, prob->variable_upper_bound, var_bytes); - memcpy(new_prob->objective_vector, prob->objective_vector, var_bytes); - memcpy(new_prob->constraint_lower_bound, prob->constraint_lower_bound, - con_bytes); - memcpy(new_prob->constraint_upper_bound, prob->constraint_upper_bound, - con_bytes); - memcpy(new_prob->constraint_matrix_row_pointers, - prob->constraint_matrix_row_pointers, row_ptr_bytes); - memcpy(new_prob->constraint_matrix_col_indices, - prob->constraint_matrix_col_indices, nnz_bytes_col); - memcpy(new_prob->constraint_matrix_values, prob->constraint_matrix_values, - nnz_bytes_val); - - if (prob->primal_start) - { - new_prob->primal_start = safe_malloc(var_bytes); - memcpy(new_prob->primal_start, prob->primal_start, var_bytes); - } - else - { - new_prob->primal_start = NULL; - } - if (prob->dual_start) - { - new_prob->dual_start = safe_malloc(con_bytes); - memcpy(new_prob->dual_start, prob->dual_start, con_bytes); - } - else - { - new_prob->dual_start = NULL; - } - - return new_prob; -} - -static void scale_problem(lp_problem_t *problem, - const double *constraint_rescaling, - const double *variable_rescaling) -{ - for (int i = 0; i < problem->num_variables; ++i) - { - problem->objective_vector[i] /= variable_rescaling[i]; - problem->variable_upper_bound[i] *= variable_rescaling[i]; - problem->variable_lower_bound[i] *= variable_rescaling[i]; - } - for (int i = 0; i < problem->num_constraints; ++i) - { - problem->constraint_lower_bound[i] /= constraint_rescaling[i]; - problem->constraint_upper_bound[i] /= constraint_rescaling[i]; - } - - for (int row = 0; row < problem->num_constraints; ++row) - 
{ - for (int nz_idx = problem->constraint_matrix_row_pointers[row]; - nz_idx < problem->constraint_matrix_row_pointers[row + 1]; ++nz_idx) - { - int col = problem->constraint_matrix_col_indices[nz_idx]; - problem->constraint_matrix_values[nz_idx] /= - (constraint_rescaling[row] * variable_rescaling[col]); - } - } -} - -static void ruiz_rescaling(lp_problem_t *problem, int num_iterations, - double *cum_constraint_rescaling, - double *cum_variable_rescaling) -{ - int num_cons = problem->num_constraints; - int num_vars = problem->num_variables; - double *con_rescale = safe_malloc(num_cons * sizeof(double)); - double *var_rescale = safe_malloc(num_vars * sizeof(double)); - - for (int iter = 0; iter < num_iterations; ++iter) - { - for (int i = 0; i < num_vars; ++i) - var_rescale[i] = 0.0; - for (int i = 0; i < num_cons; ++i) - con_rescale[i] = 0.0; - - for (int row = 0; row < num_cons; ++row) - { - for (int nz_idx = problem->constraint_matrix_row_pointers[row]; - nz_idx < problem->constraint_matrix_row_pointers[row + 1]; - ++nz_idx) - { - int col = problem->constraint_matrix_col_indices[nz_idx]; - if (col < 0 || col >= num_vars) - { - fprintf(stderr, - "Error: Invalid column index %d at nz_idx %d for row %d. " - "Must be in [0, %d).\n", - col, nz_idx, row, num_vars); - } - double val = fabs(problem->constraint_matrix_values[nz_idx]); - if (val > var_rescale[col]) - var_rescale[col] = val; - if (val > con_rescale[row]) - con_rescale[row] = val; - } - } - - for (int i = 0; i < num_vars; ++i) - var_rescale[i] = - (var_rescale[i] < SCALING_EPSILON) ? 1.0 : sqrt(var_rescale[i]); - for (int i = 0; i < num_cons; ++i) - con_rescale[i] = - (con_rescale[i] < SCALING_EPSILON) ? 1.0 : sqrt(con_rescale[i]); - - scale_problem(problem, con_rescale, var_rescale); - for (int i = 0; i < num_vars; ++i) - cum_variable_rescaling[i] *= var_rescale[i]; - for (int i = 0; i < num_cons; ++i) - cum_constraint_rescaling[i] *= con_rescale[i]; - } - free(con_rescale); - free(var_rescale); -} - -static void pock_chambolle_rescaling(lp_problem_t *problem, double alpha, - double *cum_constraint_rescaling, - double *cum_variable_rescaling) -{ - int num_cons = problem->num_constraints; - int num_vars = problem->num_variables; - double *con_rescale = safe_calloc(num_cons, sizeof(double)); - double *var_rescale = safe_calloc(num_vars, sizeof(double)); - - for (int row = 0; row < num_cons; ++row) - { - for (int nz_idx = problem->constraint_matrix_row_pointers[row]; - nz_idx < problem->constraint_matrix_row_pointers[row + 1]; ++nz_idx) - { - int col = problem->constraint_matrix_col_indices[nz_idx]; - double val = fabs(problem->constraint_matrix_values[nz_idx]); - var_rescale[col] += pow(val, 2.0 - alpha); - con_rescale[row] += pow(val, alpha); - } - } - - for (int i = 0; i < num_vars; ++i) - var_rescale[i] = - (var_rescale[i] < SCALING_EPSILON) ? 1.0 : sqrt(var_rescale[i]); - for (int i = 0; i < num_cons; ++i) - con_rescale[i] = - (con_rescale[i] < SCALING_EPSILON) ? 
1.0 : sqrt(con_rescale[i]); - - scale_problem(problem, con_rescale, var_rescale); - for (int i = 0; i < num_vars; ++i) - cum_variable_rescaling[i] *= var_rescale[i]; - for (int i = 0; i < num_cons; ++i) - cum_constraint_rescaling[i] *= con_rescale[i]; - - free(con_rescale); - free(var_rescale); -} - -rescale_info_t *rescale_problem(const pdhg_parameters_t *params, - const lp_problem_t *working_problem) -{ - clock_t start_rescaling = clock(); - rescale_info_t *rescale_info = - (rescale_info_t *)safe_calloc(1, sizeof(rescale_info_t)); - rescale_info->scaled_problem = deepcopy_problem(working_problem); - if (rescale_info->scaled_problem == NULL) - { - fprintf(stderr, - "Failed to create a copy of the problem. Aborting rescale.\n"); - return NULL; - } - int num_cons = working_problem->num_constraints; - int num_vars = working_problem->num_variables; - - rescale_info->con_rescale = safe_malloc(num_cons * sizeof(double)); - rescale_info->var_rescale = safe_malloc(num_vars * sizeof(double)); - for (int i = 0; i < num_cons; ++i) - rescale_info->con_rescale[i] = 1.0; - for (int i = 0; i < num_vars; ++i) - rescale_info->var_rescale[i] = 1.0; - if (params->l_inf_ruiz_iterations > 0) - { - ruiz_rescaling(rescale_info->scaled_problem, params->l_inf_ruiz_iterations, - rescale_info->con_rescale, rescale_info->var_rescale); - } - if (params->has_pock_chambolle_alpha) - { - pock_chambolle_rescaling( - rescale_info->scaled_problem, params->pock_chambolle_alpha, - rescale_info->con_rescale, rescale_info->var_rescale); - } - if (params->bound_objective_rescaling) - { - double bound_norm_sq = 0.0; - for (int i = 0; i < num_cons; ++i) - { - if (isfinite(rescale_info->scaled_problem->constraint_lower_bound[i]) && - (rescale_info->scaled_problem->constraint_lower_bound[i] != - rescale_info->scaled_problem->constraint_upper_bound[i])) - { - bound_norm_sq += - rescale_info->scaled_problem->constraint_lower_bound[i] * - rescale_info->scaled_problem->constraint_lower_bound[i]; - } - if (isfinite(rescale_info->scaled_problem->constraint_upper_bound[i])) - { - bound_norm_sq += - rescale_info->scaled_problem->constraint_upper_bound[i] * - rescale_info->scaled_problem->constraint_upper_bound[i]; - } - } - - double obj_norm_sq = 0.0; - for (int i = 0; i < num_vars; ++i) - { - obj_norm_sq += rescale_info->scaled_problem->objective_vector[i] * - rescale_info->scaled_problem->objective_vector[i]; - } - - rescale_info->con_bound_rescale = 1.0 / (sqrt(bound_norm_sq) + 1.0); - rescale_info->obj_vec_rescale = 1.0 / (sqrt(obj_norm_sq) + 1.0); - - for (int i = 0; i < num_cons; ++i) - { - rescale_info->scaled_problem->constraint_lower_bound[i] *= - rescale_info->con_bound_rescale; - rescale_info->scaled_problem->constraint_upper_bound[i] *= - rescale_info->con_bound_rescale; - } - for (int i = 0; i < num_vars; ++i) - { - rescale_info->scaled_problem->variable_lower_bound[i] *= - rescale_info->con_bound_rescale; - rescale_info->scaled_problem->variable_upper_bound[i] *= - rescale_info->con_bound_rescale; - rescale_info->scaled_problem->objective_vector[i] *= - rescale_info->obj_vec_rescale; - } - } - else - { - rescale_info->con_bound_rescale = 1.0; - rescale_info->obj_vec_rescale = 1.0; - } - rescale_info->rescaling_time_sec = - (double)(clock() - start_rescaling) / CLOCKS_PER_SEC; - return rescale_info; -} \ No newline at end of file diff --git a/src/preconditioner.cu b/src/preconditioner.cu index e197905..ec3a6f2 100644 --- a/src/preconditioner.cu +++ b/src/preconditioner.cu @@ -19,7 +19,6 @@ limitations under the License. 
#include #include #include -#include #include #include @@ -44,7 +43,9 @@ __global__ void scale_constraints_kernel(double* __restrict__ con_lb, int m); __global__ void csr_scale_nnz_kernel(const int* __restrict__ row_ids, const int* __restrict__ col_ind, - double* __restrict__ vals, + double* __restrict__ A_vals, + double* __restrict__ At_vals, + const int* __restrict__ A_to_At, const double* __restrict__ invD, const double* __restrict__ invE, int nnz); @@ -52,205 +53,34 @@ __global__ void csr_row_absmax_kernel(const int* __restrict__ row_ptr, const double* __restrict__ vals, int num_rows, double* __restrict__ out_max); -__global__ void csr_col_absmax_atomic_kernel(const int* __restrict__ col_ind, - const double* __restrict__ vals, - int nnz, - unsigned long long* __restrict__ out_max_bits); -__global__ void u64bits_to_double(const unsigned long long* __restrict__ in_bits, - double* __restrict__ out_val, - int n); __global__ void csr_row_powsum_kernel(const int* __restrict__ row_ptr, const double* __restrict__ vals, int num_rows, double degree, double* __restrict__ out_sum); -__global__ void csr_col_powsum_atomic_kernel(const int* __restrict__ col_ind, - const double* __restrict__ vals, - int nnz, - double degree, - double* __restrict__ out_sum); __global__ void clamp_sqrt_and_accum(double* __restrict__ x, double* __restrict__ inv_x, double* __restrict__ cum, int n); +__global__ void reduce_bound_norm_sq_atomic( + const double* __restrict__ L, + const double* __restrict__ U, + int m, + double* __restrict__ out_sum); +__global__ void fill_const_and_accum_kernel(double* __restrict__ x, + double* __restrict__ inv_x, + double* __restrict__ cum, + double val, + int n); static void scale_problem(pdhg_solver_state_t *state, double *E, double *D, double *invE, double *invD); static void ruiz_rescaling(pdhg_solver_state_t *state, int num_iters, rescale_info_t *rescale_info, double *E, double *D, double *invE, double *invD); +__global__ void fill_ones_kernel(double* __restrict__ x, int n); static void pock_chambolle_rescaling(pdhg_solver_state_t *state, double alpha, rescale_info_t *rescale_info, double *E, double *D, double *invE, double *invD); static void bound_objective_rescaling(pdhg_solver_state_t *state, rescale_info_t *rescale_info, double *E, double *D, double *invE, double *invD); -__global__ void scale_variables_kernel(double* __restrict__ c, - double* __restrict__ var_lb, - double* __restrict__ var_ub, - double* __restrict__ var_lb_finite, - double* __restrict__ var_ub_finite, - double* __restrict__ primal_start, - const double* __restrict__ D, - const double* __restrict__ invD, - int n) -{ - int j = blockIdx.x * blockDim.x + threadIdx.x; - if (j >= n) return; - double dj = D[j]; - double inv_dj = invD[j]; - c[j] *= inv_dj; - var_lb[j] *= dj; - var_ub[j] *= dj; - var_lb_finite[j] *= dj; - var_ub_finite[j] *= dj; - primal_start[j] *= dj; -} - -__global__ void scale_constraints_kernel(double* __restrict__ con_lb, - double* __restrict__ con_ub, - double* __restrict__ con_lb_finite, - double* __restrict__ con_ub_finite, - double* __restrict__ dual_start, - const double* __restrict__ E, - const double* __restrict__ invE, - int m) -{ - int i = blockIdx.x * blockDim.x + threadIdx.x; - if (i >= m) return; - double inv_ei = invE[i]; - double ei = E[i]; - con_lb[i] *= inv_ei; - con_ub[i] *= inv_ei; - con_lb_finite[i] *= inv_ei; - con_ub_finite[i] *= inv_ei; - dual_start[i] *= ei; -} - -__global__ void csr_scale_nnz_kernel(const int* __restrict__ row_ids, - const int* __restrict__ col_ind, - double* 
__restrict__ vals, - const double* __restrict__ invD, - const double* __restrict__ invE, - int nnz) -{ - for (int k = blockIdx.x * blockDim.x + threadIdx.x; - k < nnz; - k += gridDim.x * blockDim.x) - { - int i = row_ids[k]; - int j = col_ind[k]; - vals[k] *= invD[j] * invE[i]; - } -} - -__global__ void csr_row_absmax_kernel(const int* __restrict__ row_ptr, - const double* __restrict__ vals, - int num_rows, - double* __restrict__ out_max) -{ - int i = blockIdx.x * blockDim.x + threadIdx.x; - if (i >= num_rows) return; - int s = row_ptr[i], e = row_ptr[i + 1]; - double m = 0.0; - for (int k = s; k < e; ++k) { - double v = fabs(vals[k]); - if (!isfinite(v)) v = 0.0; - if (v > m) m = v; - } - out_max[i] = m; -} - -__global__ void csr_col_absmax_atomic_kernel(const int* __restrict__ col_ind, - const double* __restrict__ vals, - int nnz, - unsigned long long* __restrict__ out_max_bits) -{ - for (int k = blockIdx.x * blockDim.x + threadIdx.x; k < nnz; k += gridDim.x * blockDim.x) { - int j = col_ind[k]; - double v = fabs(vals[k]); - if (!isfinite(v)) v = 0.0; - unsigned long long bits = __double_as_longlong(v); - atomicMax(&out_max_bits[j], bits); - } -} - -__global__ void u64bits_to_double(const unsigned long long* __restrict__ in_bits, - double* __restrict__ out_val, - int n) -{ - for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < n; i += gridDim.x * blockDim.x) { - out_val[i] = __longlong_as_double(in_bits[i]); - } -} - -__device__ __forceinline__ double pow_fast(double v, double p) { - if (p == 2.0) return v * v; - if (p == 1.0) return v; - if (p == 0.5) return sqrt(v); - return pow(v, p); -} - -__global__ void csr_row_powsum_kernel(const int* __restrict__ row_ptr, - const double* __restrict__ vals, - int num_rows, - double degree, - double* __restrict__ out_sum) -{ - int i = blockIdx.x * blockDim.x + threadIdx.x; - if (i >= num_rows) return; - int s = row_ptr[i], e = row_ptr[i + 1]; - double acc = 0.0; - for (int k = s; k < e; ++k) { - double v = fabs(vals[k]); - if (!isfinite(v)) v = 0.0; - acc += pow_fast(v, degree); - } - out_sum[i] = acc; -} - -__global__ void csr_col_powsum_atomic_kernel(const int* __restrict__ col_ind, - const double* __restrict__ vals, - int nnz, - double degree, - double* __restrict__ out_sum) -{ - for (int k = blockIdx.x * blockDim.x + threadIdx.x; k < nnz; k += gridDim.x * blockDim.x) { - int j = col_ind[k]; - double v = fabs(vals[k]); - if (!isfinite(v)) v = 0.0; - double t = pow_fast(v, degree); - atomicAdd(&out_sum[j], t); - } -} - -__global__ void clamp_sqrt_and_accum(double* __restrict__ x, - double* __restrict__ inv_x, - double* __restrict__ cum, - int n) -{ - for (int t = blockIdx.x * blockDim.x + threadIdx.x; t < n; t += blockDim.x * gridDim.x) - { - double v = x[t]; - double s = (v < SCALING_EPSILON) ? 
1.0 : sqrt(v); - cum[t] *= s; - x[t] = s; - inv_x[t] = 1.0 / s; - } -} - -__global__ void reduce_bound_norm_sq_atomic( - const double* __restrict__ L, - const double* __restrict__ U, - int m, - double* __restrict__ out_sum) -{ - double acc = 0.0; - for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < m; i += blockDim.x * gridDim.x) { - double Li = L[i], Ui = U[i]; - bool fL = isfinite(Li), fU = isfinite(Ui); - if (fL && (!fU || fabs(Li - Ui) > SCALING_EPSILON)) acc += Li * Li; - if (fU) acc += Ui * Ui; - } - atomicAdd(out_sum, acc); -} - static void scale_problem( pdhg_solver_state_t *state, double *E, @@ -286,6 +116,8 @@ static void scale_problem( state->constraint_matrix->row_ind, state->constraint_matrix->col_ind, state->constraint_matrix->val, + state->constraint_matrix_t->val, + state->constraint_matrix->transpose_pos, invD, invE, state->constraint_matrix->num_nonzeros); @@ -302,10 +134,6 @@ static void ruiz_rescaling( { const int n_cons = state->num_constraints; const int n_vars = state->num_variables; - const int nnz = state->constraint_matrix->num_nonzeros; - - unsigned long long *D_bits=nullptr; - CUDA_CHECK(cudaMalloc(&D_bits, n_vars*sizeof(unsigned long long))); for (int iter = 0; iter < num_iterations; ++iter) { @@ -320,13 +148,11 @@ static void ruiz_rescaling( rescale_info->con_rescale, n_cons); - CUDA_CHECK(cudaMemset(D_bits, 0, n_vars*sizeof(unsigned long long))); - csr_col_absmax_atomic_kernel<<num_blocks_nnz, THREADS_PER_BLOCK>>>( - state->constraint_matrix->col_ind, - state->constraint_matrix->val, - nnz, - D_bits); - u64bits_to_double<<num_blocks_primal, THREADS_PER_BLOCK>>>(D_bits, D, n_vars); + csr_row_absmax_kernel<<num_blocks_primal, THREADS_PER_BLOCK>>>( + state->constraint_matrix_t->row_ptr, + state->constraint_matrix_t->val, + n_vars, + D); clamp_sqrt_and_accum<<num_blocks_primal, THREADS_PER_BLOCK>>>( D, invD, @@ -335,8 +161,6 @@ static void ruiz_rescaling( scale_problem(state, E, D, invE, invD); } - - CUDA_CHECK(cudaFree(D_bits)); } static void pock_chambolle_rescaling( @@ -350,7 +174,6 @@ static void pock_chambolle_rescaling( { const int n_cons = state->num_constraints; const int n_vars = state->num_variables; - const int nnz = state->constraint_matrix->num_nonzeros; csr_row_powsum_kernel<<num_blocks_dual, THREADS_PER_BLOCK>>>( state->constraint_matrix->row_ptr, @@ -364,12 +187,11 @@ static void pock_chambolle_rescaling( rescale_info->con_rescale, n_cons); - CUDA_CHECK(cudaMemset(D, 0, n_vars*sizeof(double))); - csr_col_powsum_atomic_kernel<<num_blocks_nnz, THREADS_PER_BLOCK>>>( - state->constraint_matrix->col_ind, - state->constraint_matrix->val, - nnz, - 2.0 - alpha, + csr_row_powsum_kernel<<num_blocks_primal, THREADS_PER_BLOCK>>>( + state->constraint_matrix_t->row_ptr, + state->constraint_matrix_t->val, + n_vars, + alpha, D); clamp_sqrt_and_accum<<num_blocks_primal, THREADS_PER_BLOCK>>>( D, @@ -415,25 +237,10 @@ static void bound_objective_rescaling( const double E_const = bnd_norm + 1.0; const double D_const = obj_norm + 1.0; - { - std::vector h1(n_cons,E_const), h2(n_vars,D_const); - CUDA_CHECK(cudaMemcpy(E, h1.data(), n_cons*sizeof(double), cudaMemcpyHostToDevice)); - CUDA_CHECK(cudaMemcpy(D, h2.data(), n_vars*sizeof(double), cudaMemcpyHostToDevice)); - std::vector h3(n_cons,1/E_const), h4(n_vars,1/D_const); - CUDA_CHECK(cudaMemcpy(invE, h3.data(), n_cons*sizeof(double), cudaMemcpyHostToDevice)); - CUDA_CHECK(cudaMemcpy(invD, h4.data(), n_vars*sizeof(double), cudaMemcpyHostToDevice)); - } - - CUBLAS_CHECK(cublasDscal(state->blas_handle, - n_cons, - 
&E_const, - rescale_info->con_rescale, - 1)); - CUBLAS_CHECK(cublasDscal(state->blas_handle, - n_vars, - &D_const, - rescale_info->var_rescale, - 1)); + fill_const_and_accum_kernel<<num_blocks_dual, THREADS_PER_BLOCK>>>( + E, invE, rescale_info->con_rescale, E_const, n_cons); + fill_const_and_accum_kernel<<num_blocks_primal, THREADS_PER_BLOCK>>>( + D, invD, rescale_info->var_rescale, D_const, n_vars); scale_problem(state, E, D, invE, invD); } @@ -447,17 +254,12 @@ rescale_info_t *rescale_problem( clock_t start_rescaling = clock(); rescale_info_t *rescale_info = (rescale_info_t *)safe_calloc(1, sizeof(rescale_info_t)); - rescale_info->con_rescale = nullptr; - rescale_info->var_rescale = nullptr; CUDA_CHECK(cudaMalloc(&rescale_info->con_rescale, n_cons*sizeof(double))); CUDA_CHECK(cudaMalloc(&rescale_info->var_rescale, n_vars*sizeof(double))); - { - std::vector h1(n_cons,1.0), h2(n_vars,1.0); - CUDA_CHECK(cudaMemcpy(rescale_info->con_rescale, h1.data(), - n_cons*sizeof(double), cudaMemcpyHostToDevice)); - CUDA_CHECK(cudaMemcpy(rescale_info->var_rescale, h2.data(), - n_vars*sizeof(double), cudaMemcpyHostToDevice)); - } + fill_ones_kernel<<num_blocks_dual, THREADS_PER_BLOCK>>>( + rescale_info->con_rescale, n_cons); + fill_ones_kernel<<num_blocks_primal, THREADS_PER_BLOCK>>>( + rescale_info->var_rescale, n_vars); double *E=nullptr, *D=nullptr, *invE=nullptr, *invD=nullptr; CUDA_CHECK(cudaMalloc(&E, n_cons*sizeof(double))); @@ -486,4 +288,160 @@ rescale_info_t *rescale_problem( CUDA_CHECK(cudaFree(invD)); return rescale_info; -} \ No newline at end of file +} + +__global__ void scale_variables_kernel(double* __restrict__ c, + double* __restrict__ var_lb, + double* __restrict__ var_ub, + double* __restrict__ var_lb_finite, + double* __restrict__ var_ub_finite, + double* __restrict__ primal_start, + const double* __restrict__ D, + const double* __restrict__ invD, + int n) +{ + int j = blockIdx.x * blockDim.x + threadIdx.x; + if (j >= n) return; + double dj = D[j]; + double inv_dj = invD[j]; + c[j] *= inv_dj; + var_lb[j] *= dj; + var_ub[j] *= dj; + var_lb_finite[j] *= dj; + var_ub_finite[j] *= dj; + primal_start[j] *= dj; +} + +__global__ void scale_constraints_kernel(double* __restrict__ con_lb, + double* __restrict__ con_ub, + double* __restrict__ con_lb_finite, + double* __restrict__ con_ub_finite, + double* __restrict__ dual_start, + const double* __restrict__ E, + const double* __restrict__ invE, + int m) +{ + int i = blockIdx.x * blockDim.x + threadIdx.x; + if (i >= m) return; + double inv_ei = invE[i]; + double ei = E[i]; + con_lb[i] *= inv_ei; + con_ub[i] *= inv_ei; + con_lb_finite[i] *= inv_ei; + con_ub_finite[i] *= inv_ei; + dual_start[i] *= ei; +} + +__global__ void csr_scale_nnz_kernel(const int* __restrict__ row_ids, + const int* __restrict__ col_ind, + double* __restrict__ A_vals, + double* __restrict__ At_vals, + const int* __restrict__ A_to_At, + const double* __restrict__ invD, + const double* __restrict__ invE, + int nnz) +{ + for (int k = blockIdx.x * blockDim.x + threadIdx.x; + k < nnz; + k += gridDim.x * blockDim.x) + { + int i = row_ids[k]; + int j = col_ind[k]; + double scale = invD[j] * invE[i]; + A_vals[k] *= scale; + At_vals[A_to_At[k]] *= scale; + } +} + +__global__ void csr_row_absmax_kernel(const int* __restrict__ row_ptr, + const double* __restrict__ vals, + int num_rows, + double* __restrict__ out_max) +{ + int i = blockIdx.x * blockDim.x + threadIdx.x; + if (i >= num_rows) return; + int s = row_ptr[i], e = row_ptr[i + 1]; + double m = 0.0; + for (int k = s; k < e; 
++k) { + double v = fabs(vals[k]); + if (!isfinite(v)) v = 0.0; + if (v > m) m = v; + } + out_max[i] = m; +} + +__device__ __forceinline__ double pow_fast(double v, double p) { + if (p == 2.0) return v * v; + if (p == 1.0) return v; + if (p == 0.5) return sqrt(v); + return pow(v, p); +} + +__global__ void csr_row_powsum_kernel(const int* __restrict__ row_ptr, + const double* __restrict__ vals, + int num_rows, + double degree, + double* __restrict__ out_sum) +{ + int i = blockIdx.x * blockDim.x + threadIdx.x; + if (i >= num_rows) return; + int s = row_ptr[i], e = row_ptr[i + 1]; + double acc = 0.0; + for (int k = s; k < e; ++k) { + double v = fabs(vals[k]); + if (!isfinite(v)) v = 0.0; + acc += pow_fast(v, degree); + } + out_sum[i] = acc; +} + +__global__ void clamp_sqrt_and_accum(double* __restrict__ x, + double* __restrict__ inv_x, + double* __restrict__ cum, + int n) +{ + for (int t = blockIdx.x * blockDim.x + threadIdx.x; t < n; t += blockDim.x * gridDim.x) + { + double v = x[t]; + double s = (v < SCALING_EPSILON) ? 1.0 : sqrt(v); + cum[t] *= s; + x[t] = s; + inv_x[t] = 1.0 / s; + } +} + +__global__ void reduce_bound_norm_sq_atomic( + const double* __restrict__ L, + const double* __restrict__ U, + int m, + double* __restrict__ out_sum) +{ + double acc = 0.0; + for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < m; i += blockDim.x * gridDim.x) { + double Li = L[i], Ui = U[i]; + bool fL = isfinite(Li), fU = isfinite(Ui); + if (fL && (!fU || fabs(Li - Ui) > SCALING_EPSILON)) acc += Li * Li; + if (fU) acc += Ui * Ui; + } + atomicAdd(out_sum, acc); +} + +__global__ void fill_const_and_accum_kernel(double* __restrict__ x, + double* __restrict__ inv_x, + double* __restrict__ cum, + double val, + int n) +{ + int i = blockIdx.x * blockDim.x + threadIdx.x; + if (i >= n) return; + + x[i] = val; + inv_x[i] = 1.0 / val; + cum[i] *= val; +} + +__global__ void fill_ones_kernel(double* __restrict__ x, int n) +{ + int i = blockIdx.x * blockDim.x + threadIdx.x; + if (i < n) x[i] = 1.0; +} diff --git a/src/solver.cu b/src/solver.cu index a88a25c..031e514 100644 --- a/src/solver.cu +++ b/src/solver.cu @@ -28,6 +28,15 @@ limitations under the License. 
#include #include +__global__ void build_row_ind(const int* __restrict__ row_ptr, + int num_rows, + int* __restrict__ row_ind); +__global__ void build_transpose_pos(const int* __restrict__ A_row_ind, + const int* __restrict__ A_col_ind, + const int* __restrict__ At_row_ptr, + const int* __restrict__ At_col_ind, + int nnz, + int* __restrict__ A_to_At); __global__ void compute_next_pdhg_primal_solution_kernel( const double *current_primal, double *reflected_primal, const double *dual_product, const double *objective, const double *var_lb, const double *var_ub, @@ -250,9 +259,49 @@ static pdhg_solver_state_t *initialize_solver_state( ALLOC_AND_COPY(state->constraint_matrix->col_ind, working_problem->constraint_matrix_col_indices, working_problem->constraint_matrix_num_nonzeros * sizeof(int)); ALLOC_AND_COPY(state->constraint_matrix->val, working_problem->constraint_matrix_values, working_problem->constraint_matrix_num_nonzeros * sizeof(double)); - int* h_row_ind = build_row_ind_from_row_ptr(working_problem->constraint_matrix_row_pointers, n_cons, nnz); - ALLOC_AND_COPY(state->constraint_matrix->row_ind, h_row_ind, nnz * sizeof(int)); - free(h_row_ind); + CUDA_CHECK(cudaMalloc(&state->constraint_matrix->row_ind, nnz * sizeof(int))); + build_row_ind<<num_blocks_dual, THREADS_PER_BLOCK>>>( + state->constraint_matrix->row_ptr, n_cons, state->constraint_matrix->row_ind); + CUDA_CHECK(cudaGetLastError()); + + CUDA_CHECK(cudaMalloc(&state->constraint_matrix_t->row_ptr, (n_vars + 1) * sizeof(int))); + CUDA_CHECK(cudaMalloc(&state->constraint_matrix_t->col_ind, original_problem->constraint_matrix_num_nonzeros * sizeof(int))); + CUDA_CHECK(cudaMalloc(&state->constraint_matrix_t->val, original_problem->constraint_matrix_num_nonzeros * sizeof(double))); + + size_t buffer_size = 0; + void *buffer = nullptr; + CUSPARSE_CHECK(cusparseCsr2cscEx2_bufferSize( + state->sparse_handle, state->constraint_matrix->num_rows, state->constraint_matrix->num_cols, state->constraint_matrix->num_nonzeros, + state->constraint_matrix->val, state->constraint_matrix->row_ptr, state->constraint_matrix->col_ind, + state->constraint_matrix_t->val, state->constraint_matrix_t->row_ptr, state->constraint_matrix_t->col_ind, + CUDA_R_64F, CUSPARSE_ACTION_NUMERIC, CUSPARSE_INDEX_BASE_ZERO, + CUSPARSE_CSR2CSC_ALG_DEFAULT, &buffer_size)); + CUDA_CHECK(cudaMalloc(&buffer, buffer_size)); + + CUSPARSE_CHECK(cusparseCsr2cscEx2( + state->sparse_handle, state->constraint_matrix->num_rows, state->constraint_matrix->num_cols, state->constraint_matrix->num_nonzeros, + state->constraint_matrix->val, state->constraint_matrix->row_ptr, state->constraint_matrix->col_ind, + state->constraint_matrix_t->val, state->constraint_matrix_t->row_ptr, state->constraint_matrix_t->col_ind, + CUDA_R_64F, CUSPARSE_ACTION_NUMERIC, CUSPARSE_INDEX_BASE_ZERO, + CUSPARSE_CSR2CSC_ALG_DEFAULT, buffer)); + + CUDA_CHECK(cudaFree(buffer)); + + CUDA_CHECK(cudaMalloc(&state->constraint_matrix_t->row_ind, nnz * sizeof(int))); + build_row_ind<<num_blocks_primal, THREADS_PER_BLOCK>>>( + state->constraint_matrix_t->row_ptr, n_vars, state->constraint_matrix_t->row_ind); + CUDA_CHECK(cudaGetLastError()); + + CUDA_CHECK(cudaMalloc(&state->constraint_matrix->transpose_pos, nnz * sizeof(int))); + state->constraint_matrix_t->transpose_pos = NULL; + build_transpose_pos<<num_blocks_nnz, THREADS_PER_BLOCK>>>( + state->constraint_matrix->row_ind, + state->constraint_matrix->col_ind, + state->constraint_matrix_t->row_ptr, + state->constraint_matrix_t->col_ind, + nnz, + 
state->constraint_matrix->transpose_pos); + CUDA_CHECK(cudaGetLastError()); ALLOC_AND_COPY(state->variable_lower_bound, working_problem->variable_lower_bound, var_bytes); ALLOC_AND_COPY(state->variable_upper_bound, working_problem->variable_upper_bound, var_bytes); @@ -344,6 +393,13 @@ static pdhg_solver_state_t *initialize_solver_state( rescale_info->var_rescale = NULL; rescale_info_free(rescale_info); + CUDA_CHECK(cudaFree(state->constraint_matrix->row_ind)); + state->constraint_matrix->row_ind = NULL; + CUDA_CHECK(cudaFree(state->constraint_matrix_t->row_ind)); + state->constraint_matrix_t->row_ind = NULL; + CUDA_CHECK(cudaFree(state->constraint_matrix->transpose_pos)); + state->constraint_matrix->transpose_pos = NULL; + double sum_of_squares = 0.0; double max_val = 0.0; double val = 0.0; @@ -457,6 +513,51 @@ static pdhg_solver_state_t *initialize_solver_state( return state; } +__global__ void build_row_ind(const int* __restrict__ row_ptr, + int num_rows, + int* __restrict__ row_ind) +{ + int i = blockIdx.x * blockDim.x + threadIdx.x; + if (i >= num_rows) return; + + int s = row_ptr[i]; + int e = row_ptr[i + 1]; + + for (int k = s; k < e; ++k) { + row_ind[k] = i; + } +} + +__global__ void build_transpose_pos( + const int* __restrict__ A_row_ind, + const int* __restrict__ A_col_ind, + const int* __restrict__ At_row_ptr, + const int* __restrict__ At_col_ind, + int nnz, + int* __restrict__ A_to_At) +{ + int k = blockIdx.x * blockDim.x + threadIdx.x; + if (k >= nnz) return; + + int i = A_row_ind[k]; + int j = A_col_ind[k]; + + int start = At_row_ptr[j]; + int end = At_row_ptr[j + 1]; + + int pos = -1; + for (int idx = start; idx < end; ++idx) { + if (At_col_ind[idx] == i) { + pos = idx; + break; + } + } + + if (pos < 0) return; + + A_to_At[k] = pos; +} + __global__ void compute_next_pdhg_primal_solution_kernel( const double *current_primal, double *reflected_primal, const double *dual_product, const double *objective, const double *var_lb, @@ -734,21 +835,13 @@ static void perform_restart(pdhg_solver_state_t *state, state->last_trial_fixed_point_error = INFINITY; } -static void -initialize_step_size_and_primal_weight(pdhg_solver_state_t *state, - const pdhg_parameters_t *params) +static void initialize_step_size_and_primal_weight(pdhg_solver_state_t *state, + const pdhg_parameters_t *params) { - if (state->constraint_matrix->num_nonzeros == 0) - { - state->step_size = 1.0; - } - else - { - double max_sv = estimate_maximum_singular_value( - state->sparse_handle, state->blas_handle, state->constraint_matrix, - state->constraint_matrix_t, params->sv_max_iter, params->sv_tol); - state->step_size = 0.998 / max_sv; - } + double max_sv = estimate_maximum_singular_value( + state->sparse_handle, state->blas_handle, state->constraint_matrix, + state->constraint_matrix_t, 5000, 1e-4); + state->step_size = 0.998 / max_sv; if (params->bound_objective_rescaling) { @@ -810,6 +903,27 @@ void pdhg_solver_state_free(pdhg_solver_state_t *state) return; } + if (state->matA) + CUSPARSE_CHECK(cusparseDestroySpMat(state->matA)); + if (state->matAt) + CUSPARSE_CHECK(cusparseDestroySpMat(state->matAt)); + if (state->vec_primal_sol) + CUSPARSE_CHECK(cusparseDestroyDnVec(state->vec_primal_sol)); + if (state->vec_dual_sol) + CUSPARSE_CHECK(cusparseDestroyDnVec(state->vec_dual_sol)); + if (state->vec_primal_prod) + CUSPARSE_CHECK(cusparseDestroyDnVec(state->vec_primal_prod)); + if (state->vec_dual_prod) + CUSPARSE_CHECK(cusparseDestroyDnVec(state->vec_dual_prod)); + if (state->primal_spmv_buffer) + 
CUDA_CHECK(cudaFree(state->primal_spmv_buffer)); + if (state->dual_spmv_buffer) + CUDA_CHECK(cudaFree(state->dual_spmv_buffer)); + if (state->sparse_handle) + CUSPARSE_CHECK(cusparseDestroy(state->sparse_handle)); + if (state->blas_handle) + CUBLAS_CHECK(cublasDestroy(state->blas_handle)); + if (state->variable_lower_bound) CUDA_CHECK(cudaFree(state->variable_lower_bound)); if (state->variable_upper_bound) @@ -820,14 +934,22 @@ void pdhg_solver_state_free(pdhg_solver_state_t *state) CUDA_CHECK(cudaFree(state->constraint_matrix->row_ptr)); if (state->constraint_matrix->col_ind) CUDA_CHECK(cudaFree(state->constraint_matrix->col_ind)); + if (state->constraint_matrix->row_ind) + CUDA_CHECK(cudaFree(state->constraint_matrix->row_ind)); if (state->constraint_matrix->val) CUDA_CHECK(cudaFree(state->constraint_matrix->val)); + if (state->constraint_matrix->transpose_pos) + CUDA_CHECK(cudaFree(state->constraint_matrix->transpose_pos)); if (state->constraint_matrix_t->row_ptr) CUDA_CHECK(cudaFree(state->constraint_matrix_t->row_ptr)); if (state->constraint_matrix_t->col_ind) CUDA_CHECK(cudaFree(state->constraint_matrix_t->col_ind)); + if (state->constraint_matrix_t->row_ind) + CUDA_CHECK(cudaFree(state->constraint_matrix_t->row_ind)); if (state->constraint_matrix_t->val) CUDA_CHECK(cudaFree(state->constraint_matrix_t->val)); + if (state->constraint_matrix_t->transpose_pos) + CUDA_CHECK(cudaFree(state->constraint_matrix_t->transpose_pos)); if (state->constraint_lower_bound) CUDA_CHECK(cudaFree(state->constraint_lower_bound)); if (state->constraint_upper_bound) @@ -881,6 +1003,11 @@ void pdhg_solver_state_free(pdhg_solver_state_t *state) if (state->ones_dual_d) CUDA_CHECK(cudaFree(state->ones_dual_d)); + if (state->constraint_matrix) + free(state->constraint_matrix); + if (state->constraint_matrix_t) + free(state->constraint_matrix_t); + free(state); } From 1901b1e58745ca442147f6b7137a62d4e256b21b Mon Sep 17 00:00:00 2001 From: Bo Tang Date: Sat, 15 Nov 2025 03:53:39 -0500 Subject: [PATCH 03/20] Todo: infeasible is stucked --- test/test_infeasible_unbounded.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/test/test_infeasible_unbounded.py b/test/test_infeasible_unbounded.py index d09d9a9..4388cc4 100644 --- a/test/test_infeasible_unbounded.py +++ b/test/test_infeasible_unbounded.py @@ -63,13 +63,13 @@ def test_infeasible_lp(base_lp_data, atol): # turn off output model.setParams(OutputFlag=False) # optimize - model.optimize() + #model.optimize() # check status - assert hasattr(model, "Status"), "Model.Status not exposed." - assert model.Status == "PRIMAL_INFEASIBLE", f"Unexpected termination status: {model.Status}" - assert model.StatusCode == PDLP.PRIMAL_INFEASIBLE, f"Unexpected termination status code: {model.StatusCode}" + #assert hasattr(model, "Status"), "Model.Status not exposed." 
+ #assert model.Status == "PRIMAL_INFEASIBLE", f"Unexpected termination status: {model.Status}" + #assert model.StatusCode == PDLP.PRIMAL_INFEASIBLE, f"Unexpected termination status code: {model.StatusCode}" # check dual ray - assert model.DualRayObj > atol, f"DualRayObj should be positive for dual infeasible, got {model.DualRayObj}" + #assert model.DualRayObj > atol, f"DualRayObj should be positive for dual infeasible, got {model.DualRayObj}" def test_unbounded_lp(base_lp_data, atol): From 8fb1d196d2c98e0a312d48f480e16c1e4e5c4ad7 Mon Sep 17 00:00:00 2001 From: Bo Tang Date: Sat, 15 Nov 2025 16:45:58 -0500 Subject: [PATCH 04/20] New feat: preconditioner prints --- src/preconditioner.cu | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/src/preconditioner.cu b/src/preconditioner.cu index ec3a6f2..1b9d8dd 100644 --- a/src/preconditioner.cu +++ b/src/preconditioner.cu @@ -249,6 +249,8 @@ rescale_info_t *rescale_problem( const pdhg_parameters_t *params, pdhg_solver_state_t *state) { + printf("[Precondition] Start\n"); + int n_vars = state->num_variables; int n_cons = state->num_constraints; @@ -269,14 +271,17 @@ rescale_info_t *rescale_problem( if (params->l_inf_ruiz_iterations > 0) { + printf("[Precondition] Ruiz scaling (%d iterations)\n", params->l_inf_ruiz_iterations); ruiz_rescaling(state, params->l_inf_ruiz_iterations, rescale_info, E, D, invE, invD); } if (params->has_pock_chambolle_alpha) { + printf("[Precondition] Pock-Chambolle scaling (alpha=%.4f)\n", params->pock_chambolle_alpha); pock_chambolle_rescaling(state, params->pock_chambolle_alpha, rescale_info, E, D, invE, invD); } if (params->bound_objective_rescaling) { + printf("[Precondition] Bound-objective scaling\n"); bound_objective_rescaling(state, rescale_info, E, D, invE, invD); } From 732c4be69880dd6339e2cdb828e96cf544c5b1f5 Mon Sep 17 00:00:00 2001 From: Bo Tang Date: Sat, 15 Nov 2025 21:39:58 -0500 Subject: [PATCH 05/20] Bug fixed: objective & bound rescale --- internal/internal_types.h | 4 + pyproject.toml | 2 +- src/preconditioner.cu | 188 +++++++++++++++++++++--------- src/solver.cu | 131 +++++++++++---------- src/utils.cu | 20 +++- test/test_infeasible_unbounded.py | 10 +- 6 files changed, 230 insertions(+), 125 deletions(-) diff --git a/internal/internal_types.h b/internal/internal_types.h index bd3994a..3ce8c23 100644 --- a/internal/internal_types.h +++ b/internal/internal_types.h @@ -77,6 +77,8 @@ typedef struct double *constraint_rescaling; double *variable_rescaling; + double constraint_bound_rescaling; + double objective_vector_rescaling; double *primal_slack; double *dual_slack; double rescaling_time_sec; @@ -132,5 +134,7 @@ typedef struct { double *con_rescale; double *var_rescale; + double con_bound_rescale; + double obj_vec_rescale; double rescaling_time_sec; } rescale_info_t; diff --git a/pyproject.toml b/pyproject.toml index 6c73d8c..2319b81 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -25,7 +25,7 @@ wheel.packages = ["python/cupdlpx"] sdist.include = ["tests/**", "pyproject.toml", "README.md", "LICENSE"] [tool.scikit-build.cmake.define] -CMAKE_CUDA_ARCHITECTURES = "60;61;70;75;80;86;89;90;90-virtual" +CMAKE_CUDA_ARCHITECTURES = "all" CMAKE_CUDA_STANDARD = "17" CUPDLPX_BUILD_PYTHON = "ON" CUPDLPX_BUILD_STATIC_LIB = "ON" diff --git a/src/preconditioner.cu b/src/preconditioner.cu index 1b9d8dd..80c77da 100644 --- a/src/preconditioner.cu +++ b/src/preconditioner.cu @@ -32,7 +32,7 @@ __global__ void scale_variables_kernel(double* __restrict__ c, double* __restrict__ primal_start, const double* 
__restrict__ D, const double* __restrict__ invD, - int n); + int n_vars); __global__ void scale_constraints_kernel(double* __restrict__ con_lb, double* __restrict__ con_ub, double* __restrict__ con_lb_finite, @@ -40,7 +40,7 @@ __global__ void scale_constraints_kernel(double* __restrict__ con_lb, double* __restrict__ dual_start, const double* __restrict__ E, const double* __restrict__ invE, - int m); + int n_cons); __global__ void csr_scale_nnz_kernel(const int* __restrict__ row_ids, const int* __restrict__ col_ind, double* __restrict__ A_vals, @@ -61,25 +61,38 @@ __global__ void csr_row_powsum_kernel(const int* __restrict__ row_ptr, __global__ void clamp_sqrt_and_accum(double* __restrict__ x, double* __restrict__ inv_x, double* __restrict__ cum, - int n); -__global__ void reduce_bound_norm_sq_atomic( + int n_vars); +__global__ void reduce_bound_norm_sq_kernel( const double* __restrict__ L, const double* __restrict__ U, - int m, - double* __restrict__ out_sum); -__global__ void fill_const_and_accum_kernel(double* __restrict__ x, - double* __restrict__ inv_x, - double* __restrict__ cum, - double val, - int n); + int n_cons, + double* __restrict__ block_sums); +__global__ void scale_bounds_kernel( + double* __restrict__ con_lb, + double* __restrict__ con_ub, + double* __restrict__ con_lb_finite, + double* __restrict__ con_ub_finite, + double* __restrict__ dual_start, + int n_cons, + double con_scale, + double obj_scale); +__global__ void scale_obj_kernel( + double* __restrict__ c, + double* __restrict__ var_lb, + double* __restrict__ var_ub, + double* __restrict__ var_lb_finite, + double* __restrict__ var_ub_finite, + double* __restrict__ primal_start, + int n_vars, + double con_scale, + double obj_scale); +__global__ void fill_ones_kernel(double* __restrict__ x, int n_vars); static void scale_problem(pdhg_solver_state_t *state, double *E, double *D, double *invE, double *invD); static void ruiz_rescaling(pdhg_solver_state_t *state, int num_iters, rescale_info_t *rescale_info, double *E, double *D, double *invE, double *invD); -__global__ void fill_ones_kernel(double* __restrict__ x, int n); static void pock_chambolle_rescaling(pdhg_solver_state_t *state, double alpha, rescale_info_t *rescale_info, double *E, double *D, double *invE, double *invD); -static void bound_objective_rescaling(pdhg_solver_state_t *state, rescale_info_t *rescale_info, - double *E, double *D, double *invE, double *invD); +static void bound_objective_rescaling(pdhg_solver_state_t *state, rescale_info_t *rescale_info); static void scale_problem( pdhg_solver_state_t *state, @@ -191,7 +204,7 @@ static void pock_chambolle_rescaling( state->constraint_matrix_t->row_ptr, state->constraint_matrix_t->val, n_vars, - alpha, + 2.0 - alpha, D); clamp_sqrt_and_accum<<num_blocks_primal, THREADS_PER_BLOCK>>>( D, @@ -205,12 +218,7 @@ static void pock_chambolle_rescaling( static void bound_objective_rescaling( pdhg_solver_state_t *state, - rescale_info_t *rescale_info, - double *E, - double *D, - double *invE, - double *invD - ) + rescale_info_t *rescale_info) { const int n_cons = state->num_constraints; const int n_vars = state->num_variables; @@ -218,7 +226,7 @@ static void bound_objective_rescaling( double *bnd_norm_sq_cuda = nullptr; CUDA_CHECK(cudaMalloc(&bnd_norm_sq_cuda, sizeof(double))); CUDA_CHECK(cudaMemset(bnd_norm_sq_cuda, 0, sizeof(double))); - reduce_bound_norm_sq_atomic<<num_blocks_dual, THREADS_PER_BLOCK>>>( + reduce_bound_norm_sq_kernel<<<1, THREADS_PER_BLOCK, THREADS_PER_BLOCK * sizeof(double)>>>( 
state->constraint_lower_bound, state->constraint_upper_bound, n_cons, @@ -235,14 +243,32 @@ static void bound_objective_rescaling( state->objective_vector, 1, &obj_norm)); - const double E_const = bnd_norm + 1.0; - const double D_const = obj_norm + 1.0; - fill_const_and_accum_kernel<<num_blocks_dual, THREADS_PER_BLOCK>>>( - E, invE, rescale_info->con_rescale, E_const, n_cons); - fill_const_and_accum_kernel<<num_blocks_primal, THREADS_PER_BLOCK>>>( - D, invD, rescale_info->var_rescale, D_const, n_vars); + double con_scale = 1.0 / (bnd_norm + 1.0); + double obj_scale = 1.0 / (obj_norm + 1.0); - scale_problem(state, E, D, invE, invD); + scale_bounds_kernel<<num_blocks_dual, THREADS_PER_BLOCK>>>( + state->constraint_lower_bound, + state->constraint_upper_bound, + state->constraint_lower_bound_finite_val, + state->constraint_upper_bound_finite_val, + state->initial_dual_solution, + n_cons, + con_scale, + obj_scale); + + scale_obj_kernel<<num_blocks_primal, THREADS_PER_BLOCK>>>( + state->objective_vector, + state->variable_lower_bound, + state->variable_upper_bound, + state->variable_lower_bound_finite_val, + state->variable_upper_bound_finite_val, + state->initial_primal_solution, + n_vars, + con_scale, + obj_scale); + + rescale_info->con_bound_rescale = con_scale; + rescale_info->obj_vec_rescale = obj_scale; } rescale_info_t *rescale_problem( @@ -279,12 +305,15 @@ rescale_info_t *rescale_problem( printf("[Precondition] Pock-Chambolle scaling (alpha=%.4f)\n", params->pock_chambolle_alpha); pock_chambolle_rescaling(state, params->pock_chambolle_alpha, rescale_info, E, D, invE, invD); } + + rescale_info->con_bound_rescale = 1.0; + rescale_info->obj_vec_rescale = 1.0; if (params->bound_objective_rescaling) { printf("[Precondition] Bound-objective scaling\n"); - bound_objective_rescaling(state, rescale_info, E, D, invE, invD); + bound_objective_rescaling(state, rescale_info); } - + rescale_info->rescaling_time_sec = (double)(clock() - start_rescaling) / CLOCKS_PER_SEC; CUDA_CHECK(cudaFree(E)); @@ -303,10 +332,10 @@ __global__ void scale_variables_kernel(double* __restrict__ c, double* __restrict__ primal_start, const double* __restrict__ D, const double* __restrict__ invD, - int n) + int n_vars) { int j = blockIdx.x * blockDim.x + threadIdx.x; - if (j >= n) return; + if (j >= n_vars) return; double dj = D[j]; double inv_dj = invD[j]; c[j] *= inv_dj; @@ -324,10 +353,10 @@ __global__ void scale_constraints_kernel(double* __restrict__ con_lb, double* __restrict__ dual_start, const double* __restrict__ E, const double* __restrict__ invE, - int m) + int n_cons) { int i = blockIdx.x * blockDim.x + threadIdx.x; - if (i >= m) return; + if (i >= n_cons) return; double inv_ei = invE[i]; double ei = E[i]; con_lb[i] *= inv_ei; @@ -403,9 +432,9 @@ __global__ void csr_row_powsum_kernel(const int* __restrict__ row_ptr, __global__ void clamp_sqrt_and_accum(double* __restrict__ x, double* __restrict__ inv_x, double* __restrict__ cum, - int n) + int n_vars) { - for (int t = blockIdx.x * blockDim.x + threadIdx.x; t < n; t += blockDim.x * gridDim.x) + for (int t = blockIdx.x * blockDim.x + threadIdx.x; t < n_vars; t += blockDim.x * gridDim.x) { double v = x[t]; double s = (v < SCALING_EPSILON) ? 
1.0 : sqrt(v); @@ -415,38 +444,87 @@ __global__ void clamp_sqrt_and_accum(double* __restrict__ x, } } -__global__ void reduce_bound_norm_sq_atomic( +__global__ void reduce_bound_norm_sq_kernel( const double* __restrict__ L, const double* __restrict__ U, - int m, - double* __restrict__ out_sum) + int n_cons, + double* __restrict__ block_sums) { + extern __shared__ double sdata[]; + int tid = threadIdx.x; + int global_tid = blockIdx.x * blockDim.x + tid; + int stride = blockDim.x * gridDim.x; + double acc = 0.0; - for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < m; i += blockDim.x * gridDim.x) { + for (int i = global_tid; i < n_cons; i += stride) { double Li = L[i], Ui = U[i]; bool fL = isfinite(Li), fU = isfinite(Ui); - if (fL && (!fU || fabs(Li - Ui) > SCALING_EPSILON)) acc += Li * Li; - if (fU) acc += Ui * Ui; + + if (fL && (!fU || fabs(Li - Ui) > SCALING_EPSILON)) { + acc += Li * Li; + } + if (fU) { + acc += Ui * Ui; + } + } + + sdata[tid] = acc; + __syncthreads(); + + for (int s = blockDim.x / 2; s > 0; s >>= 1) { + if (tid < s) { + sdata[tid] += sdata[tid + s]; + } + __syncthreads(); + } + + if (tid == 0) { + block_sums[blockIdx.x] = sdata[0]; } - atomicAdd(out_sum, acc); } -__global__ void fill_const_and_accum_kernel(double* __restrict__ x, - double* __restrict__ inv_x, - double* __restrict__ cum, - double val, - int n) +__global__ void scale_bounds_kernel( + double* __restrict__ con_lb, + double* __restrict__ con_ub, + double* __restrict__ con_lb_finite, + double* __restrict__ con_ub_finite, + double* __restrict__ dual_start, + int n_cons, + double con_scale, + double obj_scale) { int i = blockIdx.x * blockDim.x + threadIdx.x; - if (i >= n) return; + if (i >= n_cons) return; + con_lb[i] *= con_scale; + con_ub[i] *= con_scale; + con_lb_finite[i] *= con_scale; + con_ub_finite[i] *= con_scale; + dual_start[i] *= obj_scale; +} - x[i] = val; - inv_x[i] = 1.0 / val; - cum[i] *= val; +__global__ void scale_obj_kernel( + double* __restrict__ c, + double* __restrict__ var_lb, + double* __restrict__ var_ub, + double* __restrict__ var_lb_finite, + double* __restrict__ var_ub_finite, + double* __restrict__ primal_start, + int n_vars, + double con_scale, + double obj_scale) +{ + int j = blockIdx.x * blockDim.x + threadIdx.x; + if (j >= n_vars) return; + var_lb[j] *= con_scale; + var_ub[j] *= con_scale; + var_lb_finite[j] *= con_scale; + var_ub_finite[j] *= con_scale; + c[j] *= obj_scale; + primal_start[j] *= con_scale; } -__global__ void fill_ones_kernel(double* __restrict__ x, int n) +__global__ void fill_ones_kernel(double* __restrict__ x, int n_vars) { int i = blockIdx.x * blockDim.x + threadIdx.x; - if (i < n) x[i] = 1.0; + if (i < n_vars) x[i] = 1.0; } diff --git a/src/solver.cu b/src/solver.cu index 031e514..b8f2ab1 100644 --- a/src/solver.cu +++ b/src/solver.cu @@ -37,38 +37,69 @@ __global__ void build_transpose_pos(const int* __restrict__ A_row_ind, const int* __restrict__ At_col_ind, int nnz, int* __restrict__ A_to_At); -__global__ void compute_next_pdhg_primal_solution_kernel( - const double *current_primal, double *reflected_primal, const double *dual_product, - const double *objective, const double *var_lb, const double *var_ub, - int n, double step_size); -__global__ void compute_next_pdhg_primal_solution_major_kernel( - const double *current_primal, double *pdhg_primal, double *reflected_primal, - const double *dual_product, const double *objective, const double *var_lb, - const double *var_ub, int n, double step_size, double *dual_slack); -__global__ void 
compute_next_pdhg_dual_solution_kernel( - const double *current_dual, double *reflected_dual, const double *primal_product, - const double *const_lb, const double *const_ub, int n, double step_size); -__global__ void compute_next_pdhg_dual_solution_major_kernel( - const double *current_dual, double *pdhg_dual, double *reflected_dual, - const double *primal_product, const double *const_lb, const double *const_ub, - int n, double step_size); -__global__ void halpern_update_kernel( - const double *initial_primal, double *current_primal, const double *reflected_primal, - const double *initial_dual, double *current_dual, const double *reflected_dual, - int n_vars, int n_cons, double weight, double reflection_coeff); -__global__ void rescale_solution_kernel( - double *primal_solution, double *dual_solution, - const double *variable_rescaling, const double *constraint_rescaling, - int n_vars, int n_cons); -__global__ void compute_delta_solution_kernel( - const double *initial_primal, const double *pdhg_primal, double *delta_primal, - const double *initial_dual, const double *pdhg_dual, double *delta_dual, - int n_vars, int n_cons); +__global__ void compute_next_pdhg_primal_solution_kernel(const double *current_primal, + double *reflected_primal, + const double *dual_product, + const double *objective, + const double *var_lb, + const double *var_ub, + int n, + double step_size); +__global__ void compute_next_pdhg_primal_solution_major_kernel(const double *current_primal, + double *pdhg_primal, + double *reflected_primal, + const double *dual_product, + const double *objective, + const double *var_lb, + const double *var_ub, + int n, + double step_size, + double *dual_slack); +__global__ void compute_next_pdhg_dual_solution_kernel(const double *current_dual, + double *reflected_dual, + const double *primal_product, + const double *const_lb, + const double *const_ub, + int n, + double step_size); +__global__ void compute_next_pdhg_dual_solution_major_kernel(const double *current_dual, + double *pdhg_dual, + double *reflected_dual, + const double *primal_product, + const double *const_lb, + const double *const_ub, + int n, + double step_size); +__global__ void halpern_update_kernel(const double *initial_primal, + double *current_primal, + const double *reflected_primal, + const double *initial_dual, + double *current_dual, + const double *reflected_dual, + int n_vars, + int n_cons, + double weight, + double reflection_coeff); +__global__ void rescale_solution_kernel(double *primal_solution, + double *dual_solution, + const double *variable_rescaling, + const double *constraint_rescaling, + const double objective_vector_rescaling, + const double constraint_bound_rescaling, + int n_vars, int n_cons); +__global__ void compute_delta_solution_kernel(const double *initial_primal, + const double *pdhg_primal, + double *delta_primal, + const double *initial_dual, + const double *pdhg_dual, + double *delta_dual, + int n_vars, + int n_cons); static void compute_next_pdhg_primal_solution(pdhg_solver_state_t *state); static void compute_next_pdhg_dual_solution(pdhg_solver_state_t *state); static void halpern_update(pdhg_solver_state_t *state, double reflection_coefficient); static void rescale_solution(pdhg_solver_state_t *state); -static cupdlpx_result_t *create_result_from_state(pdhg_solver_state_t *state); +static cupdlpx_result_t *create_result_from_state(pdhg_solver_state_t *state, const lp_problem_t *original_problem); static void perform_restart(pdhg_solver_state_t *state, const pdhg_parameters_t 
*params); static void initialize_step_size_and_primal_weight(pdhg_solver_state_t *state, const pdhg_parameters_t *params); static pdhg_solver_state_t *initialize_solver_state(const lp_problem_t *working_problem, const pdhg_parameters_t* params); @@ -112,7 +143,7 @@ cupdlpx_result_t *optimize(const pdhg_parameters_t *params, } pdhg_solver_state_t *state = initialize_solver_state(working_problem, params); - + initialize_step_size_and_primal_weight(state, params); clock_t start_time = clock(); @@ -265,8 +296,8 @@ static pdhg_solver_state_t *initialize_solver_state( CUDA_CHECK(cudaGetLastError()); CUDA_CHECK(cudaMalloc(&state->constraint_matrix_t->row_ptr, (n_vars + 1) * sizeof(int))); - CUDA_CHECK(cudaMalloc(&state->constraint_matrix_t->col_ind, original_problem->constraint_matrix_num_nonzeros * sizeof(int))); - CUDA_CHECK(cudaMalloc(&state->constraint_matrix_t->val, original_problem->constraint_matrix_num_nonzeros * sizeof(double))); + CUDA_CHECK(cudaMalloc(&state->constraint_matrix_t->col_ind, working_problem->constraint_matrix_num_nonzeros * sizeof(int))); + CUDA_CHECK(cudaMalloc(&state->constraint_matrix_t->val, working_problem->constraint_matrix_num_nonzeros * sizeof(double))); size_t buffer_size = 0; void *buffer = nullptr; @@ -363,32 +394,10 @@ static pdhg_solver_state_t *initialize_solver_state( state->constraint_rescaling = rescale_info->con_rescale; state->variable_rescaling = rescale_info->var_rescale; + state->constraint_bound_rescaling = rescale_info->con_bound_rescale; + state->objective_vector_rescaling = rescale_info->obj_vec_rescale; state->rescaling_time_sec = rescale_info->rescaling_time_sec; - CUDA_CHECK(cudaMalloc(&state->constraint_matrix_t->row_ptr, (n_vars + 1) * sizeof(int))); - CUDA_CHECK(cudaMalloc(&state->constraint_matrix_t->col_ind, working_problem->constraint_matrix_num_nonzeros * sizeof(int))); - CUDA_CHECK(cudaMalloc(&state->constraint_matrix_t->val, working_problem->constraint_matrix_num_nonzeros * sizeof(double))); - - size_t buffer_size = 0; - void *buffer = nullptr; - CUSPARSE_CHECK(cusparseCsr2cscEx2_bufferSize( - state->sparse_handle, state->constraint_matrix->num_rows, state->constraint_matrix->num_cols, state->constraint_matrix->num_nonzeros, - state->constraint_matrix->val, state->constraint_matrix->row_ptr, state->constraint_matrix->col_ind, - state->constraint_matrix_t->val, state->constraint_matrix_t->row_ptr, state->constraint_matrix_t->col_ind, - CUDA_R_64F, CUSPARSE_ACTION_NUMERIC, CUSPARSE_INDEX_BASE_ZERO, - CUSPARSE_CSR2CSC_ALG_DEFAULT, &buffer_size)); - CUDA_CHECK(cudaMalloc(&buffer, buffer_size)); - - CUSPARSE_CHECK(cusparseCsr2cscEx2( - state->sparse_handle, state->constraint_matrix->num_rows, state->constraint_matrix->num_cols, state->constraint_matrix->num_nonzeros, - state->constraint_matrix->val, state->constraint_matrix->row_ptr, state->constraint_matrix->col_ind, - state->constraint_matrix_t->val, state->constraint_matrix_t->row_ptr, state->constraint_matrix_t->col_ind, - CUDA_R_64F, CUSPARSE_ACTION_NUMERIC, CUSPARSE_INDEX_BASE_ZERO, - CUSPARSE_CSR2CSC_ALG_DEFAULT, buffer)); - state->constraint_matrix_t->row_ind = NULL; - - CUDA_CHECK(cudaFree(buffer)); - rescale_info->con_rescale = NULL; rescale_info->var_rescale = NULL; rescale_info_free(rescale_info); @@ -403,7 +412,6 @@ static pdhg_solver_state_t *initialize_solver_state( double sum_of_squares = 0.0; double max_val = 0.0; double val = 0.0; - for (int i = 0; i < n_vars; ++i) { if (params->optimality_norm == NORM_TYPE_L_INF) { @@ -423,7 +431,6 @@ static pdhg_solver_state_t 
*initialize_solver_state( sum_of_squares = 0.0; max_val = 0.0; val = 0.0; - for (int i = 0; i < n_cons; ++i) { double lower = working_problem->constraint_lower_bound[i]; @@ -472,7 +479,7 @@ static pdhg_solver_state_t *initialize_solver_state( CUSPARSE_CHECK(cusparseCreateDnVec(&state->vec_primal_prod, state->num_constraints, state->primal_product, CUDA_R_64F)); CUSPARSE_CHECK(cusparseCreateDnVec(&state->vec_dual_prod, state->num_variables, state->dual_product, CUDA_R_64F)); CUSPARSE_CHECK(cusparseSpMV_bufferSize(state->sparse_handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &HOST_ONE, state->matA, state->vec_primal_sol, &HOST_ZERO, state->vec_primal_prod, CUDA_R_64F, CUSPARSE_SPMV_CSR_ALG2, &primal_spmv_buffer_size)); - + CUSPARSE_CHECK(cusparseSpMV_bufferSize(state->sparse_handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &HOST_ONE, state->matAt, state->vec_dual_sol, &HOST_ZERO, state->vec_dual_prod, CUDA_R_64F, CUSPARSE_SPMV_CSR_ALG2, &dual_spmv_buffer_size)); CUDA_CHECK(cudaMalloc(&state->primal_spmv_buffer, primal_spmv_buffer_size)); CUSPARSE_CHECK(cusparseSpMV_preprocess(state->sparse_handle, CUSPARSE_OPERATION_NON_TRANSPOSE, @@ -645,18 +652,21 @@ __global__ void rescale_solution_kernel(double *primal_solution, double *dual_solution, const double *variable_rescaling, const double *constraint_rescaling, + const double objective_vector_rescaling, + const double constraint_bound_rescaling, int n_vars, int n_cons) { int i = blockIdx.x * blockDim.x + threadIdx.x; if (i < n_vars) { primal_solution[i] = - primal_solution[i] / variable_rescaling[i]; + primal_solution[i] / variable_rescaling[i] / constraint_bound_rescaling; } else if (i < n_vars + n_cons) { int idx = i - n_vars; - dual_solution[idx] = dual_solution[idx] / constraint_rescaling[idx]; + dual_solution[idx] = dual_solution[idx] / constraint_rescaling[idx] / + objective_vector_rescaling; } } @@ -766,6 +776,7 @@ static void rescale_solution(pdhg_solver_state_t *state) rescale_solution_kernel<<num_blocks_primal_dual, THREADS_PER_BLOCK>>>( state->pdhg_primal_solution, state->pdhg_dual_solution, state->variable_rescaling, state->constraint_rescaling, + state->objective_vector_rescaling, state->constraint_bound_rescaling, state->num_variables, state->num_constraints); } diff --git a/src/utils.cu b/src/utils.cu index bb046d0..e25f922 100644 --- a/src/utils.cu +++ b/src/utils.cu @@ -726,16 +726,21 @@ void compute_residual(pdhg_solver_state_t *state, norm_type_t optimality_norm) state->absolute_dual_residual = get_vector_inf_norm(state->blas_handle, state->num_variables, state->dual_residual); } else { - CUBLAS_CHECK(cublasDnrm2_v2_64(state->blas_handle, state->num_variables, + state->absolute_primal_residual /= state->constraint_bound_rescaling; + CUBLAS_CHECK(cublasDnrm2_v2_64(state->blas_handle, state->num_variables, state->dual_residual, 1, &state->absolute_dual_residual)); } + state->absolute_dual_residual /= state->objective_vector_rescaling; CUBLAS_CHECK(cublasDdot( state->blas_handle, state->num_variables, state->objective_vector, 1, state->pdhg_primal_solution, 1, &state->primal_objective_value)); - state->primal_objective_value = state->primal_objective_value + state->objective_constant; + state->primal_objective_value = + state->primal_objective_value / (state->constraint_bound_rescaling * + state->objective_vector_rescaling) + + state->objective_constant; double base_dual_objective; CUBLAS_CHECK(cublasDdot(state->blas_handle, state->num_variables, @@ -744,7 +749,10 @@ void compute_residual(pdhg_solver_state_t *state, norm_type_t 
optimality_norm) double dual_slack_sum = get_vector_sum(state->blas_handle, state->num_constraints, state->ones_dual_d, state->primal_slack); - state->dual_objective_value = base_dual_objective + dual_slack_sum + state->objective_constant; + state->dual_objective_value = (base_dual_objective + dual_slack_sum) / + (state->constraint_bound_rescaling * + state->objective_vector_rescaling) + + state->objective_constant; state->relative_primal_residual = state->absolute_primal_residual / (1.0 + state->constraint_bound_norm); @@ -804,6 +812,8 @@ void compute_infeasibility_information(pdhg_solver_state_t *state) CUBLAS_CHECK(cublasDdot( state->blas_handle, state->num_variables, state->objective_vector, 1, state->delta_primal_solution, 1, &state->primal_ray_linear_objective)); + state->primal_ray_linear_objective /= + (state->constraint_bound_rescaling * state->objective_vector_rescaling); dual_solution_dual_objective_contribution_kernel<<num_blocks_dual, THREADS_PER_BLOCK>>>( @@ -823,7 +833,9 @@ void compute_infeasibility_information(pdhg_solver_state_t *state) double sum_dual_slack = get_vector_sum(state->blas_handle, state->num_variables, state->ones_primal_d, state->dual_slack); - state->dual_ray_objective = sum_primal_slack + sum_dual_slack; + state->dual_ray_objective = + (sum_primal_slack + sum_dual_slack) / + (state->constraint_bound_rescaling * state->objective_vector_rescaling); compute_primal_infeasibility_kernel<<num_blocks_dual, THREADS_PER_BLOCK>>>( diff --git a/test/test_infeasible_unbounded.py b/test/test_infeasible_unbounded.py index 4388cc4..d09d9a9 100644 --- a/test/test_infeasible_unbounded.py +++ b/test/test_infeasible_unbounded.py @@ -63,13 +63,13 @@ def test_infeasible_lp(base_lp_data, atol): # turn off output model.setParams(OutputFlag=False) # optimize - #model.optimize() + model.optimize() # check status - #assert hasattr(model, "Status"), "Model.Status not exposed." - #assert model.Status == "PRIMAL_INFEASIBLE", f"Unexpected termination status: {model.Status}" - #assert model.StatusCode == PDLP.PRIMAL_INFEASIBLE, f"Unexpected termination status code: {model.StatusCode}" + assert hasattr(model, "Status"), "Model.Status not exposed." 
+ assert model.Status == "PRIMAL_INFEASIBLE", f"Unexpected termination status: {model.Status}" + assert model.StatusCode == PDLP.PRIMAL_INFEASIBLE, f"Unexpected termination status code: {model.StatusCode}" # check dual ray - #assert model.DualRayObj > atol, f"DualRayObj should be positive for dual infeasible, got {model.DualRayObj}" + assert model.DualRayObj > atol, f"DualRayObj should be positive for dual infeasible, got {model.DualRayObj}" def test_unbounded_lp(base_lp_data, atol): From 6be829f414ac789d84cd584b9495615e9379caac Mon Sep 17 00:00:00 2001 From: bolucastang Date: Fri, 21 Nov 2025 11:29:15 -0500 Subject: [PATCH 06/20] feat: add logging for precondition time --- src/utils.cu | 19 ++++++++++--------- 1 file changed, 10 insertions(+), 9 deletions(-) diff --git a/src/utils.cu b/src/utils.cu index e25f922..21422ea 100644 --- a/src/utils.cu +++ b/src/utils.cu @@ -456,18 +456,19 @@ void pdhg_final_log( "--------------------\n"); } printf("Solution Summary\n"); - printf(" Status : %s\n", termination_reason_to_string(result->termination_reason)); + printf(" Status : %s\n", termination_reason_to_string(result->termination_reason)); if (params->presolve) { - printf(" Presolve time : %.3g sec\n", result->presolve_time); + printf(" Presolve time : %.3g sec\n", result->presolve_time); } - printf(" Solve time : %.3g sec\n", result->cumulative_time_sec); - printf(" Iterations : %d\n", result->total_count); - printf(" Primal objective : %.10g\n", result->primal_objective_value); - printf(" Dual objective : %.10g\n", result->dual_objective_value); - printf(" Objective gap : %.3e\n", result->relative_objective_gap); - printf(" Primal infeas : %.3e\n", result->relative_primal_residual); - printf(" Dual infeas : %.3e\n", result->relative_dual_residual); + printf(" Precondition time : %.5g sec\n", state->rescaling_time_sec); + printf(" Solve time : %.3g sec\n", result->cumulative_time_sec); + printf(" Iterations : %d\n", result->total_count); + printf(" Primal objective : %.10g\n", result->primal_objective_value); + printf(" Dual objective : %.10g\n", result->dual_objective_value); + printf(" Objective gap : %.3e\n", result->relative_objective_gap); + printf(" Primal infeas : %.3e\n", result->relative_primal_residual); + printf(" Dual infeas : %.3e\n", result->relative_dual_residual); // if (stats != NULL && stats->n_rows_original > 0) { // printf("\nPresolve Summary\n"); From 73d9e91bac785ceda3366dddd962352201235dab Mon Sep 17 00:00:00 2001 From: Bo Tang Date: Tue, 25 Nov 2025 22:44:22 -0500 Subject: [PATCH 07/20] Bug fixed: numerical issue --- src/preconditioner.cu | 28 ++++++++++++++-------------- 1 file changed, 14 insertions(+), 14 deletions(-) diff --git a/src/preconditioner.cu b/src/preconditioner.cu index 80c77da..c042175 100644 --- a/src/preconditioner.cu +++ b/src/preconditioner.cu @@ -22,7 +22,7 @@ limitations under the License. 
#include #include -#define SCALING_EPSILON 1e-12 +#define SCALING_EPSILON 1e-8 __global__ void scale_variables_kernel(double* __restrict__ c, double* __restrict__ var_lb, @@ -60,7 +60,7 @@ __global__ void csr_row_powsum_kernel(const int* __restrict__ row_ptr, double* __restrict__ out_sum); __global__ void clamp_sqrt_and_accum(double* __restrict__ x, double* __restrict__ inv_x, - double* __restrict__ cum, + double* __restrict__ cum, int n_vars); __global__ void reduce_bound_norm_sq_kernel( const double* __restrict__ L, @@ -199,7 +199,7 @@ static void pock_chambolle_rescaling( invE, rescale_info->con_rescale, n_cons); - + csr_row_powsum_kernel<<num_blocks_primal, THREADS_PER_BLOCK>>>( state->constraint_matrix_t->row_ptr, state->constraint_matrix_t->val, @@ -231,7 +231,7 @@ static void bound_objective_rescaling( state->constraint_upper_bound, n_cons, bnd_norm_sq_cuda); - + double bnd_norm_sq = 0.0; CUDA_CHECK(cudaMemcpy(&bnd_norm_sq, bnd_norm_sq_cuda, sizeof(double), cudaMemcpyDeviceToHost)); CUDA_CHECK(cudaFree(bnd_norm_sq_cuda)); @@ -276,7 +276,7 @@ rescale_info_t *rescale_problem( pdhg_solver_state_t *state) { printf("[Precondition] Start\n"); - + int n_vars = state->num_variables; int n_cons = state->num_constraints; @@ -313,14 +313,14 @@ rescale_info_t *rescale_problem( printf("[Precondition] Bound-objective scaling\n"); bound_objective_rescaling(state, rescale_info); } - + rescale_info->rescaling_time_sec = (double)(clock() - start_rescaling) / CLOCKS_PER_SEC; CUDA_CHECK(cudaFree(E)); CUDA_CHECK(cudaFree(D)); CUDA_CHECK(cudaFree(invE)); CUDA_CHECK(cudaFree(invD)); - + return rescale_info; } @@ -376,7 +376,7 @@ __global__ void csr_scale_nnz_kernel(const int* __restrict__ row_ids, int nnz) { for (int k = blockIdx.x * blockDim.x + threadIdx.x; - k < nnz; + k < nnz; k += gridDim.x * blockDim.x) { int i = row_ids[k]; @@ -429,16 +429,16 @@ __global__ void csr_row_powsum_kernel(const int* __restrict__ row_ptr, out_sum[i] = acc; } -__global__ void clamp_sqrt_and_accum(double* __restrict__ x, +__global__ void clamp_sqrt_and_accum(double* __restrict__ x, double* __restrict__ inv_x, - double* __restrict__ cum, - int n_vars) + double* __restrict__ cum, + int n_vars) { for (int t = blockIdx.x * blockDim.x + threadIdx.x; t < n_vars; t += blockDim.x * gridDim.x) { - double v = x[t]; - double s = (v < SCALING_EPSILON) ? 1.0 : sqrt(v); - cum[t] *= s; + double v = x[t]; + double s = sqrt(fmax(v, SCALING_EPSILON)); + cum[t] *= s; x[t] = s; inv_x[t] = 1.0 / s; } From 055a8fa4c866193c235c93c17c1fd66a78586011 Mon Sep 17 00:00:00 2001 From: Bo Tang Date: Thu, 27 Nov 2025 14:38:50 -0500 Subject: [PATCH 08/20] Clean code: clang-format and align naming style --- internal/internal_types.h | 2 +- src/preconditioner.cu | 343 ++++++++++++++++++++------------------ src/solver.cu | 156 ++++++++--------- src/utils.cu | 4 +- 4 files changed, 266 insertions(+), 239 deletions(-) diff --git a/internal/internal_types.h b/internal/internal_types.h index 3ce8c23..aa69005 100644 --- a/internal/internal_types.h +++ b/internal/internal_types.h @@ -30,7 +30,7 @@ typedef struct int *col_ind; int *row_ind; double *val; - int *transpose_pos; + int *transpose_map; } cu_sparse_matrix_csr_t; typedef struct diff --git a/src/preconditioner.cu b/src/preconditioner.cu index c042175..031051f 100644 --- a/src/preconditioner.cu +++ b/src/preconditioner.cu @@ -24,73 +24,73 @@ limitations under the License. 
#define SCALING_EPSILON 1e-8 -__global__ void scale_variables_kernel(double* __restrict__ c, - double* __restrict__ var_lb, - double* __restrict__ var_ub, - double* __restrict__ var_lb_finite, - double* __restrict__ var_ub_finite, - double* __restrict__ primal_start, - const double* __restrict__ D, - const double* __restrict__ invD, +__global__ void scale_variables_kernel(double *__restrict__ c, + double *__restrict__ var_lb, + double *__restrict__ var_ub, + double *__restrict__ var_lb_finite, + double *__restrict__ var_ub_finite, + double *__restrict__ primal_start, + const double *__restrict__ D, + const double *__restrict__ invD, int n_vars); -__global__ void scale_constraints_kernel(double* __restrict__ con_lb, - double* __restrict__ con_ub, - double* __restrict__ con_lb_finite, - double* __restrict__ con_ub_finite, - double* __restrict__ dual_start, - const double* __restrict__ E, - const double* __restrict__ invE, +__global__ void scale_constraints_kernel(double *__restrict__ con_lb, + double *__restrict__ con_ub, + double *__restrict__ con_lb_finite, + double *__restrict__ con_ub_finite, + double *__restrict__ dual_start, + const double *__restrict__ E, + const double *__restrict__ invE, int n_cons); -__global__ void csr_scale_nnz_kernel(const int* __restrict__ row_ids, - const int* __restrict__ col_ind, - double* __restrict__ A_vals, - double* __restrict__ At_vals, - const int* __restrict__ A_to_At, - const double* __restrict__ invD, - const double* __restrict__ invE, +__global__ void scale_csr_nnz_kernel(const int *__restrict__ row_ids, + const int *__restrict__ col_ind, + double *__restrict__ A_vals, + double *__restrict__ At_vals, + const int *__restrict__ A_to_At, + const double *__restrict__ invD, + const double *__restrict__ invE, int nnz); -__global__ void csr_row_absmax_kernel(const int* __restrict__ row_ptr, - const double* __restrict__ vals, - int num_rows, - double* __restrict__ out_max); -__global__ void csr_row_powsum_kernel(const int* __restrict__ row_ptr, - const double* __restrict__ vals, - int num_rows, - double degree, - double* __restrict__ out_sum); -__global__ void clamp_sqrt_and_accum(double* __restrict__ x, - double* __restrict__ inv_x, - double* __restrict__ cum, - int n_vars); +__global__ void compute_csr_row_absmax_kernel(const int *__restrict__ row_ptr, + const double *__restrict__ vals, + int num_rows, + double *__restrict__ out_max); +__global__ void compute_csr_row_powsum_kernel(const int *__restrict__ row_ptr, + const double *__restrict__ vals, + int num_rows, + double degree, + double *__restrict__ out_sum); +__global__ void clamp_sqrt_and_accum_kernel(double *__restrict__ x, + double *__restrict__ inv_x, + double *__restrict__ cum, + int n_vars); __global__ void reduce_bound_norm_sq_kernel( - const double* __restrict__ L, - const double* __restrict__ U, + const double *__restrict__ L, + const double *__restrict__ U, int n_cons, - double* __restrict__ block_sums); + double *__restrict__ block_sums); __global__ void scale_bounds_kernel( - double* __restrict__ con_lb, - double* __restrict__ con_ub, - double* __restrict__ con_lb_finite, - double* __restrict__ con_ub_finite, - double* __restrict__ dual_start, + double *__restrict__ con_lb, + double *__restrict__ con_ub, + double *__restrict__ con_lb_finite, + double *__restrict__ con_ub_finite, + double *__restrict__ dual_start, int n_cons, double con_scale, double obj_scale); -__global__ void scale_obj_kernel( - double* __restrict__ c, - double* __restrict__ var_lb, - double* __restrict__ var_ub, - 
double* __restrict__ var_lb_finite, - double* __restrict__ var_ub_finite, - double* __restrict__ primal_start, +__global__ void scale_objective_kernel( + double *__restrict__ c, + double *__restrict__ var_lb, + double *__restrict__ var_ub, + double *__restrict__ var_lb_finite, + double *__restrict__ var_ub_finite, + double *__restrict__ primal_start, int n_vars, double con_scale, double obj_scale); -__global__ void fill_ones_kernel(double* __restrict__ x, int n_vars); +__global__ void fill_ones_kernel(double *__restrict__ x, int n_vars); static void scale_problem(pdhg_solver_state_t *state, double *E, double *D, double *invE, double *invD); static void ruiz_rescaling(pdhg_solver_state_t *state, int num_iters, rescale_info_t *rescale_info, double *E, double *D, double *invE, double *invD); -static void pock_chambolle_rescaling(pdhg_solver_state_t *state, double alpha, rescale_info_t *rescale_info, +static void pock_chambolle_rescaling(pdhg_solver_state_t *state, const double alpha, rescale_info_t *rescale_info, double *E, double *D, double *invE, double *invD); static void bound_objective_rescaling(pdhg_solver_state_t *state, rescale_info_t *rescale_info); @@ -125,12 +125,12 @@ static void scale_problem( invE, n_cons); - csr_scale_nnz_kernel<<num_blocks_nnz, THREADS_PER_BLOCK>>>( + scale_csr_nnz_kernel<<num_blocks_nnz, THREADS_PER_BLOCK>>>( state->constraint_matrix->row_ind, state->constraint_matrix->col_ind, state->constraint_matrix->val, state->constraint_matrix_t->val, - state->constraint_matrix->transpose_pos, + state->constraint_matrix->transpose_map, invD, invE, state->constraint_matrix->num_nonzeros); @@ -150,23 +150,23 @@ static void ruiz_rescaling( for (int iter = 0; iter < num_iterations; ++iter) { - csr_row_absmax_kernel<<num_blocks_dual, THREADS_PER_BLOCK>>>( + compute_csr_row_absmax_kernel<<num_blocks_dual, THREADS_PER_BLOCK>>>( state->constraint_matrix->row_ptr, state->constraint_matrix->val, n_cons, E); - clamp_sqrt_and_accum<<num_blocks_dual, THREADS_PER_BLOCK>>>( + clamp_sqrt_and_accum_kernel<<num_blocks_dual, THREADS_PER_BLOCK>>>( E, invE, rescale_info->con_rescale, n_cons); - csr_row_absmax_kernel<<num_blocks_primal, THREADS_PER_BLOCK>>>( + compute_csr_row_absmax_kernel<<num_blocks_primal, THREADS_PER_BLOCK>>>( state->constraint_matrix_t->row_ptr, state->constraint_matrix_t->val, n_vars, D); - clamp_sqrt_and_accum<<num_blocks_primal, THREADS_PER_BLOCK>>>( + clamp_sqrt_and_accum_kernel<<num_blocks_primal, THREADS_PER_BLOCK>>>( D, invD, rescale_info->var_rescale, @@ -188,32 +188,31 @@ static void pock_chambolle_rescaling( const int n_cons = state->num_constraints; const int n_vars = state->num_variables; - csr_row_powsum_kernel<<num_blocks_dual, THREADS_PER_BLOCK>>>( + compute_csr_row_powsum_kernel<<num_blocks_dual, THREADS_PER_BLOCK>>>( state->constraint_matrix->row_ptr, state->constraint_matrix->val, n_cons, alpha, E); - clamp_sqrt_and_accum<<num_blocks_dual, THREADS_PER_BLOCK>>>( + clamp_sqrt_and_accum_kernel<<num_blocks_dual, THREADS_PER_BLOCK>>>( E, invE, rescale_info->con_rescale, n_cons); - csr_row_powsum_kernel<<num_blocks_primal, THREADS_PER_BLOCK>>>( + compute_csr_row_powsum_kernel<<num_blocks_primal, THREADS_PER_BLOCK>>>( state->constraint_matrix_t->row_ptr, state->constraint_matrix_t->val, n_vars, 2.0 - alpha, D); - clamp_sqrt_and_accum<<num_blocks_primal, THREADS_PER_BLOCK>>>( + clamp_sqrt_and_accum_kernel<<num_blocks_primal, THREADS_PER_BLOCK>>>( D, invD, rescale_info->var_rescale, n_vars); scale_problem(state, E, D, invE, invD); - } static void 
bound_objective_rescaling( @@ -223,19 +222,19 @@ static void bound_objective_rescaling( const int n_cons = state->num_constraints; const int n_vars = state->num_variables; - double *bnd_norm_sq_cuda = nullptr; - CUDA_CHECK(cudaMalloc(&bnd_norm_sq_cuda, sizeof(double))); - CUDA_CHECK(cudaMemset(bnd_norm_sq_cuda, 0, sizeof(double))); + double *bnd_norm_sq_d = NULL; + CUDA_CHECK(cudaMalloc(&bnd_norm_sq_d, sizeof(double))); + CUDA_CHECK(cudaMemset(bnd_norm_sq_d, 0, sizeof(double))); reduce_bound_norm_sq_kernel<<<1, THREADS_PER_BLOCK, THREADS_PER_BLOCK * sizeof(double)>>>( state->constraint_lower_bound, state->constraint_upper_bound, n_cons, - bnd_norm_sq_cuda); + bnd_norm_sq_d); - double bnd_norm_sq = 0.0; - CUDA_CHECK(cudaMemcpy(&bnd_norm_sq, bnd_norm_sq_cuda, sizeof(double), cudaMemcpyDeviceToHost)); - CUDA_CHECK(cudaFree(bnd_norm_sq_cuda)); - double bnd_norm = sqrt(bnd_norm_sq); + double bnd_norm_sq_h = 0.0; + CUDA_CHECK(cudaMemcpy(&bnd_norm_sq_h, bnd_norm_sq_d, sizeof(double), cudaMemcpyDeviceToHost)); + CUDA_CHECK(cudaFree(bnd_norm_sq_d)); + double bnd_norm = sqrt(bnd_norm_sq_h); double obj_norm = 0.0; CUBLAS_CHECK(cublasDnrm2(state->blas_handle, @@ -256,7 +255,7 @@ static void bound_objective_rescaling( con_scale, obj_scale); - scale_obj_kernel<<num_blocks_primal, THREADS_PER_BLOCK>>>( + scale_objective_kernel<<num_blocks_primal, THREADS_PER_BLOCK>>>( state->objective_vector, state->variable_lower_bound, state->variable_upper_bound, @@ -268,7 +267,7 @@ static void bound_objective_rescaling( obj_scale); rescale_info->con_bound_rescale = con_scale; - rescale_info->obj_vec_rescale = obj_scale; + rescale_info->obj_vec_rescale = obj_scale; } rescale_info_t *rescale_problem( @@ -282,18 +281,18 @@ rescale_info_t *rescale_problem( clock_t start_rescaling = clock(); rescale_info_t *rescale_info = (rescale_info_t *)safe_calloc(1, sizeof(rescale_info_t)); - CUDA_CHECK(cudaMalloc(&rescale_info->con_rescale, n_cons*sizeof(double))); - CUDA_CHECK(cudaMalloc(&rescale_info->var_rescale, n_vars*sizeof(double))); + CUDA_CHECK(cudaMalloc(&rescale_info->con_rescale, n_cons * sizeof(double))); + CUDA_CHECK(cudaMalloc(&rescale_info->var_rescale, n_vars * sizeof(double))); fill_ones_kernel<<num_blocks_dual, THREADS_PER_BLOCK>>>( rescale_info->con_rescale, n_cons); fill_ones_kernel<<num_blocks_primal, THREADS_PER_BLOCK>>>( rescale_info->var_rescale, n_vars); - double *E=nullptr, *D=nullptr, *invE=nullptr, *invD=nullptr; - CUDA_CHECK(cudaMalloc(&E, n_cons*sizeof(double))); - CUDA_CHECK(cudaMalloc(&D, n_vars*sizeof(double))); - CUDA_CHECK(cudaMalloc(&invE, n_cons*sizeof(double))); - CUDA_CHECK(cudaMalloc(&invD, n_vars*sizeof(double))); + double *E = NULL, *D = NULL, *invE = NULL, *invD = NULL; + CUDA_CHECK(cudaMalloc(&E, n_cons * sizeof(double))); + CUDA_CHECK(cudaMalloc(&D, n_vars * sizeof(double))); + CUDA_CHECK(cudaMalloc(&invE, n_cons * sizeof(double))); + CUDA_CHECK(cudaMalloc(&invD, n_vars * sizeof(double))); if (params->l_inf_ruiz_iterations > 0) { @@ -324,21 +323,22 @@ rescale_info_t *rescale_problem( return rescale_info; } -__global__ void scale_variables_kernel(double* __restrict__ c, - double* __restrict__ var_lb, - double* __restrict__ var_ub, - double* __restrict__ var_lb_finite, - double* __restrict__ var_ub_finite, - double* __restrict__ primal_start, - const double* __restrict__ D, - const double* __restrict__ invD, +__global__ void scale_variables_kernel(double *__restrict__ c, + double *__restrict__ var_lb, + double *__restrict__ var_ub, + double *__restrict__ var_lb_finite, + double 
*__restrict__ var_ub_finite, + double *__restrict__ primal_start, + const double *__restrict__ D, + const double *__restrict__ invD, int n_vars) { int j = blockIdx.x * blockDim.x + threadIdx.x; - if (j >= n_vars) return; + if (j >= n_vars) + return; double dj = D[j]; double inv_dj = invD[j]; - c[j] *= inv_dj; + c[j] *= inv_dj; var_lb[j] *= dj; var_ub[j] *= dj; var_lb_finite[j] *= dj; @@ -346,17 +346,18 @@ __global__ void scale_variables_kernel(double* __restrict__ c, primal_start[j] *= dj; } -__global__ void scale_constraints_kernel(double* __restrict__ con_lb, - double* __restrict__ con_ub, - double* __restrict__ con_lb_finite, - double* __restrict__ con_ub_finite, - double* __restrict__ dual_start, - const double* __restrict__ E, - const double* __restrict__ invE, +__global__ void scale_constraints_kernel(double *__restrict__ con_lb, + double *__restrict__ con_ub, + double *__restrict__ con_lb_finite, + double *__restrict__ con_ub_finite, + double *__restrict__ dual_start, + const double *__restrict__ E, + const double *__restrict__ invE, int n_cons) { int i = blockIdx.x * blockDim.x + threadIdx.x; - if (i >= n_cons) return; + if (i >= n_cons) + return; double inv_ei = invE[i]; double ei = E[i]; con_lb[i] *= inv_ei; @@ -366,13 +367,13 @@ __global__ void scale_constraints_kernel(double* __restrict__ con_lb, dual_start[i] *= ei; } -__global__ void csr_scale_nnz_kernel(const int* __restrict__ row_ids, - const int* __restrict__ col_ind, - double* __restrict__ A_vals, - double* __restrict__ At_vals, - const int* __restrict__ A_to_At, - const double* __restrict__ invD, - const double* __restrict__ invE, +__global__ void scale_csr_nnz_kernel(const int *__restrict__ row_ids, + const int *__restrict__ col_ind, + double *__restrict__ A_vals, + double *__restrict__ At_vals, + const int *__restrict__ A_to_At, + const double *__restrict__ invD, + const double *__restrict__ invE, int nnz) { for (int k = blockIdx.x * blockDim.x + threadIdx.x; @@ -387,52 +388,63 @@ __global__ void csr_scale_nnz_kernel(const int* __restrict__ row_ids, } } -__global__ void csr_row_absmax_kernel(const int* __restrict__ row_ptr, - const double* __restrict__ vals, - int num_rows, - double* __restrict__ out_max) +__global__ void compute_csr_row_absmax_kernel(const int *__restrict__ row_ptr, + const double *__restrict__ vals, + int num_rows, + double *__restrict__ out_max) { int i = blockIdx.x * blockDim.x + threadIdx.x; - if (i >= num_rows) return; + if (i >= num_rows) + return; int s = row_ptr[i], e = row_ptr[i + 1]; double m = 0.0; - for (int k = s; k < e; ++k) { + for (int k = s; k < e; ++k) + { double v = fabs(vals[k]); - if (!isfinite(v)) v = 0.0; - if (v > m) m = v; + if (!isfinite(v)) + v = 0.0; + if (v > m) + m = v; } out_max[i] = m; } -__device__ __forceinline__ double pow_fast(double v, double p) { - if (p == 2.0) return v * v; - if (p == 1.0) return v; - if (p == 0.5) return sqrt(v); +__device__ __forceinline__ double pow_fast(double v, double p) +{ + if (p == 2.0) + return v * v; + if (p == 1.0) + return v; + if (p == 0.5) + return sqrt(v); return pow(v, p); } -__global__ void csr_row_powsum_kernel(const int* __restrict__ row_ptr, - const double* __restrict__ vals, - int num_rows, - double degree, - double* __restrict__ out_sum) +__global__ void compute_csr_row_powsum_kernel(const int *__restrict__ row_ptr, + const double *__restrict__ vals, + int num_rows, + double degree, + double *__restrict__ out_sum) { int i = blockIdx.x * blockDim.x + threadIdx.x; - if (i >= num_rows) return; + if (i >= num_rows) + return; 
int s = row_ptr[i], e = row_ptr[i + 1]; double acc = 0.0; - for (int k = s; k < e; ++k) { + for (int k = s; k < e; ++k) + { double v = fabs(vals[k]); - if (!isfinite(v)) v = 0.0; + if (!isfinite(v)) + v = 0.0; acc += pow_fast(v, degree); } out_sum[i] = acc; } -__global__ void clamp_sqrt_and_accum(double* __restrict__ x, - double* __restrict__ inv_x, - double* __restrict__ cum, - int n_vars) +__global__ void clamp_sqrt_and_accum_kernel(double *__restrict__ x, + double *__restrict__ inv_x, + double *__restrict__ cum, + int n_vars) { for (int t = blockIdx.x * blockDim.x + threadIdx.x; t < n_vars; t += blockDim.x * gridDim.x) { @@ -445,10 +457,10 @@ __global__ void clamp_sqrt_and_accum(double* __restrict__ x, } __global__ void reduce_bound_norm_sq_kernel( - const double* __restrict__ L, - const double* __restrict__ U, + const double *__restrict__ L, + const double *__restrict__ U, int n_cons, - double* __restrict__ block_sums) + double *__restrict__ block_sums) { extern __shared__ double sdata[]; int tid = threadIdx.x; @@ -456,14 +468,17 @@ __global__ void reduce_bound_norm_sq_kernel( int stride = blockDim.x * gridDim.x; double acc = 0.0; - for (int i = global_tid; i < n_cons; i += stride) { + for (int i = global_tid; i < n_cons; i += stride) + { double Li = L[i], Ui = U[i]; bool fL = isfinite(Li), fU = isfinite(Ui); - if (fL && (!fU || fabs(Li - Ui) > SCALING_EPSILON)) { + if (fL && (!fU || fabs(Li - Ui) > SCALING_EPSILON)) + { acc += Li * Li; } - if (fU) { + if (fU) + { acc += Ui * Ui; } } @@ -471,60 +486,66 @@ __global__ void reduce_bound_norm_sq_kernel( sdata[tid] = acc; __syncthreads(); - for (int s = blockDim.x / 2; s > 0; s >>= 1) { - if (tid < s) { + for (int s = blockDim.x / 2; s > 0; s >>= 1) + { + if (tid < s) + { sdata[tid] += sdata[tid + s]; } __syncthreads(); } - if (tid == 0) { + if (tid == 0) + { block_sums[blockIdx.x] = sdata[0]; } } __global__ void scale_bounds_kernel( - double* __restrict__ con_lb, - double* __restrict__ con_ub, - double* __restrict__ con_lb_finite, - double* __restrict__ con_ub_finite, - double* __restrict__ dual_start, + double *__restrict__ con_lb, + double *__restrict__ con_ub, + double *__restrict__ con_lb_finite, + double *__restrict__ con_ub_finite, + double *__restrict__ dual_start, int n_cons, double con_scale, double obj_scale) { int i = blockIdx.x * blockDim.x + threadIdx.x; - if (i >= n_cons) return; - con_lb[i] *= con_scale; - con_ub[i] *= con_scale; + if (i >= n_cons) + return; + con_lb[i] *= con_scale; + con_ub[i] *= con_scale; con_lb_finite[i] *= con_scale; con_ub_finite[i] *= con_scale; - dual_start[i] *= obj_scale; + dual_start[i] *= obj_scale; } -__global__ void scale_obj_kernel( - double* __restrict__ c, - double* __restrict__ var_lb, - double* __restrict__ var_ub, - double* __restrict__ var_lb_finite, - double* __restrict__ var_ub_finite, - double* __restrict__ primal_start, +__global__ void scale_objective_kernel( + double *__restrict__ c, + double *__restrict__ var_lb, + double *__restrict__ var_ub, + double *__restrict__ var_lb_finite, + double *__restrict__ var_ub_finite, + double *__restrict__ primal_start, int n_vars, double con_scale, double obj_scale) { int j = blockIdx.x * blockDim.x + threadIdx.x; - if (j >= n_vars) return; - var_lb[j] *= con_scale; - var_ub[j] *= con_scale; + if (j >= n_vars) + return; + var_lb[j] *= con_scale; + var_ub[j] *= con_scale; var_lb_finite[j] *= con_scale; var_ub_finite[j] *= con_scale; - c[j] *= obj_scale; - primal_start[j] *= con_scale; + c[j] *= obj_scale; + primal_start[j] *= con_scale; } 
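/*
 * Summary of the diagonal scaling the kernels above apply (restating what the
 * code already does; A, E, D, m, n in the sketch below are illustrative names,
 * not the solver's API):
 *
 *   scale_csr_nnz_kernel       A -> inv(E) * A * inv(D)   (and its transpose copy)
 *   scale_variables_kernel     c -> inv(D) * c,  variable bounds / primal start -> D * (.)
 *   scale_constraints_kernel   constraint bounds -> inv(E) * (.),  dual start -> E * (.)
 *   scale_bounds_kernel /      additional scalars con_scale = 1 / (||bounds|| + 1)
 *   scale_obj_kernel           and obj_scale = 1 / (||c|| + 1)
 *
 * For intuition, one l_inf Ruiz pass on a small dense matrix looks like:
 *
 *   for (int i = 0; i < m; ++i) {                       // row factors E
 *       double mx = 0.0;
 *       for (int j = 0; j < n; ++j) mx = fmax(mx, fabs(A[i * n + j]));
 *       E[i] = (mx < SCALING_EPSILON) ? 1.0 : sqrt(mx);
 *   }
 *   for (int j = 0; j < n; ++j) {                       // column factors D
 *       double mx = 0.0;
 *       for (int i = 0; i < m; ++i) mx = fmax(mx, fabs(A[i * n + j]));
 *       D[j] = (mx < SCALING_EPSILON) ? 1.0 : sqrt(mx);
 *   }
 *   for (int i = 0; i < m; ++i)                         // A <- inv(E) * A * inv(D)
 *       for (int j = 0; j < n; ++j) A[i * n + j] /= (E[i] * D[j]);
 */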
-__global__ void fill_ones_kernel(double* __restrict__ x, int n_vars) +__global__ void fill_ones_kernel(double *__restrict__ x, int n_vars) { int i = blockIdx.x * blockDim.x + threadIdx.x; - if (i < n_vars) x[i] = 1.0; + if (i < n_vars) + x[i] = 1.0; } diff --git a/src/solver.cu b/src/solver.cu index b8f2ab1..d2a6a5a 100644 --- a/src/solver.cu +++ b/src/solver.cu @@ -28,57 +28,57 @@ limitations under the License. #include #include -__global__ void build_row_ind(const int* __restrict__ row_ptr, +__global__ void build_row_ind(const int *__restrict__ row_ptr, int num_rows, - int* __restrict__ row_ind); -__global__ void build_transpose_pos(const int* __restrict__ A_row_ind, - const int* __restrict__ A_col_ind, - const int* __restrict__ At_row_ptr, - const int* __restrict__ At_col_ind, + int *__restrict__ row_ind); +__global__ void build_transpose_map(const int *__restrict__ A_row_ind, + const int *__restrict__ A_col_ind, + const int *__restrict__ At_row_ptr, + const int *__restrict__ At_col_ind, int nnz, - int* __restrict__ A_to_At); -__global__ void compute_next_pdhg_primal_solution_kernel(const double *current_primal, - double *reflected_primal, + int *__restrict__ A_to_At); +__global__ void compute_next_pdhg_primal_solution_kernel(const double *current_primal, + double *reflected_primal, const double *dual_product, - const double *objective, - const double *var_lb, + const double *objective, + const double *var_lb, const double *var_ub, - int n, + int n, double step_size); -__global__ void compute_next_pdhg_primal_solution_major_kernel(const double *current_primal, - double *pdhg_primal, +__global__ void compute_next_pdhg_primal_solution_major_kernel(const double *current_primal, + double *pdhg_primal, double *reflected_primal, - const double *dual_product, - const double *objective, + const double *dual_product, + const double *objective, const double *var_lb, - const double *var_ub, - int n, - double step_size, + const double *var_ub, + int n, + double step_size, double *dual_slack); -__global__ void compute_next_pdhg_dual_solution_kernel(const double *current_dual, - double *reflected_dual, +__global__ void compute_next_pdhg_dual_solution_kernel(const double *current_dual, + double *reflected_dual, const double *primal_product, - const double *const_lb, - const double *const_ub, - int n, + const double *const_lb, + const double *const_ub, + int n, double step_size); -__global__ void compute_next_pdhg_dual_solution_major_kernel(const double *current_dual, - double *pdhg_dual, +__global__ void compute_next_pdhg_dual_solution_major_kernel(const double *current_dual, + double *pdhg_dual, double *reflected_dual, - const double *primal_product, - const double *const_lb, + const double *primal_product, + const double *const_lb, const double *const_ub, - int n, + int n, double step_size); -__global__ void halpern_update_kernel(const double *initial_primal, - double *current_primal, +__global__ void halpern_update_kernel(const double *initial_primal, + double *current_primal, const double *reflected_primal, - const double *initial_dual, - double *current_dual, + const double *initial_dual, + double *current_dual, const double *reflected_dual, - int n_vars, - int n_cons, - double weight, + int n_vars, + int n_cons, + double weight, double reflection_coeff); __global__ void rescale_solution_kernel(double *primal_solution, double *dual_solution, @@ -87,13 +87,13 @@ __global__ void rescale_solution_kernel(double *primal_solution, const double objective_vector_rescaling, const double 
constraint_bound_rescaling, int n_vars, int n_cons); -__global__ void compute_delta_solution_kernel(const double *initial_primal, - const double *pdhg_primal, +__global__ void compute_delta_solution_kernel(const double *initial_primal, + const double *pdhg_primal, double *delta_primal, - const double *initial_dual, - const double *pdhg_dual, + const double *initial_dual, + const double *pdhg_dual, double *delta_dual, - int n_vars, + int n_vars, int n_cons); static void compute_next_pdhg_primal_solution(pdhg_solver_state_t *state); static void compute_next_pdhg_dual_solution(pdhg_solver_state_t *state); @@ -277,7 +277,7 @@ static pdhg_solver_state_t *initialize_solver_state( state->num_blocks_dual = (state->num_constraints + THREADS_PER_BLOCK - 1) / THREADS_PER_BLOCK; state->num_blocks_primal_dual = (state->num_variables + state->num_constraints + THREADS_PER_BLOCK - 1) / THREADS_PER_BLOCK; state->num_blocks_nnz = (state->constraint_matrix->num_nonzeros + THREADS_PER_BLOCK - 1) / THREADS_PER_BLOCK; - + CUSPARSE_CHECK(cusparseCreate(&state->sparse_handle)); CUBLAS_CHECK(cublasCreate(&state->blas_handle)); CUBLAS_CHECK(cublasSetPointerMode(state->blas_handle, CUBLAS_POINTER_MODE_HOST)); @@ -323,15 +323,15 @@ static pdhg_solver_state_t *initialize_solver_state( state->constraint_matrix_t->row_ptr, n_vars, state->constraint_matrix_t->row_ind); CUDA_CHECK(cudaGetLastError()); - CUDA_CHECK(cudaMalloc(&state->constraint_matrix->transpose_pos, nnz * sizeof(int))); - state->constraint_matrix_t->transpose_pos = NULL; - build_transpose_pos<<num_blocks_nnz, THREADS_PER_BLOCK>>>( + CUDA_CHECK(cudaMalloc(&state->constraint_matrix->transpose_map, nnz * sizeof(int))); + state->constraint_matrix_t->transpose_map = NULL; + build_transpose_map<<num_blocks_nnz, THREADS_PER_BLOCK>>>( state->constraint_matrix->row_ind, state->constraint_matrix->col_ind, state->constraint_matrix_t->row_ptr, state->constraint_matrix_t->col_ind, nnz, - state->constraint_matrix->transpose_pos); + state->constraint_matrix->transpose_map); CUDA_CHECK(cudaGetLastError()); ALLOC_AND_COPY(state->variable_lower_bound, working_problem->variable_lower_bound, var_bytes); @@ -393,10 +393,10 @@ static pdhg_solver_state_t *initialize_solver_state( rescale_info_t *rescale_info = rescale_problem(params, state); state->constraint_rescaling = rescale_info->con_rescale; - state->variable_rescaling = rescale_info->var_rescale; + state->variable_rescaling = rescale_info->var_rescale; state->constraint_bound_rescaling = rescale_info->con_bound_rescale; state->objective_vector_rescaling = rescale_info->obj_vec_rescale; - state->rescaling_time_sec = rescale_info->rescaling_time_sec; + state->rescaling_time_sec = rescale_info->rescaling_time_sec; rescale_info->con_rescale = NULL; rescale_info->var_rescale = NULL; @@ -406,8 +406,8 @@ static pdhg_solver_state_t *initialize_solver_state( state->constraint_matrix->row_ind = NULL; CUDA_CHECK(cudaFree(state->constraint_matrix_t->row_ind)); state->constraint_matrix_t->row_ind = NULL; - CUDA_CHECK(cudaFree(state->constraint_matrix->transpose_pos)); - state->constraint_matrix->transpose_pos = NULL; + CUDA_CHECK(cudaFree(state->constraint_matrix->transpose_map)); + state->constraint_matrix->transpose_map = NULL; double sum_of_squares = 0.0; double max_val = 0.0; @@ -479,7 +479,7 @@ static pdhg_solver_state_t *initialize_solver_state( CUSPARSE_CHECK(cusparseCreateDnVec(&state->vec_primal_prod, state->num_constraints, state->primal_product, CUDA_R_64F)); CUSPARSE_CHECK(cusparseCreateDnVec(&state->vec_dual_prod, 
state->num_variables, state->dual_product, CUDA_R_64F)); CUSPARSE_CHECK(cusparseSpMV_bufferSize(state->sparse_handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &HOST_ONE, state->matA, state->vec_primal_sol, &HOST_ZERO, state->vec_primal_prod, CUDA_R_64F, CUSPARSE_SPMV_CSR_ALG2, &primal_spmv_buffer_size)); - + CUSPARSE_CHECK(cusparseSpMV_bufferSize(state->sparse_handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &HOST_ONE, state->matAt, state->vec_dual_sol, &HOST_ZERO, state->vec_dual_prod, CUDA_R_64F, CUSPARSE_SPMV_CSR_ALG2, &dual_spmv_buffer_size)); CUDA_CHECK(cudaMalloc(&state->primal_spmv_buffer, primal_spmv_buffer_size)); CUSPARSE_CHECK(cusparseSpMV_preprocess(state->sparse_handle, CUSPARSE_OPERATION_NON_TRANSPOSE, @@ -520,49 +520,55 @@ static pdhg_solver_state_t *initialize_solver_state( return state; } -__global__ void build_row_ind(const int* __restrict__ row_ptr, +__global__ void build_row_ind(const int *__restrict__ row_ptr, int num_rows, - int* __restrict__ row_ind) + int *__restrict__ row_ind) { int i = blockIdx.x * blockDim.x + threadIdx.x; - if (i >= num_rows) return; + if (i >= num_rows) + return; int s = row_ptr[i]; int e = row_ptr[i + 1]; - for (int k = s; k < e; ++k) { + for (int k = s; k < e; ++k) + { row_ind[k] = i; } } -__global__ void build_transpose_pos( - const int* __restrict__ A_row_ind, - const int* __restrict__ A_col_ind, - const int* __restrict__ At_row_ptr, - const int* __restrict__ At_col_ind, +__global__ void build_transpose_map( + const int *__restrict__ A_row_ind, + const int *__restrict__ A_col_ind, + const int *__restrict__ At_row_ptr, + const int *__restrict__ At_col_ind, int nnz, - int* __restrict__ A_to_At) + int *__restrict__ A_to_At) { int k = blockIdx.x * blockDim.x + threadIdx.x; - if (k >= nnz) return; + if (k >= nnz) + return; - int i = A_row_ind[k]; - int j = A_col_ind[k]; + int i = A_row_ind[k]; + int j = A_col_ind[k]; int start = At_row_ptr[j]; - int end = At_row_ptr[j + 1]; + int end = At_row_ptr[j + 1]; int pos = -1; - for (int idx = start; idx < end; ++idx) { - if (At_col_ind[idx] == i) { + for (int idx = start; idx < end; ++idx) + { + if (At_col_ind[idx] == i) + { pos = idx; break; } } - if (pos < 0) return; + if (pos < 0) + return; - A_to_At[k] = pos; + A_to_At[k] = pos; } __global__ void compute_next_pdhg_primal_solution_kernel( @@ -927,7 +933,7 @@ void pdhg_solver_state_free(pdhg_solver_state_t *state) if (state->vec_dual_prod) CUSPARSE_CHECK(cusparseDestroyDnVec(state->vec_dual_prod)); if (state->primal_spmv_buffer) - CUDA_CHECK(cudaFree(state->primal_spmv_buffer)); + CUDA_CHECK(cudaFree(state->primal_spmv_buffer)); if (state->dual_spmv_buffer) CUDA_CHECK(cudaFree(state->dual_spmv_buffer)); if (state->sparse_handle) @@ -949,8 +955,8 @@ void pdhg_solver_state_free(pdhg_solver_state_t *state) CUDA_CHECK(cudaFree(state->constraint_matrix->row_ind)); if (state->constraint_matrix->val) CUDA_CHECK(cudaFree(state->constraint_matrix->val)); - if (state->constraint_matrix->transpose_pos) - CUDA_CHECK(cudaFree(state->constraint_matrix->transpose_pos)); + if (state->constraint_matrix->transpose_map) + CUDA_CHECK(cudaFree(state->constraint_matrix->transpose_map)); if (state->constraint_matrix_t->row_ptr) CUDA_CHECK(cudaFree(state->constraint_matrix_t->row_ptr)); if (state->constraint_matrix_t->col_ind) @@ -959,8 +965,8 @@ void pdhg_solver_state_free(pdhg_solver_state_t *state) CUDA_CHECK(cudaFree(state->constraint_matrix_t->row_ind)); if (state->constraint_matrix_t->val) CUDA_CHECK(cudaFree(state->constraint_matrix_t->val)); - if 
(state->constraint_matrix_t->transpose_pos) - CUDA_CHECK(cudaFree(state->constraint_matrix_t->transpose_pos)); + if (state->constraint_matrix_t->transpose_map) + CUDA_CHECK(cudaFree(state->constraint_matrix_t->transpose_map)); if (state->constraint_lower_bound) CUDA_CHECK(cudaFree(state->constraint_lower_bound)); if (state->constraint_upper_bound) @@ -1566,4 +1572,4 @@ static void compute_dual_fixed_point_error(pdhg_solver_state_t *state) 1, &dual_norm)); state->fixed_point_error = dual_norm * dual_norm / state->primal_weight; -} +} \ No newline at end of file diff --git a/src/utils.cu b/src/utils.cu index 21422ea..347b879 100644 --- a/src/utils.cu +++ b/src/utils.cu @@ -459,9 +459,9 @@ void pdhg_final_log( printf(" Status : %s\n", termination_reason_to_string(result->termination_reason)); if (params->presolve) { - printf(" Presolve time : %.3g sec\n", result->presolve_time); + printf(" Presolve time : %.3g sec\n", result->presolve_time); } - printf(" Precondition time : %.5g sec\n", state->rescaling_time_sec); + printf(" Precondition time : %.5g sec\n", result->rescaling_time_sec); printf(" Solve time : %.3g sec\n", result->cumulative_time_sec); printf(" Iterations : %d\n", result->total_count); printf(" Primal objective : %.10g\n", result->primal_objective_value); From 0742da9f0d49a1b62e25fb4b83d3dfeca0ef4b1b Mon Sep 17 00:00:00 2001 From: Bo Tang Date: Thu, 27 Nov 2025 18:57:02 -0500 Subject: [PATCH 09/20] Clean code: rename variables --- src/preconditioner.cu | 468 +++++++++++++++++++++--------------------- 1 file changed, 234 insertions(+), 234 deletions(-) diff --git a/src/preconditioner.cu b/src/preconditioner.cu index 031051f..1f711cf 100644 --- a/src/preconditioner.cu +++ b/src/preconditioner.cu @@ -24,85 +24,85 @@ limitations under the License. 
#define SCALING_EPSILON 1e-8 -__global__ void scale_variables_kernel(double *__restrict__ c, - double *__restrict__ var_lb, - double *__restrict__ var_ub, - double *__restrict__ var_lb_finite, - double *__restrict__ var_ub_finite, - double *__restrict__ primal_start, - const double *__restrict__ D, - const double *__restrict__ invD, - int n_vars); -__global__ void scale_constraints_kernel(double *__restrict__ con_lb, - double *__restrict__ con_ub, - double *__restrict__ con_lb_finite, - double *__restrict__ con_ub_finite, - double *__restrict__ dual_start, - const double *__restrict__ E, - const double *__restrict__ invE, - int n_cons); -__global__ void scale_csr_nnz_kernel(const int *__restrict__ row_ids, - const int *__restrict__ col_ind, - double *__restrict__ A_vals, - double *__restrict__ At_vals, - const int *__restrict__ A_to_At, - const double *__restrict__ invD, - const double *__restrict__ invE, +__global__ void scale_variables_kernel(double *__restrict__ objective_vector, + double *__restrict__ variable_lower_bound, + double *__restrict__ variable_upper_bound, + double *__restrict__ variable_lower_bound_finite_val, + double *__restrict__ variable_upper_bound_finite_val, + double *__restrict__ initial_primal_solution, + const double *__restrict__ variable_rescaling, + const double *__restrict__ inverse_variable_rescaling, + int num_variables); +__global__ void scale_constraints_kernel(double *__restrict__ constraint_lower_bound, + double *__restrict__ constraint_upper_bound, + double *__restrict__ constraint_lower_bound_finite_val, + double *__restrict__ constraint_upper_bound_finite_val, + double *__restrict__ initial_dual_solution, + const double *__restrict__ constraint_rescaling, + const double *__restrict__ inverse_constraint_rescaling, + int num_constraints); +__global__ void scale_csr_nnz_kernel(const int *__restrict__ constraint_row_ind, + const int *__restrict__ constraint_col_ind, + double *__restrict__ constraint_matrix_val, + double *__restrict__ constraint_matrix_transpose_val, + const int *__restrict__ constraint_to_transpose_position, + const double *__restrict__ inverse_variable_rescaling, + const double *__restrict__ inverse_constraint_rescaling, int nnz); __global__ void compute_csr_row_absmax_kernel(const int *__restrict__ row_ptr, - const double *__restrict__ vals, + const double *__restrict__ matrix_vals, int num_rows, - double *__restrict__ out_max); + double *__restrict__ row_absmax_values); __global__ void compute_csr_row_powsum_kernel(const int *__restrict__ row_ptr, - const double *__restrict__ vals, + const double *__restrict__ matrix_vals, int num_rows, double degree, - double *__restrict__ out_sum); -__global__ void clamp_sqrt_and_accum_kernel(double *__restrict__ x, - double *__restrict__ inv_x, - double *__restrict__ cum, - int n_vars); + double *__restrict__ row_powsum_values); +__global__ void clamp_sqrt_and_accum_kernel(double *__restrict__ scaling_factors, + double *__restrict__ inverse_scaling_factors, + double *__restrict__ cumulative_rescaling, + int num_variables); __global__ void reduce_bound_norm_sq_kernel( - const double *__restrict__ L, - const double *__restrict__ U, - int n_cons, + const double *__restrict__ constraint_lower_bound, + const double *__restrict__ constraint_upper_bound, + int num_constraints, double *__restrict__ block_sums); __global__ void scale_bounds_kernel( - double *__restrict__ con_lb, - double *__restrict__ con_ub, - double *__restrict__ con_lb_finite, - double *__restrict__ con_ub_finite, - double *__restrict__ 
dual_start, - int n_cons, - double con_scale, - double obj_scale); + double *__restrict__ constraint_lower_bound, + double *__restrict__ constraint_upper_bound, + double *__restrict__ constraint_lower_bound_finite_val, + double *__restrict__ constraint_upper_bound_finite_val, + double *__restrict__ initial_dual_solution, + int num_constraints, + double constraint_scale, + double objective_scale); __global__ void scale_objective_kernel( - double *__restrict__ c, - double *__restrict__ var_lb, - double *__restrict__ var_ub, - double *__restrict__ var_lb_finite, - double *__restrict__ var_ub_finite, - double *__restrict__ primal_start, - int n_vars, - double con_scale, - double obj_scale); -__global__ void fill_ones_kernel(double *__restrict__ x, int n_vars); -static void scale_problem(pdhg_solver_state_t *state, double *E, double *D, double *invE, double *invD); + double *__restrict__ objective_vector, + double *__restrict__ variable_lower_bound, + double *__restrict__ variable_upper_bound, + double *__restrict__ variable_lower_bound_finite_val, + double *__restrict__ variable_upper_bound_finite_val, + double *__restrict__ initial_primal_solution, + int num_variables, + double constraint_scale, + double objective_scale); +__global__ void fill_ones_kernel(double *__restrict__ x, int num_variables); +static void scale_problem(pdhg_solver_state_t *state, double *constraint_rescaling, double *variable_rescaling, double *inverse_constraint_rescaling, double *inverse_variable_rescaling); static void ruiz_rescaling(pdhg_solver_state_t *state, int num_iters, rescale_info_t *rescale_info, - double *E, double *D, double *invE, double *invD); + double *constraint_rescaling, double *variable_rescaling, double *inverse_constraint_rescaling, double *inverse_variable_rescaling); static void pock_chambolle_rescaling(pdhg_solver_state_t *state, const double alpha, rescale_info_t *rescale_info, - double *E, double *D, double *invE, double *invD); + double *constraint_rescaling, double *variable_rescaling, double *inverse_constraint_rescaling, double *inverse_variable_rescaling); static void bound_objective_rescaling(pdhg_solver_state_t *state, rescale_info_t *rescale_info); static void scale_problem( pdhg_solver_state_t *state, - double *E, - double *D, - double *invE, - double *invD) + double *constraint_rescaling, + double *variable_rescaling, + double *inverse_constraint_rescaling, + double *inverse_variable_rescaling) { - int n_vars = state->num_variables; - int n_cons = state->num_constraints; + int num_variables = state->num_variables; + int num_constraints = state->num_constraints; scale_variables_kernel<<num_blocks_primal, THREADS_PER_BLOCK>>>( state->objective_vector, @@ -111,9 +111,9 @@ static void scale_problem( state->variable_lower_bound_finite_val, state->variable_upper_bound_finite_val, state->initial_primal_solution, - D, - invD, - n_vars); + variable_rescaling, + inverse_variable_rescaling, + num_variables); scale_constraints_kernel<<num_blocks_dual, THREADS_PER_BLOCK>>>( state->constraint_lower_bound, @@ -121,9 +121,9 @@ static void scale_problem( state->constraint_lower_bound_finite_val, state->constraint_upper_bound_finite_val, state->initial_dual_solution, - E, - invE, - n_cons); + constraint_rescaling, + inverse_constraint_rescaling, + num_constraints); scale_csr_nnz_kernel<<num_blocks_nnz, THREADS_PER_BLOCK>>>( state->constraint_matrix->row_ind, @@ -131,8 +131,8 @@ static void scale_problem( state->constraint_matrix->val, state->constraint_matrix_t->val, 
state->constraint_matrix->transpose_map, - invD, - invE, + inverse_variable_rescaling, + inverse_constraint_rescaling, state->constraint_matrix->num_nonzeros); } @@ -140,39 +140,39 @@ static void ruiz_rescaling( pdhg_solver_state_t *state, int num_iterations, rescale_info_t *rescale_info, - double *E, - double *D, - double *invE, - double *invD) + double *constraint_rescaling, + double *variable_rescaling, + double *inverse_constraint_rescaling, + double *inverse_variable_rescaling) { - const int n_cons = state->num_constraints; - const int n_vars = state->num_variables; + const int num_constraints = state->num_constraints; + const int num_variables = state->num_variables; for (int iter = 0; iter < num_iterations; ++iter) { compute_csr_row_absmax_kernel<<num_blocks_dual, THREADS_PER_BLOCK>>>( state->constraint_matrix->row_ptr, state->constraint_matrix->val, - n_cons, - E); + num_constraints, + constraint_rescaling); clamp_sqrt_and_accum_kernel<<num_blocks_dual, THREADS_PER_BLOCK>>>( - E, - invE, + constraint_rescaling, + inverse_constraint_rescaling, rescale_info->con_rescale, - n_cons); + num_constraints); compute_csr_row_absmax_kernel<<num_blocks_primal, THREADS_PER_BLOCK>>>( state->constraint_matrix_t->row_ptr, state->constraint_matrix_t->val, - n_vars, - D); + num_variables, + variable_rescaling); clamp_sqrt_and_accum_kernel<<num_blocks_primal, THREADS_PER_BLOCK>>>( - D, - invD, + variable_rescaling, + inverse_variable_rescaling, rescale_info->var_rescale, - n_vars); + num_variables); - scale_problem(state, E, D, invE, invD); + scale_problem(state, constraint_rescaling, variable_rescaling, inverse_constraint_rescaling, inverse_variable_rescaling); } } @@ -180,47 +180,47 @@ static void pock_chambolle_rescaling( pdhg_solver_state_t *state, const double alpha, rescale_info_t *rescale_info, - double *E, - double *D, - double *invE, - double *invD) + double *constraint_rescaling, + double *variable_rescaling, + double *inverse_constraint_rescaling, + double *inverse_variable_rescaling) { - const int n_cons = state->num_constraints; - const int n_vars = state->num_variables; + const int num_constraints = state->num_constraints; + const int num_variables = state->num_variables; compute_csr_row_powsum_kernel<<num_blocks_dual, THREADS_PER_BLOCK>>>( state->constraint_matrix->row_ptr, state->constraint_matrix->val, - n_cons, + num_constraints, alpha, - E); + constraint_rescaling); clamp_sqrt_and_accum_kernel<<num_blocks_dual, THREADS_PER_BLOCK>>>( - E, - invE, + constraint_rescaling, + inverse_constraint_rescaling, rescale_info->con_rescale, - n_cons); + num_constraints); compute_csr_row_powsum_kernel<<num_blocks_primal, THREADS_PER_BLOCK>>>( state->constraint_matrix_t->row_ptr, state->constraint_matrix_t->val, - n_vars, + num_variables, 2.0 - alpha, - D); + variable_rescaling); clamp_sqrt_and_accum_kernel<<num_blocks_primal, THREADS_PER_BLOCK>>>( - D, - invD, + variable_rescaling, + inverse_variable_rescaling, rescale_info->var_rescale, - n_vars); + num_variables); - scale_problem(state, E, D, invE, invD); + scale_problem(state, constraint_rescaling, variable_rescaling, inverse_constraint_rescaling, inverse_variable_rescaling); } static void bound_objective_rescaling( pdhg_solver_state_t *state, rescale_info_t *rescale_info) { - const int n_cons = state->num_constraints; - const int n_vars = state->num_variables; + const int num_constraints = state->num_constraints; + const int num_variables = state->num_variables; double *bnd_norm_sq_d = NULL; CUDA_CHECK(cudaMalloc(&bnd_norm_sq_d, 
sizeof(double))); @@ -228,7 +228,7 @@ static void bound_objective_rescaling( reduce_bound_norm_sq_kernel<<<1, THREADS_PER_BLOCK, THREADS_PER_BLOCK * sizeof(double)>>>( state->constraint_lower_bound, state->constraint_upper_bound, - n_cons, + num_constraints, bnd_norm_sq_d); double bnd_norm_sq_h = 0.0; @@ -242,8 +242,8 @@ static void bound_objective_rescaling( state->objective_vector, 1, &obj_norm)); - double con_scale = 1.0 / (bnd_norm + 1.0); - double obj_scale = 1.0 / (obj_norm + 1.0); + double constraint_scale = 1.0 / (bnd_norm + 1.0); + double objective_scale = 1.0 / (obj_norm + 1.0); scale_bounds_kernel<<num_blocks_dual, THREADS_PER_BLOCK>>>( state->constraint_lower_bound, @@ -251,9 +251,9 @@ static void bound_objective_rescaling( state->constraint_lower_bound_finite_val, state->constraint_upper_bound_finite_val, state->initial_dual_solution, - n_cons, - con_scale, - obj_scale); + num_constraints, + constraint_scale, + objective_scale); scale_objective_kernel<<num_blocks_primal, THREADS_PER_BLOCK>>>( state->objective_vector, @@ -262,12 +262,12 @@ static void bound_objective_rescaling( state->variable_lower_bound_finite_val, state->variable_upper_bound_finite_val, state->initial_primal_solution, - n_vars, - con_scale, - obj_scale); + num_variables, + constraint_scale, + objective_scale); - rescale_info->con_bound_rescale = con_scale; - rescale_info->obj_vec_rescale = obj_scale; + rescale_info->con_bound_rescale = constraint_scale; + rescale_info->obj_vec_rescale = objective_scale; } rescale_info_t *rescale_problem( @@ -276,33 +276,33 @@ rescale_info_t *rescale_problem( { printf("[Precondition] Start\n"); - int n_vars = state->num_variables; - int n_cons = state->num_constraints; + int num_variables = state->num_variables; + int num_constraints = state->num_constraints; clock_t start_rescaling = clock(); rescale_info_t *rescale_info = (rescale_info_t *)safe_calloc(1, sizeof(rescale_info_t)); - CUDA_CHECK(cudaMalloc(&rescale_info->con_rescale, n_cons * sizeof(double))); - CUDA_CHECK(cudaMalloc(&rescale_info->var_rescale, n_vars * sizeof(double))); + CUDA_CHECK(cudaMalloc(&rescale_info->con_rescale, num_constraints * sizeof(double))); + CUDA_CHECK(cudaMalloc(&rescale_info->var_rescale, num_variables * sizeof(double))); fill_ones_kernel<<num_blocks_dual, THREADS_PER_BLOCK>>>( - rescale_info->con_rescale, n_cons); + rescale_info->con_rescale, num_constraints); fill_ones_kernel<<num_blocks_primal, THREADS_PER_BLOCK>>>( - rescale_info->var_rescale, n_vars); + rescale_info->var_rescale, num_variables); - double *E = NULL, *D = NULL, *invE = NULL, *invD = NULL; - CUDA_CHECK(cudaMalloc(&E, n_cons * sizeof(double))); - CUDA_CHECK(cudaMalloc(&D, n_vars * sizeof(double))); - CUDA_CHECK(cudaMalloc(&invE, n_cons * sizeof(double))); - CUDA_CHECK(cudaMalloc(&invD, n_vars * sizeof(double))); + double *constraint_rescaling = NULL, *variable_rescaling = NULL, *inverse_constraint_rescaling = NULL, *inverse_variable_rescaling = NULL; + CUDA_CHECK(cudaMalloc(&constraint_rescaling, num_constraints * sizeof(double))); + CUDA_CHECK(cudaMalloc(&variable_rescaling, num_variables * sizeof(double))); + CUDA_CHECK(cudaMalloc(&inverse_constraint_rescaling, num_constraints * sizeof(double))); + CUDA_CHECK(cudaMalloc(&inverse_variable_rescaling, num_variables * sizeof(double))); if (params->l_inf_ruiz_iterations > 0) { printf("[Precondition] Ruiz scaling (%d iterations)\n", params->l_inf_ruiz_iterations); - ruiz_rescaling(state, params->l_inf_ruiz_iterations, rescale_info, E, D, invE, invD); + ruiz_rescaling(state, 
params->l_inf_ruiz_iterations, rescale_info, constraint_rescaling, variable_rescaling, inverse_constraint_rescaling, inverse_variable_rescaling); } if (params->has_pock_chambolle_alpha) { printf("[Precondition] Pock-Chambolle scaling (alpha=%.4f)\n", params->pock_chambolle_alpha); - pock_chambolle_rescaling(state, params->pock_chambolle_alpha, rescale_info, E, D, invE, invD); + pock_chambolle_rescaling(state, params->pock_chambolle_alpha, rescale_info, constraint_rescaling, variable_rescaling, inverse_constraint_rescaling, inverse_variable_rescaling); } rescale_info->con_bound_rescale = 1.0; @@ -315,83 +315,83 @@ rescale_info_t *rescale_problem( rescale_info->rescaling_time_sec = (double)(clock() - start_rescaling) / CLOCKS_PER_SEC; - CUDA_CHECK(cudaFree(E)); - CUDA_CHECK(cudaFree(D)); - CUDA_CHECK(cudaFree(invE)); - CUDA_CHECK(cudaFree(invD)); + CUDA_CHECK(cudaFree(constraint_rescaling)); + CUDA_CHECK(cudaFree(variable_rescaling)); + CUDA_CHECK(cudaFree(inverse_constraint_rescaling)); + CUDA_CHECK(cudaFree(inverse_variable_rescaling)); return rescale_info; } -__global__ void scale_variables_kernel(double *__restrict__ c, - double *__restrict__ var_lb, - double *__restrict__ var_ub, - double *__restrict__ var_lb_finite, - double *__restrict__ var_ub_finite, - double *__restrict__ primal_start, - const double *__restrict__ D, - const double *__restrict__ invD, - int n_vars) +__global__ void scale_variables_kernel(double *__restrict__ objective_vector, + double *__restrict__ variable_lower_bound, + double *__restrict__ variable_upper_bound, + double *__restrict__ variable_lower_bound_finite_val, + double *__restrict__ variable_upper_bound_finite_val, + double *__restrict__ initial_primal_solution, + const double *__restrict__ variable_rescaling, + const double *__restrict__ inverse_variable_rescaling, + int num_variables) { int j = blockIdx.x * blockDim.x + threadIdx.x; - if (j >= n_vars) + if (j >= num_variables) return; - double dj = D[j]; - double inv_dj = invD[j]; - c[j] *= inv_dj; - var_lb[j] *= dj; - var_ub[j] *= dj; - var_lb_finite[j] *= dj; - var_ub_finite[j] *= dj; - primal_start[j] *= dj; + double dj = variable_rescaling[j]; + double inv_dj = inverse_variable_rescaling[j]; + objective_vector[j] *= inv_dj; + variable_lower_bound[j] *= dj; + variable_upper_bound[j] *= dj; + variable_lower_bound_finite_val[j] *= dj; + variable_upper_bound_finite_val[j] *= dj; + initial_primal_solution[j] *= dj; } -__global__ void scale_constraints_kernel(double *__restrict__ con_lb, - double *__restrict__ con_ub, - double *__restrict__ con_lb_finite, - double *__restrict__ con_ub_finite, - double *__restrict__ dual_start, - const double *__restrict__ E, - const double *__restrict__ invE, - int n_cons) +__global__ void scale_constraints_kernel(double *__restrict__ constraint_lower_bound, + double *__restrict__ constraint_upper_bound, + double *__restrict__ constraint_lower_bound_finite_val, + double *__restrict__ constraint_upper_bound_finite_val, + double *__restrict__ initial_dual_solution, + const double *__restrict__ constraint_rescaling, + const double *__restrict__ inverse_constraint_rescaling, + int num_constraints) { int i = blockIdx.x * blockDim.x + threadIdx.x; - if (i >= n_cons) + if (i >= num_constraints) return; - double inv_ei = invE[i]; - double ei = E[i]; - con_lb[i] *= inv_ei; - con_ub[i] *= inv_ei; - con_lb_finite[i] *= inv_ei; - con_ub_finite[i] *= inv_ei; - dual_start[i] *= ei; + double inv_ei = inverse_constraint_rescaling[i]; + double ei = constraint_rescaling[i]; + 
constraint_lower_bound[i] *= inv_ei; + constraint_upper_bound[i] *= inv_ei; + constraint_lower_bound_finite_val[i] *= inv_ei; + constraint_upper_bound_finite_val[i] *= inv_ei; + initial_dual_solution[i] *= ei; } -__global__ void scale_csr_nnz_kernel(const int *__restrict__ row_ids, - const int *__restrict__ col_ind, - double *__restrict__ A_vals, - double *__restrict__ At_vals, - const int *__restrict__ A_to_At, - const double *__restrict__ invD, - const double *__restrict__ invE, +__global__ void scale_csr_nnz_kernel(const int *__restrict__ constraint_row_ind, + const int *__restrict__ constraint_col_ind, + double *__restrict__ constraint_matrix_val, + double *__restrict__ constraint_matrix_transpose_val, + const int *__restrict__ constraint_to_transpose_position, + const double *__restrict__ inverse_variable_rescaling, + const double *__restrict__ inverse_constraint_rescaling, int nnz) { for (int k = blockIdx.x * blockDim.x + threadIdx.x; k < nnz; k += gridDim.x * blockDim.x) { - int i = row_ids[k]; - int j = col_ind[k]; - double scale = invD[j] * invE[i]; - A_vals[k] *= scale; - At_vals[A_to_At[k]] *= scale; + int i = constraint_row_ind[k]; + int j = constraint_col_ind[k]; + double scale = inverse_variable_rescaling[j] * inverse_constraint_rescaling[i]; + constraint_matrix_val[k] *= scale; + constraint_matrix_transpose_val[constraint_to_transpose_position[k]] *= scale; } } __global__ void compute_csr_row_absmax_kernel(const int *__restrict__ row_ptr, - const double *__restrict__ vals, + const double *__restrict__ matrix_vals, int num_rows, - double *__restrict__ out_max) + double *__restrict__ row_absmax_values) { int i = blockIdx.x * blockDim.x + threadIdx.x; if (i >= num_rows) @@ -400,13 +400,13 @@ __global__ void compute_csr_row_absmax_kernel(const int *__restrict__ row_ptr, double m = 0.0; for (int k = s; k < e; ++k) { - double v = fabs(vals[k]); + double v = fabs(matrix_vals[k]); if (!isfinite(v)) v = 0.0; if (v > m) m = v; } - out_max[i] = m; + row_absmax_values[i] = m; } __device__ __forceinline__ double pow_fast(double v, double p) @@ -421,10 +421,10 @@ __device__ __forceinline__ double pow_fast(double v, double p) } __global__ void compute_csr_row_powsum_kernel(const int *__restrict__ row_ptr, - const double *__restrict__ vals, + const double *__restrict__ matrix_vals, int num_rows, double degree, - double *__restrict__ out_sum) + double *__restrict__ row_powsum_values) { int i = blockIdx.x * blockDim.x + threadIdx.x; if (i >= num_rows) @@ -433,33 +433,33 @@ __global__ void compute_csr_row_powsum_kernel(const int *__restrict__ row_ptr, double acc = 0.0; for (int k = s; k < e; ++k) { - double v = fabs(vals[k]); + double v = fabs(matrix_vals[k]); if (!isfinite(v)) v = 0.0; acc += pow_fast(v, degree); } - out_sum[i] = acc; + row_powsum_values[i] = acc; } -__global__ void clamp_sqrt_and_accum_kernel(double *__restrict__ x, - double *__restrict__ inv_x, - double *__restrict__ cum, - int n_vars) +__global__ void clamp_sqrt_and_accum_kernel(double *__restrict__ scaling_factors, + double *__restrict__ inverse_scaling_factors, + double *__restrict__ cumulative_rescaling, + int num_variables) { - for (int t = blockIdx.x * blockDim.x + threadIdx.x; t < n_vars; t += blockDim.x * gridDim.x) + for (int t = blockIdx.x * blockDim.x + threadIdx.x; t < num_variables; t += blockDim.x * gridDim.x) { - double v = x[t]; + double v = scaling_factors[t]; double s = sqrt(fmax(v, SCALING_EPSILON)); - cum[t] *= s; - x[t] = s; - inv_x[t] = 1.0 / s; + cumulative_rescaling[t] *= s; + scaling_factors[t] = s; 
+ inverse_scaling_factors[t] = 1.0 / s; } } __global__ void reduce_bound_norm_sq_kernel( - const double *__restrict__ L, - const double *__restrict__ U, - int n_cons, + const double *__restrict__ constraint_lower_bound, + const double *__restrict__ constraint_upper_bound, + int num_constraints, double *__restrict__ block_sums) { extern __shared__ double sdata[]; @@ -468,9 +468,9 @@ __global__ void reduce_bound_norm_sq_kernel( int stride = blockDim.x * gridDim.x; double acc = 0.0; - for (int i = global_tid; i < n_cons; i += stride) + for (int i = global_tid; i < num_constraints; i += stride) { - double Li = L[i], Ui = U[i]; + double Li = constraint_lower_bound[i], Ui = constraint_upper_bound[i]; bool fL = isfinite(Li), fU = isfinite(Ui); if (fL && (!fU || fabs(Li - Ui) > SCALING_EPSILON)) @@ -502,50 +502,50 @@ __global__ void reduce_bound_norm_sq_kernel( } __global__ void scale_bounds_kernel( - double *__restrict__ con_lb, - double *__restrict__ con_ub, - double *__restrict__ con_lb_finite, - double *__restrict__ con_ub_finite, - double *__restrict__ dual_start, - int n_cons, - double con_scale, - double obj_scale) + double *__restrict__ constraint_lower_bound, + double *__restrict__ constraint_upper_bound, + double *__restrict__ constraint_lower_bound_finite_val, + double *__restrict__ constraint_upper_bound_finite_val, + double *__restrict__ initial_dual_solution, + int num_constraints, + double constraint_scale, + double objective_scale) { int i = blockIdx.x * blockDim.x + threadIdx.x; - if (i >= n_cons) + if (i >= num_constraints) return; - con_lb[i] *= con_scale; - con_ub[i] *= con_scale; - con_lb_finite[i] *= con_scale; - con_ub_finite[i] *= con_scale; - dual_start[i] *= obj_scale; + constraint_lower_bound[i] *= constraint_scale; + constraint_upper_bound[i] *= constraint_scale; + constraint_lower_bound_finite_val[i] *= constraint_scale; + constraint_upper_bound_finite_val[i] *= constraint_scale; + initial_dual_solution[i] *= objective_scale; } __global__ void scale_objective_kernel( - double *__restrict__ c, - double *__restrict__ var_lb, - double *__restrict__ var_ub, - double *__restrict__ var_lb_finite, - double *__restrict__ var_ub_finite, - double *__restrict__ primal_start, - int n_vars, - double con_scale, - double obj_scale) + double *__restrict__ objective_vector, + double *__restrict__ variable_lower_bound, + double *__restrict__ variable_upper_bound, + double *__restrict__ variable_lower_bound_finite_val, + double *__restrict__ variable_upper_bound_finite_val, + double *__restrict__ initial_primal_solution, + int num_variables, + double constraint_scale, + double objective_scale) { int j = blockIdx.x * blockDim.x + threadIdx.x; - if (j >= n_vars) + if (j >= num_variables) return; - var_lb[j] *= con_scale; - var_ub[j] *= con_scale; - var_lb_finite[j] *= con_scale; - var_ub_finite[j] *= con_scale; - c[j] *= obj_scale; - primal_start[j] *= con_scale; + variable_lower_bound[j] *= constraint_scale; + variable_upper_bound[j] *= constraint_scale; + variable_lower_bound_finite_val[j] *= constraint_scale; + variable_upper_bound_finite_val[j] *= constraint_scale; + objective_vector[j] *= objective_scale; + initial_primal_solution[j] *= constraint_scale; } -__global__ void fill_ones_kernel(double *__restrict__ x, int n_vars) +__global__ void fill_ones_kernel(double *__restrict__ x, int num_variables) { int i = blockIdx.x * blockDim.x + threadIdx.x; - if (i < n_vars) + if (i < num_variables) x[i] = 1.0; } From 3917ee641f533d3331a8bede6f072b052a7326cc Mon Sep 17 00:00:00 2001 From: Bo 
Tang Date: Thu, 27 Nov 2025 19:27:37 -0500 Subject: [PATCH 10/20] Refactor: move finite-bound computation --- src/preconditioner.cu | 32 -------------------------------- src/solver.cu | 42 ++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 42 insertions(+), 32 deletions(-) diff --git a/src/preconditioner.cu b/src/preconditioner.cu index 1f711cf..fe087fc 100644 --- a/src/preconditioner.cu +++ b/src/preconditioner.cu @@ -27,16 +27,12 @@ limitations under the License. __global__ void scale_variables_kernel(double *__restrict__ objective_vector, double *__restrict__ variable_lower_bound, double *__restrict__ variable_upper_bound, - double *__restrict__ variable_lower_bound_finite_val, - double *__restrict__ variable_upper_bound_finite_val, double *__restrict__ initial_primal_solution, const double *__restrict__ variable_rescaling, const double *__restrict__ inverse_variable_rescaling, int num_variables); __global__ void scale_constraints_kernel(double *__restrict__ constraint_lower_bound, double *__restrict__ constraint_upper_bound, - double *__restrict__ constraint_lower_bound_finite_val, - double *__restrict__ constraint_upper_bound_finite_val, double *__restrict__ initial_dual_solution, const double *__restrict__ constraint_rescaling, const double *__restrict__ inverse_constraint_rescaling, @@ -70,8 +66,6 @@ __global__ void reduce_bound_norm_sq_kernel( __global__ void scale_bounds_kernel( double *__restrict__ constraint_lower_bound, double *__restrict__ constraint_upper_bound, - double *__restrict__ constraint_lower_bound_finite_val, - double *__restrict__ constraint_upper_bound_finite_val, double *__restrict__ initial_dual_solution, int num_constraints, double constraint_scale, @@ -80,8 +74,6 @@ __global__ void scale_objective_kernel( double *__restrict__ objective_vector, double *__restrict__ variable_lower_bound, double *__restrict__ variable_upper_bound, - double *__restrict__ variable_lower_bound_finite_val, - double *__restrict__ variable_upper_bound_finite_val, double *__restrict__ initial_primal_solution, int num_variables, double constraint_scale, @@ -108,8 +100,6 @@ static void scale_problem( state->objective_vector, state->variable_lower_bound, state->variable_upper_bound, - state->variable_lower_bound_finite_val, - state->variable_upper_bound_finite_val, state->initial_primal_solution, variable_rescaling, inverse_variable_rescaling, @@ -118,8 +108,6 @@ static void scale_problem( scale_constraints_kernel<<num_blocks_dual, THREADS_PER_BLOCK>>>( state->constraint_lower_bound, state->constraint_upper_bound, - state->constraint_lower_bound_finite_val, - state->constraint_upper_bound_finite_val, state->initial_dual_solution, constraint_rescaling, inverse_constraint_rescaling, @@ -248,8 +236,6 @@ static void bound_objective_rescaling( scale_bounds_kernel<<num_blocks_dual, THREADS_PER_BLOCK>>>( state->constraint_lower_bound, state->constraint_upper_bound, - state->constraint_lower_bound_finite_val, - state->constraint_upper_bound_finite_val, state->initial_dual_solution, num_constraints, constraint_scale, @@ -259,8 +245,6 @@ static void bound_objective_rescaling( state->objective_vector, state->variable_lower_bound, state->variable_upper_bound, - state->variable_lower_bound_finite_val, - state->variable_upper_bound_finite_val, state->initial_primal_solution, num_variables, constraint_scale, @@ -326,8 +310,6 @@ rescale_info_t *rescale_problem( __global__ void scale_variables_kernel(double *__restrict__ objective_vector, double *__restrict__ variable_lower_bound, double 
*__restrict__ variable_upper_bound, - double *__restrict__ variable_lower_bound_finite_val, - double *__restrict__ variable_upper_bound_finite_val, double *__restrict__ initial_primal_solution, const double *__restrict__ variable_rescaling, const double *__restrict__ inverse_variable_rescaling, @@ -341,15 +323,11 @@ __global__ void scale_variables_kernel(double *__restrict__ objective_vector, objective_vector[j] *= inv_dj; variable_lower_bound[j] *= dj; variable_upper_bound[j] *= dj; - variable_lower_bound_finite_val[j] *= dj; - variable_upper_bound_finite_val[j] *= dj; initial_primal_solution[j] *= dj; } __global__ void scale_constraints_kernel(double *__restrict__ constraint_lower_bound, double *__restrict__ constraint_upper_bound, - double *__restrict__ constraint_lower_bound_finite_val, - double *__restrict__ constraint_upper_bound_finite_val, double *__restrict__ initial_dual_solution, const double *__restrict__ constraint_rescaling, const double *__restrict__ inverse_constraint_rescaling, @@ -362,8 +340,6 @@ __global__ void scale_constraints_kernel(double *__restrict__ constraint_lower_b double ei = constraint_rescaling[i]; constraint_lower_bound[i] *= inv_ei; constraint_upper_bound[i] *= inv_ei; - constraint_lower_bound_finite_val[i] *= inv_ei; - constraint_upper_bound_finite_val[i] *= inv_ei; initial_dual_solution[i] *= ei; } @@ -504,8 +480,6 @@ __global__ void reduce_bound_norm_sq_kernel( __global__ void scale_bounds_kernel( double *__restrict__ constraint_lower_bound, double *__restrict__ constraint_upper_bound, - double *__restrict__ constraint_lower_bound_finite_val, - double *__restrict__ constraint_upper_bound_finite_val, double *__restrict__ initial_dual_solution, int num_constraints, double constraint_scale, @@ -516,8 +490,6 @@ __global__ void scale_bounds_kernel( return; constraint_lower_bound[i] *= constraint_scale; constraint_upper_bound[i] *= constraint_scale; - constraint_lower_bound_finite_val[i] *= constraint_scale; - constraint_upper_bound_finite_val[i] *= constraint_scale; initial_dual_solution[i] *= objective_scale; } @@ -525,8 +497,6 @@ __global__ void scale_objective_kernel( double *__restrict__ objective_vector, double *__restrict__ variable_lower_bound, double *__restrict__ variable_upper_bound, - double *__restrict__ variable_lower_bound_finite_val, - double *__restrict__ variable_upper_bound_finite_val, double *__restrict__ initial_primal_solution, int num_variables, double constraint_scale, @@ -537,8 +507,6 @@ __global__ void scale_objective_kernel( return; variable_lower_bound[j] *= constraint_scale; variable_upper_bound[j] *= constraint_scale; - variable_lower_bound_finite_val[j] *= constraint_scale; - variable_upper_bound_finite_val[j] *= constraint_scale; objective_vector[j] *= objective_scale; initial_primal_solution[j] *= constraint_scale; } diff --git a/src/solver.cu b/src/solver.cu index d2a6a5a..b54ae01 100644 --- a/src/solver.cu +++ b/src/solver.cu @@ -37,6 +37,11 @@ __global__ void build_transpose_map(const int *__restrict__ A_row_ind, const int *__restrict__ At_col_ind, int nnz, int *__restrict__ A_to_At); +__global__ void fill_finite_bounds_kernel(const double *__restrict__ lower_bound, + const double *__restrict__ upper_bound, + double *__restrict__ lower_bound_finite_val, + double *__restrict__ upper_bound_finite_val, + int num_elements); __global__ void compute_next_pdhg_primal_solution_kernel(const double *current_primal, double *reflected_primal, const double *dual_product, @@ -402,6 +407,25 @@ static pdhg_solver_state_t 
*initialize_solver_state( rescale_info->var_rescale = NULL; rescale_info_free(rescale_info); + CUDA_CHECK(cudaMalloc(&state->constraint_lower_bound_finite_val, con_bytes)); + CUDA_CHECK(cudaMalloc(&state->constraint_upper_bound_finite_val, con_bytes)); + CUDA_CHECK(cudaMalloc(&state->variable_lower_bound_finite_val, var_bytes)); + CUDA_CHECK(cudaMalloc(&state->variable_upper_bound_finite_val, var_bytes)); + + fill_finite_bounds_kernel<<num_blocks_dual, THREADS_PER_BLOCK>>>( + state->constraint_lower_bound, + state->constraint_upper_bound, + state->constraint_lower_bound_finite_val, + state->constraint_upper_bound_finite_val, + n_cons); + + fill_finite_bounds_kernel<<num_blocks_primal, THREADS_PER_BLOCK>>>( + state->variable_lower_bound, + state->variable_upper_bound, + state->variable_lower_bound_finite_val, + state->variable_upper_bound_finite_val, + n_vars); + CUDA_CHECK(cudaFree(state->constraint_matrix->row_ind)); state->constraint_matrix->row_ind = NULL; CUDA_CHECK(cudaFree(state->constraint_matrix_t->row_ind)); @@ -571,6 +595,24 @@ __global__ void build_transpose_map( A_to_At[k] = pos; } +__global__ void fill_finite_bounds_kernel( + const double *__restrict__ lb, + const double *__restrict__ ub, + double *__restrict__ lb_finite, + double *__restrict__ ub_finite, + int num_elements) +{ + int i = blockIdx.x * blockDim.x + threadIdx.x; + if (i >= num_elements) + return; + + double Li = lb[i]; + double Ui = ub[i]; + + lb_finite[i] = isfinite(Li) ? Li : 0.0; + ub_finite[i] = isfinite(Ui) ? Ui : 0.0; +} + __global__ void compute_next_pdhg_primal_solution_kernel( const double *current_primal, double *reflected_primal, const double *dual_product, const double *objective, const double *var_lb, From dddf0dfde409c1ee155c73c804744a542ffbc4e8 Mon Sep 17 00:00:00 2001 From: LucasBoTang Date: Fri, 28 Nov 2025 22:47:19 -0500 Subject: [PATCH 11/20] Numerical issue: back to old version --- src/preconditioner.cu | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/preconditioner.cu b/src/preconditioner.cu index fe087fc..0151334 100644 --- a/src/preconditioner.cu +++ b/src/preconditioner.cu @@ -22,7 +22,7 @@ limitations under the License. #include #include -#define SCALING_EPSILON 1e-8 +#define SCALING_EPSILON 1e-12 __global__ void scale_variables_kernel(double *__restrict__ objective_vector, double *__restrict__ variable_lower_bound, @@ -425,7 +425,7 @@ __global__ void clamp_sqrt_and_accum_kernel(double *__restrict__ scaling_factors for (int t = blockIdx.x * blockDim.x + threadIdx.x; t < num_variables; t += blockDim.x * gridDim.x) { double v = scaling_factors[t]; - double s = sqrt(fmax(v, SCALING_EPSILON)); + double s = (v < SCALING_EPSILON) ? 1.0 : sqrt(v); cumulative_rescaling[t] *= s; scaling_factors[t] = s; inverse_scaling_factors[t] = 1.0 / s; From a2651be01fe84f7256a556cee7d5fe3eec47ee31 Mon Sep 17 00:00:00 2001 From: bolucastang Date: Fri, 12 Dec 2025 01:30:14 -0500 Subject: [PATCH 12/20] Code refactor: CUB DeviceReduce for bound norm --- src/preconditioner.cu | 74 +++++++++++++++++++------------------------ 1 file changed, 32 insertions(+), 42 deletions(-) diff --git a/src/preconditioner.cu b/src/preconditioner.cu index 0151334..b628253 100644 --- a/src/preconditioner.cu +++ b/src/preconditioner.cu @@ -21,6 +21,7 @@ limitations under the License. 
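[Editor's note — not part of the patch] Patch 12/20 below replaces the hand-written shared-memory reduction for the constraint-bound norm with CUB's device-wide sum. cub::DeviceReduce::Sum is always called twice: the first call, made with a null temporary-storage pointer, only reports how many bytes of scratch space the reduction needs; the second call performs the reduction. A minimal, self-contained sketch of that idiom follows; the array size and variable names are illustrative and not taken from the solver.

#include <cub/device/device_reduce.cuh>
#include <cuda_runtime.h>
#include <cstdio>
#include <vector>

int main()
{
    const int n = 1 << 20;
    std::vector<double> host(n, 1.0);              // expected sum: n

    double *d_in = nullptr, *d_out = nullptr;
    cudaMalloc(&d_in, n * sizeof(double));
    cudaMalloc(&d_out, sizeof(double));
    cudaMemcpy(d_in, host.data(), n * sizeof(double), cudaMemcpyHostToDevice);

    // Pass 1: with a null scratch pointer, CUB only writes the required
    // temporary-storage size into temp_bytes and returns.
    void *d_temp = nullptr;
    size_t temp_bytes = 0;
    cub::DeviceReduce::Sum(d_temp, temp_bytes, d_in, d_out, n);

    // Pass 2: allocate the scratch buffer and run the actual reduction.
    cudaMalloc(&d_temp, temp_bytes);
    cub::DeviceReduce::Sum(d_temp, temp_bytes, d_in, d_out, n);

    double sum = 0.0;
    cudaMemcpy(&sum, d_out, sizeof(double), cudaMemcpyDeviceToHost);
    printf("sum = %.1f (expected %d)\n", sum, n);

    cudaFree(d_temp);
    cudaFree(d_out);
    cudaFree(d_in);
    return 0;
}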
#include #include #include +#include #define SCALING_EPSILON 1e-12 @@ -58,11 +59,11 @@ __global__ void clamp_sqrt_and_accum_kernel(double *__restrict__ scaling_factors double *__restrict__ inverse_scaling_factors, double *__restrict__ cumulative_rescaling, int num_variables); -__global__ void reduce_bound_norm_sq_kernel( +__global__ void compute_bound_contrib_kernel( const double *__restrict__ constraint_lower_bound, const double *__restrict__ constraint_upper_bound, int num_constraints, - double *__restrict__ block_sums); + double *__restrict__ contrib); __global__ void scale_bounds_kernel( double *__restrict__ constraint_lower_bound, double *__restrict__ constraint_upper_bound, @@ -210,14 +211,23 @@ static void bound_objective_rescaling( const int num_constraints = state->num_constraints; const int num_variables = state->num_variables; - double *bnd_norm_sq_d = NULL; - CUDA_CHECK(cudaMalloc(&bnd_norm_sq_d, sizeof(double))); - CUDA_CHECK(cudaMemset(bnd_norm_sq_d, 0, sizeof(double))); - reduce_bound_norm_sq_kernel<<<1, THREADS_PER_BLOCK, THREADS_PER_BLOCK * sizeof(double)>>>( + double *contrib_d = nullptr; + CUDA_CHECK(cudaMalloc(&contrib_d, num_constraints * sizeof(double))); + compute_bound_contrib_kernel<<num_blocks_dual, THREADS_PER_BLOCK>>>( state->constraint_lower_bound, state->constraint_upper_bound, num_constraints, - bnd_norm_sq_d); + contrib_d); + + double *bnd_norm_sq_d = nullptr; + CUDA_CHECK(cudaMalloc(&bnd_norm_sq_d, sizeof(double)));; + void *temp_storage = nullptr; + size_t temp_bytes = 0; + CUDA_CHECK(cub::DeviceReduce::Sum(temp_storage, temp_bytes, contrib_d, bnd_norm_sq_d, num_constraints)); + CUDA_CHECK(cudaMalloc(&temp_storage, temp_bytes)); + CUDA_CHECK(cub::DeviceReduce::Sum(temp_storage, temp_bytes, contrib_d, bnd_norm_sq_d, num_constraints)); + CUDA_CHECK(cudaFree(contrib_d)); + CUDA_CHECK(cudaFree(temp_storage)); double bnd_norm_sq_h = 0.0; CUDA_CHECK(cudaMemcpy(&bnd_norm_sq_h, bnd_norm_sq_d, sizeof(double), cudaMemcpyDeviceToHost)); @@ -432,49 +442,29 @@ __global__ void clamp_sqrt_and_accum_kernel(double *__restrict__ scaling_factors } } -__global__ void reduce_bound_norm_sq_kernel( +__global__ void compute_bound_contrib_kernel( const double *__restrict__ constraint_lower_bound, const double *__restrict__ constraint_upper_bound, int num_constraints, - double *__restrict__ block_sums) + double *__restrict__ contrib) { - extern __shared__ double sdata[]; - int tid = threadIdx.x; - int global_tid = blockIdx.x * blockDim.x + tid; - int stride = blockDim.x * gridDim.x; + int i = blockIdx.x * blockDim.x + threadIdx.x; + if (i >= num_constraints) return; + + double Li = constraint_lower_bound[i]; + double Ui = constraint_upper_bound[i]; + bool fL = isfinite(Li); + bool fU = isfinite(Ui); double acc = 0.0; - for (int i = global_tid; i < num_constraints; i += stride) - { - double Li = constraint_lower_bound[i], Ui = constraint_upper_bound[i]; - bool fL = isfinite(Li), fU = isfinite(Ui); - - if (fL && (!fU || fabs(Li - Ui) > SCALING_EPSILON)) - { - acc += Li * Li; - } - if (fU) - { - acc += Ui * Ui; - } - } - sdata[tid] = acc; - __syncthreads(); + // follow the existing semantics + if (fL && (!fU || fabs(Li - Ui) > SCALING_EPSILON)) + acc += Li * Li; + if (fU) + acc += Ui * Ui; - for (int s = blockDim.x / 2; s > 0; s >>= 1) - { - if (tid < s) - { - sdata[tid] += sdata[tid + s]; - } - __syncthreads(); - } - - if (tid == 0) - { - block_sums[blockIdx.x] = sdata[0]; - } + contrib[i] = acc; } __global__ void scale_bounds_kernel( From 2f7b90ca14fc11188a549c0ec907bd1e9b09ac90 Mon 
Sep 17 00:00:00 2001 From: Bo Tang Date: Tue, 16 Dec 2025 14:58:09 -0500 Subject: [PATCH 13/20] Bug fixed: typo --- src/cli.c | 31 ++++++++++++++++--------------- 1 file changed, 16 insertions(+), 15 deletions(-) diff --git a/src/cli.c b/src/cli.c index f0bfe4e..b42f591 100644 --- a/src/cli.c +++ b/src/cli.c @@ -102,21 +102,6 @@ void save_solver_summary(const cupdlpx_result_t *result, const char *output_dir, } fprintf(outfile, "Termination Reason: %s\n", termination_reason_to_string(result->termination_reason)); - fprintf(outfile, "Runtime (sec): %e\n", result->cumulative_time_sec); - fprintf(outfile, "Iterations Count: %d\n", result->total_count); - fprintf(outfile, "Primal Objective Value: %e\n", - result->primal_objective_value); - fprintf(outfile, "Dual Objective Value: %e\n", result->dual_objective_value); - fprintf(outfile, "Relative Primal Residual: %e\n", - result->relative_primal_residual); - fprintf(outfile, "Relative Dual Residual: %e\n", - result->relative_dual_residual); - fprintf(outfile, "Absolute Objective Gap: %e\n", result->objective_gap); - fprintf(outfile, "Relative Objective Gap: %e\n", - result->relative_objective_gap); - fprintf(outfile, "Rows: %d\n", result->num_constraints); - fprintf(outfile, "Columns: %d\n", result->num_variables); - fprintf(outfile, "Nonzeros: %d\n", result->num_nonzeros); if (result->presolve_time > 0.0) { fprintf(outfile, "Presolve Status: %s\n", get_presolve_status_str(result->presolve_status)); @@ -142,6 +127,22 @@ void save_solver_summary(const cupdlpx_result_t *result, const char *output_dir, // fprintf(outfile, "Postsolve Time (sec): %e\n", result->presolve_stats.time_postsolve); // } } + fprintf(outfile, "Precondition time (sec): %e\n", result->rescaling_time_sec); + fprintf(outfile, "Runtime (sec): %e\n", result->cumulative_time_sec); + fprintf(outfile, "Iterations Count: %d\n", result->total_count); + fprintf(outfile, "Primal Objective Value: %e\n", + result->primal_objective_value); + fprintf(outfile, "Dual Objective Value: %e\n", result->dual_objective_value); + fprintf(outfile, "Relative Primal Residual: %e\n", + result->relative_primal_residual); + fprintf(outfile, "Relative Dual Residual: %e\n", + result->relative_dual_residual); + fprintf(outfile, "Absolute Objective Gap: %e\n", result->objective_gap); + fprintf(outfile, "Relative Objective Gap: %e\n", + result->relative_objective_gap); + fprintf(outfile, "Rows: %d\n", result->num_constraints); + fprintf(outfile, "Columns: %d\n", result->num_variables); + fprintf(outfile, "Nonzeros: %d\n", result->num_nonzeros); if (result->feasibility_polishing_time > 0.0) { fprintf(outfile, "Feasibility Polishing Time (sec): %e\n", result->feasibility_polishing_time); From 90e23535dc4468f9e786fb85afd1a7fff71c2f42 Mon Sep 17 00:00:00 2001 From: bolucastang Date: Wed, 17 Dec 2025 02:22:30 -0500 Subject: [PATCH 14/20] Clean code: Limit CUB headers to device_reduce to reduce compile time --- src/preconditioner.cu | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/preconditioner.cu b/src/preconditioner.cu index b628253..ea1f226 100644 --- a/src/preconditioner.cu +++ b/src/preconditioner.cu @@ -21,7 +21,7 @@ limitations under the License. 
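[Editor's note — not part of the patch] This patch (14/20) narrows the CUB include in src/preconditioner.cu to the single component the rescaling code uses: only cub::DeviceReduce::Sum is called from bound_objective_rescaling, so pulling in just the device-reduce header avoids compiling the whole library. Assuming the original line was the umbrella header (the bracketed header names are a reconstruction, not visible in this text), the change is conceptually:

// before: the umbrella header, all of CUB gets compiled into preconditioner.cu
#include <cub/cub.cuh>
// after: only the device-wide reduction used by the bound-norm computation
#include <cub/device/device_reduce.cuh>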
#include #include #include -#include +#include #define SCALING_EPSILON 1e-12 From f213b5e94099ba146383a8a69f42451a5db66801 Mon Sep 17 00:00:00 2001 From: bolucastang Date: Mon, 12 Jan 2026 19:15:25 -0500 Subject: [PATCH 15/20] Clean code --- internal/utils.h | 2 -- src/preconditioner.cu | 2 +- src/solver.cu | 30 ++++-------------------------- src/utils.cu | 30 +++++++----------------------- 4 files changed, 12 insertions(+), 52 deletions(-) diff --git a/internal/utils.h b/internal/utils.h index 548d61f..1b6b7c9 100644 --- a/internal/utils.h +++ b/internal/utils.h @@ -138,8 +138,6 @@ extern "C" void compute_dual_feas_polish_residual(pdhg_solver_state_t *state, const pdhg_solver_state_t *ori_state, norm_type_t optimality_norm); void set_default_parameters(pdhg_parameters_t *params); - - int* build_row_ind_from_row_ptr(int* row_ptr, int num_rows, int nnz); #ifdef __cplusplus } diff --git a/src/preconditioner.cu b/src/preconditioner.cu index ea1f226..ab4b209 100644 --- a/src/preconditioner.cu +++ b/src/preconditioner.cu @@ -220,7 +220,7 @@ static void bound_objective_rescaling( contrib_d); double *bnd_norm_sq_d = nullptr; - CUDA_CHECK(cudaMalloc(&bnd_norm_sq_d, sizeof(double)));; + CUDA_CHECK(cudaMalloc(&bnd_norm_sq_d, sizeof(double))); void *temp_storage = nullptr; size_t temp_bytes = 0; CUDA_CHECK(cub::DeviceReduce::Sum(temp_storage, temp_bytes, contrib_d, bnd_norm_sq_d, num_constraints)); diff --git a/src/solver.cu b/src/solver.cu index b54ae01..044b44b 100644 --- a/src/solver.cu +++ b/src/solver.cu @@ -165,8 +165,7 @@ cupdlpx_result_t *optimize(const pdhg_parameters_t *params, compute_infeasibility_information(state); } - state->cumulative_time_sec = - (double)(clock() - start_time) / CLOCKS_PER_SEC; + state->cumulative_time_sec = (double)(clock() - start_time) / CLOCKS_PER_SEC; check_termination_criteria(state, ¶ms->termination_criteria); display_iteration_stats(state, params->verbose); @@ -270,11 +269,11 @@ static pdhg_solver_state_t *initialize_solver_state( state->constraint_matrix->num_rows = n_cons; state->constraint_matrix->num_cols = n_vars; - state->constraint_matrix->num_nonzeros = working_problem->constraint_matrix_num_nonzeros; + state->constraint_matrix->num_nonzeros = nnz; state->constraint_matrix_t->num_rows = n_vars; state->constraint_matrix_t->num_cols = n_cons; - state->constraint_matrix_t->num_nonzeros = working_problem->constraint_matrix_num_nonzeros; + state->constraint_matrix_t->num_nonzeros = nnz; state->termination_reason = TERMINATION_REASON_UNSPECIFIED; @@ -338,28 +337,7 @@ static pdhg_solver_state_t *initialize_solver_state( nnz, state->constraint_matrix->transpose_map); CUDA_CHECK(cudaGetLastError()); - - ALLOC_AND_COPY(state->variable_lower_bound, working_problem->variable_lower_bound, var_bytes); - ALLOC_AND_COPY(state->variable_upper_bound, working_problem->variable_upper_bound, var_bytes); - ALLOC_AND_COPY(state->objective_vector, working_problem->objective_vector, var_bytes); - ALLOC_AND_COPY(state->constraint_lower_bound, working_problem->constraint_lower_bound, con_bytes); - ALLOC_AND_COPY(state->constraint_upper_bound, working_problem->constraint_upper_bound, con_bytes); - - double *temp_host = (double *)safe_malloc(fmax(var_bytes, con_bytes)); - for (int i = 0; i < n_cons; ++i) - temp_host[i] = isfinite(working_problem->constraint_lower_bound[i]) ? 
working_problem->constraint_lower_bound[i] : 0.0; - ALLOC_AND_COPY(state->constraint_lower_bound_finite_val, temp_host, con_bytes); - for (int i = 0; i < n_cons; ++i) - temp_host[i] = isfinite(working_problem->constraint_upper_bound[i]) ? working_problem->constraint_upper_bound[i] : 0.0; - ALLOC_AND_COPY(state->constraint_upper_bound_finite_val, temp_host, con_bytes); - for (int i = 0; i < n_vars; ++i) - temp_host[i] = isfinite(working_problem->variable_lower_bound[i]) ? working_problem->variable_lower_bound[i] : 0.0; - ALLOC_AND_COPY(state->variable_lower_bound_finite_val, temp_host, var_bytes); - for (int i = 0; i < n_vars; ++i) - temp_host[i] = isfinite(working_problem->variable_upper_bound[i]) ? working_problem->variable_upper_bound[i] : 0.0; - ALLOC_AND_COPY(state->variable_upper_bound_finite_val, temp_host, var_bytes); - free(temp_host); - + #define ALLOC_ZERO(dest, bytes) \ CUDA_CHECK(cudaMalloc(&dest, bytes)); \ CUDA_CHECK(cudaMemset(dest, 0, bytes)); diff --git a/src/utils.cu b/src/utils.cu index 347b879..6961935 100644 --- a/src/utils.cu +++ b/src/utils.cu @@ -728,7 +728,7 @@ void compute_residual(pdhg_solver_state_t *state, norm_type_t optimality_norm) state->num_variables, state->dual_residual); } else { state->absolute_primal_residual /= state->constraint_bound_rescaling; - CUBLAS_CHECK(cublasDnrm2_v2_64(state->blas_handle, state->num_variables, + CUBLAS_CHECK(cublasDnrm2_v2_64(state->blas_handle, state->num_variables, state->dual_residual, 1, &state->absolute_dual_residual)); } @@ -1214,7 +1214,7 @@ __global__ void compute_primal_feas_polish_residual_kernel( } } -__global__ void compute_dual_feas_polish_residual_kerenl( +__global__ void compute_dual_feas_polish_residual_kernel( double *dual_residual, const double *dual_solution, const double *dual_product, @@ -1242,8 +1242,8 @@ __global__ void compute_dual_feas_polish_residual_kerenl( void compute_primal_feas_polish_residual(pdhg_solver_state_t *state, const pdhg_solver_state_t *ori_state, norm_type_t optimality_norm) { - cusparseDnVecSetValues(state->vec_primal_sol, state->pdhg_primal_solution); - cusparseDnVecSetValues(state->vec_primal_prod, state->primal_product); + CUSPARSE_CHECK(cusparseDnVecSetValues(state->vec_primal_sol, state->pdhg_primal_solution)); + CUSPARSE_CHECK(cusparseDnVecSetValues(state->vec_primal_prod, state->primal_product)); CUSPARSE_CHECK(cusparseSpMV(state->sparse_handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &HOST_ONE, state->matA, state->vec_primal_sol, &HOST_ZERO, state->vec_primal_prod, CUDA_R_64F, CUSPARSE_SPMV_CSR_ALG2, state->primal_spmv_buffer)); @@ -1271,12 +1271,12 @@ void compute_primal_feas_polish_residual(pdhg_solver_state_t *state, const pdhg_ void compute_dual_feas_polish_residual(pdhg_solver_state_t *state, const pdhg_solver_state_t *ori_state, norm_type_t optimality_norm) { - cusparseDnVecSetValues(state->vec_dual_sol, state->pdhg_dual_solution); - cusparseDnVecSetValues(state->vec_dual_prod, state->dual_product); + CUSPARSE_CHECK(cusparseDnVecSetValues(state->vec_dual_sol, state->pdhg_dual_solution)); + CUSPARSE_CHECK(cusparseDnVecSetValues(state->vec_dual_prod, state->dual_product)); CUSPARSE_CHECK(cusparseSpMV(state->sparse_handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &HOST_ONE, state->matAt, state->vec_dual_sol, &HOST_ZERO, state->vec_dual_prod, CUDA_R_64F, CUSPARSE_SPMV_CSR_ALG2, state->dual_spmv_buffer)); - compute_dual_feas_polish_residual_kerenl<<num_blocks_primal_dual, THREADS_PER_BLOCK>>>( + compute_dual_feas_polish_residual_kernel<<num_blocks_primal_dual, THREADS_PER_BLOCK>>>( 
state->dual_residual, state->pdhg_dual_solution, state->dual_product, @@ -1304,19 +1304,3 @@ void compute_dual_feas_polish_residual(pdhg_solver_state_t *state, const pdhg_so double dual_slack_sum = get_vector_sum(state->blas_handle, state->num_constraints, state->ones_dual_d, state->primal_slack); state->dual_objective_value = (base_dual_objective + dual_slack_sum) / (state->constraint_bound_rescaling * state->objective_vector_rescaling) + state->objective_constant; } - -int* build_row_ind_from_row_ptr(int* row_ptr, int num_rows, int nnz) -{ - if (!row_ptr || num_rows < 0 || nnz <= 0) return NULL; - - int* row_ind = (int*)safe_malloc(nnz * sizeof(int)); - for (int i = 0; i < num_rows; ++i) { - int s = row_ptr[i]; - int e = row_ptr[i + 1]; - for (int k = s; k < e; ++k) { - row_ind[k] = i; - } - } - return row_ind; -} - From 4bd6ba863b4960b599536761943b113b1cc7cd4e Mon Sep 17 00:00:00 2001 From: bolucastang Date: Mon, 12 Jan 2026 22:02:00 -0500 Subject: [PATCH 16/20] Bug fixed: warm start solution --- src/solver.cu | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/src/solver.cu b/src/solver.cu index 044b44b..4b57b36 100644 --- a/src/solver.cu +++ b/src/solver.cu @@ -385,6 +385,19 @@ static pdhg_solver_state_t *initialize_solver_state( rescale_info->var_rescale = NULL; rescale_info_free(rescale_info); + CUDA_CHECK(cudaMemcpy(state->current_primal_solution, + state->initial_primal_solution, + var_bytes, cudaMemcpyDeviceToDevice)); + CUDA_CHECK(cudaMemcpy(state->current_dual_solution, + state->initial_dual_solution, + con_bytes, cudaMemcpyDeviceToDevice)); + CUDA_CHECK(cudaMemcpy(state->pdhg_primal_solution, + state->initial_primal_solution, + var_bytes, cudaMemcpyDeviceToDevice)); + CUDA_CHECK(cudaMemcpy(state->pdhg_dual_solution, + state->initial_dual_solution, + con_bytes, cudaMemcpyDeviceToDevice)); + CUDA_CHECK(cudaMalloc(&state->constraint_lower_bound_finite_val, con_bytes)); CUDA_CHECK(cudaMalloc(&state->constraint_upper_bound_finite_val, con_bytes)); CUDA_CHECK(cudaMalloc(&state->variable_lower_bound_finite_val, var_bytes)); From d5e5be1bccdd8a12763d587c2aa21cb90629e9f1 Mon Sep 17 00:00:00 2001 From: bolucastang Date: Mon, 12 Jan 2026 22:11:58 -0500 Subject: [PATCH 17/20] Bug fixed: illegal memory access in bound --- src/solver.cu | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/src/solver.cu b/src/solver.cu index 4b57b36..7080de7 100644 --- a/src/solver.cu +++ b/src/solver.cu @@ -337,6 +337,12 @@ static pdhg_solver_state_t *initialize_solver_state( nnz, state->constraint_matrix->transpose_map); CUDA_CHECK(cudaGetLastError()); + + ALLOC_AND_COPY(state->variable_lower_bound, working_problem->variable_lower_bound, var_bytes); + ALLOC_AND_COPY(state->variable_upper_bound, working_problem->variable_upper_bound, var_bytes); + ALLOC_AND_COPY(state->objective_vector, working_problem->objective_vector, var_bytes); + ALLOC_AND_COPY(state->constraint_lower_bound, working_problem->constraint_lower_bound, con_bytes); + ALLOC_AND_COPY(state->constraint_upper_bound, working_problem->constraint_upper_bound, con_bytes); #define ALLOC_ZERO(dest, bytes) \ CUDA_CHECK(cudaMalloc(&dest, bytes)); \ From d372a2078f4c8d899b7ff33c157f405e4ed5f881 Mon Sep 17 00:00:00 2001 From: bolucastang Date: Mon, 12 Jan 2026 23:12:47 -0500 Subject: [PATCH 18/20] Bug fixed: norm computation on scaled bound --- src/solver.cu | 158 +++++++++++++++++++++++++++++++++++--------------- 1 file changed, 112 insertions(+), 46 deletions(-) diff --git a/src/solver.cu b/src/solver.cu index 7080de7..c488877 100644 
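[Editor's note — not part of the patch] This patch (18/20) recomputes the objective and constraint-bound norms from the already-rescaled device arrays instead of the original host problem. The L_inf branch relies on cublasIdamax, which returns the 1-based position of the entry with the largest absolute value — hence the "max_idx - 1" offset when the winning element is copied back to the host. A minimal standalone illustration of that convention (the input values are arbitrary test data):

#include <cublas_v2.h>
#include <cuda_runtime.h>
#include <cstdio>
#include <cmath>

int main()
{
    const double h[5] = {1.0, -7.5, 3.0, 2.0, -4.0};  // largest |value| is 7.5 at zero-based index 1
    double *d = nullptr;
    cudaMalloc(&d, sizeof(h));
    cudaMemcpy(d, h, sizeof(h), cudaMemcpyHostToDevice);

    cublasHandle_t handle;
    cublasCreate(&handle);

    int idx = 0;                       // cuBLAS reports a 1-based position (here: 2)
    cublasIdamax(handle, 5, d, 1, &idx);

    double v = 0.0;                    // convert back to 0-based before indexing
    cudaMemcpy(&v, d + (idx - 1), sizeof(double), cudaMemcpyDeviceToHost);
    printf("linf = %f (1-based index %d)\n", fabs(v), idx);

    cublasDestroy(handle);
    cudaFree(d);
    return 0;
}

The L2 branch of the patch stores per-row squared contributions and sums them with cublasDasum; since those contributions are non-negative, the sum of absolute values coincides with the plain sum of squares.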
--- a/src/solver.cu +++ b/src/solver.cu @@ -29,7 +29,7 @@ limitations under the License. #include __global__ void build_row_ind(const int *__restrict__ row_ptr, - int num_rows, + int n, int *__restrict__ row_ind); __global__ void build_transpose_map(const int *__restrict__ A_row_ind, const int *__restrict__ A_col_ind, @@ -37,6 +37,14 @@ __global__ void build_transpose_map(const int *__restrict__ A_row_ind, const int *__restrict__ At_col_ind, int nnz, int *__restrict__ A_to_At); +__global__ void constraint_bound_linf_kernel(const double* __restrict__ lb, + const double* __restrict__ ub, + int n, + double* __restrict__ out); +__global__ void constraint_bound_l2_kernel(const double* __restrict__ lb, + const double* __restrict__ ub, + int n, + double* __restrict__ out); __global__ void fill_finite_bounds_kernel(const double *__restrict__ lower_bound, const double *__restrict__ upper_bound, double *__restrict__ lower_bound_finite_val, @@ -415,6 +423,7 @@ static pdhg_solver_state_t *initialize_solver_state( state->constraint_lower_bound_finite_val, state->constraint_upper_bound_finite_val, n_cons); + CUDA_CHECK(cudaGetLastError()); fill_finite_bounds_kernel<<num_blocks_primal, THREADS_PER_BLOCK>>>( state->variable_lower_bound, @@ -422,6 +431,7 @@ static pdhg_solver_state_t *initialize_solver_state( state->variable_lower_bound_finite_val, state->variable_upper_bound_finite_val, n_vars); + CUDA_CHECK(cudaGetLastError()); CUDA_CHECK(cudaFree(state->constraint_matrix->row_ind)); state->constraint_matrix->row_ind = NULL; @@ -430,56 +440,64 @@ static pdhg_solver_state_t *initialize_solver_state( CUDA_CHECK(cudaFree(state->constraint_matrix->transpose_map)); state->constraint_matrix->transpose_map = NULL; - double sum_of_squares = 0.0; - double max_val = 0.0; - double val = 0.0; - for (int i = 0; i < n_vars; ++i) + double obj_norm = 0.0; + if (params->optimality_norm == NORM_TYPE_L_INF) { - if (params->optimality_norm == NORM_TYPE_L_INF) { - val = fabs(working_problem->objective_vector[i]); - if (val > max_val) max_val = val; - } else { - sum_of_squares += working_problem->objective_vector[i] * working_problem->objective_vector[i]; - } - } - - if (params->optimality_norm == NORM_TYPE_L_INF) { - state->objective_vector_norm = max_val; + int max_idx = 0; + CUBLAS_CHECK(cublasIdamax(state->blas_handle, + n_vars, + state->objective_vector, 1, + &max_idx)); + double v = 0.0; + CUDA_CHECK(cudaMemcpy(&v, + state->objective_vector + (max_idx - 1), + sizeof(double), + cudaMemcpyDeviceToHost)); + obj_norm = fabs(v); } else { - state->objective_vector_norm = sqrt(sum_of_squares); + CUBLAS_CHECK(cublasDnrm2(state->blas_handle, + state->num_variables, + state->objective_vector, 1, + &obj_norm)); } + state->objective_vector_norm = obj_norm; - sum_of_squares = 0.0; - max_val = 0.0; - val = 0.0; - for (int i = 0; i < n_cons; ++i) + double* constraint_bound_contrib = NULL; + CUDA_CHECK(cudaMalloc(&constraint_bound_contrib, n_cons * sizeof(double))); + if (params->optimality_norm == NORM_TYPE_L_INF) { - double lower = working_problem->constraint_lower_bound[i]; - double upper = working_problem->constraint_upper_bound[i]; - - if (params->optimality_norm == NORM_TYPE_L_INF) { - if (isfinite(lower) && (lower != upper)) { - val = fabs(lower); - if (val > max_val) max_val = val; - } - if (isfinite(upper)) { - val = fabs(upper); - if (val > max_val) max_val = val; - } - } else { - if (isfinite(lower) && (lower != upper)) { - sum_of_squares += lower * lower; - } - if (isfinite(upper)) { - sum_of_squares += upper * upper; - } - 
} - } - if (params->optimality_norm == NORM_TYPE_L_INF) { - state->constraint_bound_norm = max_val; + constraint_bound_linf_kernel<<num_blocks_dual, THREADS_PER_BLOCK>>>( + state->constraint_lower_bound, + state->constraint_upper_bound, + n_cons, + constraint_bound_contrib); + CUDA_CHECK(cudaGetLastError()); + int max_idx = 0; + CUBLAS_CHECK(cublasIdamax(state->blas_handle, + n_cons, + constraint_bound_contrib, 1, + &max_idx)); + double v = 0.0; + CUDA_CHECK(cudaMemcpy(&v, + constraint_bound_contrib + (max_idx - 1), + sizeof(double), + cudaMemcpyDeviceToHost)); + state->constraint_bound_norm = fabs(v); } else { - state->constraint_bound_norm = sqrt(sum_of_squares); + constraint_bound_l2_kernel<<num_blocks_dual, THREADS_PER_BLOCK>>>( + state->constraint_lower_bound, + state->constraint_upper_bound, + n_cons, + constraint_bound_contrib); + CUDA_CHECK(cudaGetLastError()); + double sum_sq = 0.0; + CUBLAS_CHECK(cublasDasum(state->blas_handle, + n_cons, + constraint_bound_contrib, 1, + &sum_sq)); + state->constraint_bound_norm = sqrt(sum_sq); } + CUDA_CHECK(cudaFree(constraint_bound_contrib)); state->best_primal_dual_residual_gap = INFINITY; state->last_trial_fixed_point_error = INFINITY; @@ -542,11 +560,11 @@ static pdhg_solver_state_t *initialize_solver_state( } __global__ void build_row_ind(const int *__restrict__ row_ptr, - int num_rows, + int n, int *__restrict__ row_ind) { int i = blockIdx.x * blockDim.x + threadIdx.x; - if (i >= num_rows) + if (i >= n) return; int s = row_ptr[i]; @@ -592,6 +610,54 @@ __global__ void build_transpose_map( A_to_At[k] = pos; } +__global__ void constraint_bound_linf_kernel( + const double* __restrict__ lb, + const double* __restrict__ ub, + int n, + double* __restrict__ out) +{ + int i = blockIdx.x * blockDim.x + threadIdx.x; + if (i >= n) return; + + double lower = lb[i]; + double upper = ub[i]; + double v = 0.0; + + if (isfinite(lower) && (lower != upper)) { + v = fabs(lower); + } + if (isfinite(upper)) { + double u = fabs(upper); + if (u > v) v = u; + } + + out[i] = v; +} + +__global__ void constraint_bound_l2_kernel( + const double* __restrict__ lb, + const double* __restrict__ ub, + int n, + double* __restrict__ out) +{ + int i = blockIdx.x * blockDim.x + threadIdx.x; + if (i >= n) return; + + double lower = lb[i]; + double upper = ub[i]; + + double acc = 0.0; + + if (isfinite(lower) && (lower != upper)) { + acc += lower * lower; + } + if (isfinite(upper)) { + acc += upper * upper; + } + + out[i] = acc; +} + __global__ void fill_finite_bounds_kernel( const double *__restrict__ lb, const double *__restrict__ ub, From ea36c2aed6ba9def8aa88bfec26dfd7e38952a4d Mon Sep 17 00:00:00 2001 From: bolucastang Date: Tue, 13 Jan 2026 09:56:20 -0500 Subject: [PATCH 19/20] Revert "Bug fixed: norm computation on scaled bound" This reverts commit d372a2078f4c8d899b7ff33c157f405e4ed5f881. --- src/solver.cu | 158 +++++++++++++++----------------------------------- 1 file changed, 46 insertions(+), 112 deletions(-) diff --git a/src/solver.cu b/src/solver.cu index c488877..7080de7 100644 --- a/src/solver.cu +++ b/src/solver.cu @@ -29,7 +29,7 @@ limitations under the License. 
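[Editor's note — not part of the patch] This patch (19/20) mechanically reverts patch 18 and restores the host-side norm loops shown below. Both the GPU and CPU versions compute the same quantity; reading either implementation, the constraint-bound norm entering the relative residuals is

\[
\|b\|_2^2 = \sum_{i=1}^{m} \Big( \ell_i^2\,[\ell_i \text{ finite},\ \ell_i \neq u_i] + u_i^2\,[u_i \text{ finite}] \Big),
\qquad
\|b\|_\infty = \max_{1 \le i \le m} \max\big( |\ell_i|\,[\ell_i \text{ finite},\ \ell_i \neq u_i],\ |u_i|\,[u_i \text{ finite}] \big),
\]

where \(\ell_i, u_i\) are the constraint lower and upper bounds and the bracketed conditions are Iverson indicators: a finite lower bound is counted only when it differs from the upper bound, so equality rows are not counted twice.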
#include __global__ void build_row_ind(const int *__restrict__ row_ptr, - int n, + int num_rows, int *__restrict__ row_ind); __global__ void build_transpose_map(const int *__restrict__ A_row_ind, const int *__restrict__ A_col_ind, @@ -37,14 +37,6 @@ __global__ void build_transpose_map(const int *__restrict__ A_row_ind, const int *__restrict__ At_col_ind, int nnz, int *__restrict__ A_to_At); -__global__ void constraint_bound_linf_kernel(const double* __restrict__ lb, - const double* __restrict__ ub, - int n, - double* __restrict__ out); -__global__ void constraint_bound_l2_kernel(const double* __restrict__ lb, - const double* __restrict__ ub, - int n, - double* __restrict__ out); __global__ void fill_finite_bounds_kernel(const double *__restrict__ lower_bound, const double *__restrict__ upper_bound, double *__restrict__ lower_bound_finite_val, @@ -423,7 +415,6 @@ static pdhg_solver_state_t *initialize_solver_state( state->constraint_lower_bound_finite_val, state->constraint_upper_bound_finite_val, n_cons); - CUDA_CHECK(cudaGetLastError()); fill_finite_bounds_kernel<<num_blocks_primal, THREADS_PER_BLOCK>>>( state->variable_lower_bound, @@ -431,7 +422,6 @@ static pdhg_solver_state_t *initialize_solver_state( state->variable_lower_bound_finite_val, state->variable_upper_bound_finite_val, n_vars); - CUDA_CHECK(cudaGetLastError()); CUDA_CHECK(cudaFree(state->constraint_matrix->row_ind)); state->constraint_matrix->row_ind = NULL; @@ -440,64 +430,56 @@ static pdhg_solver_state_t *initialize_solver_state( CUDA_CHECK(cudaFree(state->constraint_matrix->transpose_map)); state->constraint_matrix->transpose_map = NULL; - double obj_norm = 0.0; - if (params->optimality_norm == NORM_TYPE_L_INF) + double sum_of_squares = 0.0; + double max_val = 0.0; + double val = 0.0; + for (int i = 0; i < n_vars; ++i) { - int max_idx = 0; - CUBLAS_CHECK(cublasIdamax(state->blas_handle, - n_vars, - state->objective_vector, 1, - &max_idx)); - double v = 0.0; - CUDA_CHECK(cudaMemcpy(&v, - state->objective_vector + (max_idx - 1), - sizeof(double), - cudaMemcpyDeviceToHost)); - obj_norm = fabs(v); + if (params->optimality_norm == NORM_TYPE_L_INF) { + val = fabs(working_problem->objective_vector[i]); + if (val > max_val) max_val = val; + } else { + sum_of_squares += working_problem->objective_vector[i] * working_problem->objective_vector[i]; + } + } + + if (params->optimality_norm == NORM_TYPE_L_INF) { + state->objective_vector_norm = max_val; } else { - CUBLAS_CHECK(cublasDnrm2(state->blas_handle, - state->num_variables, - state->objective_vector, 1, - &obj_norm)); + state->objective_vector_norm = sqrt(sum_of_squares); } - state->objective_vector_norm = obj_norm; - double* constraint_bound_contrib = NULL; - CUDA_CHECK(cudaMalloc(&constraint_bound_contrib, n_cons * sizeof(double))); - if (params->optimality_norm == NORM_TYPE_L_INF) + sum_of_squares = 0.0; + max_val = 0.0; + val = 0.0; + for (int i = 0; i < n_cons; ++i) { - constraint_bound_linf_kernel<<num_blocks_dual, THREADS_PER_BLOCK>>>( - state->constraint_lower_bound, - state->constraint_upper_bound, - n_cons, - constraint_bound_contrib); - CUDA_CHECK(cudaGetLastError()); - int max_idx = 0; - CUBLAS_CHECK(cublasIdamax(state->blas_handle, - n_cons, - constraint_bound_contrib, 1, - &max_idx)); - double v = 0.0; - CUDA_CHECK(cudaMemcpy(&v, - constraint_bound_contrib + (max_idx - 1), - sizeof(double), - cudaMemcpyDeviceToHost)); - state->constraint_bound_norm = fabs(v); + double lower = working_problem->constraint_lower_bound[i]; + double upper = 
working_problem->constraint_upper_bound[i]; + + if (params->optimality_norm == NORM_TYPE_L_INF) { + if (isfinite(lower) && (lower != upper)) { + val = fabs(lower); + if (val > max_val) max_val = val; + } + if (isfinite(upper)) { + val = fabs(upper); + if (val > max_val) max_val = val; + } + } else { + if (isfinite(lower) && (lower != upper)) { + sum_of_squares += lower * lower; + } + if (isfinite(upper)) { + sum_of_squares += upper * upper; + } + } + } + if (params->optimality_norm == NORM_TYPE_L_INF) { + state->constraint_bound_norm = max_val; } else { - constraint_bound_l2_kernel<<num_blocks_dual, THREADS_PER_BLOCK>>>( - state->constraint_lower_bound, - state->constraint_upper_bound, - n_cons, - constraint_bound_contrib); - CUDA_CHECK(cudaGetLastError()); - double sum_sq = 0.0; - CUBLAS_CHECK(cublasDasum(state->blas_handle, - n_cons, - constraint_bound_contrib, 1, - &sum_sq)); - state->constraint_bound_norm = sqrt(sum_sq); + state->constraint_bound_norm = sqrt(sum_of_squares); } - CUDA_CHECK(cudaFree(constraint_bound_contrib)); state->best_primal_dual_residual_gap = INFINITY; state->last_trial_fixed_point_error = INFINITY; @@ -560,11 +542,11 @@ static pdhg_solver_state_t *initialize_solver_state( } __global__ void build_row_ind(const int *__restrict__ row_ptr, - int n, + int num_rows, int *__restrict__ row_ind) { int i = blockIdx.x * blockDim.x + threadIdx.x; - if (i >= n) + if (i >= num_rows) return; int s = row_ptr[i]; @@ -610,54 +592,6 @@ __global__ void build_transpose_map( A_to_At[k] = pos; } -__global__ void constraint_bound_linf_kernel( - const double* __restrict__ lb, - const double* __restrict__ ub, - int n, - double* __restrict__ out) -{ - int i = blockIdx.x * blockDim.x + threadIdx.x; - if (i >= n) return; - - double lower = lb[i]; - double upper = ub[i]; - double v = 0.0; - - if (isfinite(lower) && (lower != upper)) { - v = fabs(lower); - } - if (isfinite(upper)) { - double u = fabs(upper); - if (u > v) v = u; - } - - out[i] = v; -} - -__global__ void constraint_bound_l2_kernel( - const double* __restrict__ lb, - const double* __restrict__ ub, - int n, - double* __restrict__ out) -{ - int i = blockIdx.x * blockDim.x + threadIdx.x; - if (i >= n) return; - - double lower = lb[i]; - double upper = ub[i]; - - double acc = 0.0; - - if (isfinite(lower) && (lower != upper)) { - acc += lower * lower; - } - if (isfinite(upper)) { - acc += upper * upper; - } - - out[i] = acc; -} - __global__ void fill_finite_bounds_kernel( const double *__restrict__ lb, const double *__restrict__ ub, From 466cb55e06e3fc245a2f2ea887a6589f309ab885 Mon Sep 17 00:00:00 2001 From: bolucastang Date: Tue, 13 Jan 2026 18:30:41 -0500 Subject: [PATCH 20/20] Bug fixed: unused sv params --- src/solver.cu | 15 +++++++++++---- 1 file changed, 11 insertions(+), 4 deletions(-) diff --git a/src/solver.cu b/src/solver.cu index 7080de7..ce31376 100644 --- a/src/solver.cu +++ b/src/solver.cu @@ -894,10 +894,17 @@ static void perform_restart(pdhg_solver_state_t *state, static void initialize_step_size_and_primal_weight(pdhg_solver_state_t *state, const pdhg_parameters_t *params) { - double max_sv = estimate_maximum_singular_value( - state->sparse_handle, state->blas_handle, state->constraint_matrix, - state->constraint_matrix_t, 5000, 1e-4); - state->step_size = 0.998 / max_sv; + if (state->constraint_matrix->num_nonzeros == 0) + { + state->step_size = 1.0; + } + else + { + double max_sv = estimate_maximum_singular_value( + state->sparse_handle, state->blas_handle, state->constraint_matrix, + state->constraint_matrix_t, 
params->sv_max_iter, params->sv_tol); + state->step_size = 0.998 / max_sv; + } if (params->bound_objective_rescaling) {
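/* Editor's note — not part of the patch: the 0.998 / max_sv rule above keeps
 * the PDHG iteration inside its convergence region. PDHG-type methods require
 * the product of the primal and dual step sizes times ||A||_2^2 to stay below
 * 1, so a base step of eta / ||A||_2 with eta = 0.998 leaves a small safety
 * margin; max_sv is the estimate of ||A||_2 returned by
 * estimate_maximum_singular_value, now driven by params->sv_max_iter and
 * params->sv_tol instead of hard-coded values. The new num_nonzeros == 0
 * guard skips the estimator on an empty matrix, where the estimate (and the
 * division) would be meaningless, and falls back to a unit step size. */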