ALM for VAWT Simulation Issues and GPU Optimization in FluidX3D

Hello everyone. I’ve spent the last few days trying to simulate a 2D hydrokinetic VAWT using the Actuator Line Model (ALM) and FluidX3D’s FORCE_FIELD, driven by my own aerodynamic coefficient data (XFOIL). The physics implementation is going well (L/D calculation, dynamic stall, etc.), but I’m hitting a huge bottleneck because I have to copy the entire fluid velocity field (lbm.u) from the GPU to the CPU at every step in order to perform interpolation and compute the force on the blade. I was wondering if there is any internal function or utility that would let me sample only the interpolated velocity right at the actuator-line location and keep most of the computation on the GPU. Also, looking ahead to moving this to 3D, keeping the same cell resolution across the whole domain will be unfeasible. Is there any supported or in-development way to implement an adaptive mesh so that high resolution can be concentrated only around the rotor? Any advice on how other users have approached integrating lookup tables into the FluidX3D kernel would be very helpful.

```cpp
#include "setup.hpp"
#include <fstream>
#include <vector>
#include <deque>
#include <stdexcept>
#include <cmath>
#include <iostream>
#include <iomanip>

using namespace std;

// ============================================================================
// ESTRUCTURAS
// ============================================================================
struct BladeState {
    float alpha_filtered = 0.0f;
    float alpha_prev = 0.0f;
    float Un_prev = 0.0f;
    deque<float> da_dt_buffer;
};

// ============================================================================
// INTERPOLACIÓN Y UTILIDADES
// ============================================================================
float3 get_interpolated_velocity_cpu(LBM& lbm, float3 pos) {
    int ix = clamp((int)floor(pos.x), 0, (int)lbm.get_Nx() - 2);
    int iy = clamp((int)floor(pos.y), 0, (int)lbm.get_Ny() - 2);

    float fx = pos.x - ix;
    float fy = pos.y - iy;

    auto get_vec = [&](int x, int y) {
        ulong n = lbm.index(x, y, 0);
        return float3(lbm.u.x[n], lbm.u.y[n], lbm.u.z[n]);
        };

    float3 u00 = get_vec(ix, iy);     float3 u01 = get_vec(ix, iy + 1);
    float3 u10 = get_vec(ix + 1, iy); float3 u11 = get_vec(ix + 1, iy + 1);

    return (u00 * (1.0f - fy) + u01 * fy) * (1.0f - fx) + (u10 * (1.0f - fy) + u11 * fy) * fx;
}

float3 sample_velocity_lineAverage(LBM& lbm, float3 center, float radius, float3 unit_x, float3 unit_y, int N_points = 80) {
    float3 v_avg(0.0f);
    int valid = 0;
    for (int i = 0; i < N_points; i++) {
        float theta = 2.0f * pi * i / N_points; 
        float3 sample_pos = center + radius * (cos(theta) * unit_x + sin(theta) * unit_y);

        if (sample_pos.x >= 0 && sample_pos.x < lbm.get_Nx() && sample_pos.y >= 0 && sample_pos.y < lbm.get_Ny()) {
            v_avg += get_interpolated_velocity_cpu(lbm, sample_pos);
            valid++;
        }
    }
    return valid > 0 ? v_avg / (float)valid : float3(0.0f);
}

void smear_force(LBM& lbm, const float3& pos, const float3& f_fluid, float epsilon_lu) {
    int range = (int)ceilf(3.0f * epsilon_lu);
    float sigma_sq = epsilon_lu * epsilon_lu;
    float gauss_norm = 1.0f / (2.0f * pi * sigma_sq); 
    int zz = 0; 

    int min_x = max(0, (int)floor(pos.x) - range);
    int max_x = min((int)lbm.get_Nx() - 1, (int)floor(pos.x) + range);
    int min_y = max(0, (int)floor(pos.y) - range);
    int max_y = min((int)lbm.get_Ny() - 1, (int)floor(pos.y) + range);

    for (int xx = min_x; xx <= max_x; ++xx) {
        for (int yy = min_y; yy <= max_y; ++yy) {
            float3 dr = float3(xx + 0.5f, yy + 0.5f, zz + 0.5f) - pos;
            float r2 = dot(dr, dr);

            if (r2 > 9.0f * sigma_sq) continue;

            float g = gauss_norm * expf(-r2 / (2.0f * sigma_sq));
            ulong n = lbm.index(xx, yy, zz);

            lbm.F.x[n] += f_fluid.x * g;
            lbm.F.y[n] += f_fluid.y * g;
        }
    }
}

// ============================================================================
// CARGA DE DATOS 
// ============================================================================
bool read_airfoil_data(const string& filename, vector<float>& alphas, vector<float>& Re_nums,
    vector<vector<float>>& cls, vector<vector<float>>& cds, vector<vector<float>>& cms) {
    ifstream file(filename);
    if (!file) return false;
    string line, token;
    if (!getline(file, line)) return false;
    stringstream ss(line);
    getline(ss, token, ',');
    while (getline(ss, token, ',')) {
        token.erase(token.find_last_not_of(" \n\r\t") + 1);
        if (token == "|" || token.empty()) break;
        alphas.push_back(stof(token));
    }
    while (getline(file, line)) {
        stringstream row(line);
        getline(row, token, ',');
        if (token.empty()) continue;
        try { Re_nums.push_back(stof(token)); }
        catch (...) { continue; }
        vector<float> cl_row, cd_row, cm_row;
        int section = 0;
        while (getline(row, token, ',')) {
            token.erase(token.find_last_not_of(" \n\r\t") + 1);
            if (token == "|") { section++; continue; }
            if (token.empty()) continue;
            float val = stof(token);
            if (section == 0) cl_row.push_back(val);
            else if (section == 1) cd_row.push_back(val);
            else if (section == 2) cm_row.push_back(val);
        }
        cls.push_back(cl_row); cds.push_back(cd_row);
        cms.push_back(cm_row.empty() ? vector<float>(cl_row.size(), 0.0f) : cm_row);
    }
    return !Re_nums.empty();
}

float interpolate_2d(const vector<float>& alphas, const vector<float>& Re_nums,
    const vector<vector<float>>& values, float alpha, float Re) {
    if (alphas.empty()) return 0.0f;
    alpha = clamp(alpha, alphas.front(), alphas.back()); 
    Re = clamp(Re, Re_nums.front(), Re_nums.back());

    auto ait = lower_bound(alphas.begin(), alphas.end(), alpha);
    size_t ai = min((ait == alphas.begin()) ? 0 : (size_t)(ait - alphas.begin() - 1), alphas.size() - 2);
    float afrac = (alpha - alphas[ai]) / (alphas[ai + 1] - alphas[ai]);

    auto rit = lower_bound(Re_nums.begin(), Re_nums.end(), Re);
    size_t ri = min((rit == Re_nums.begin()) ? 0 : (size_t)(rit - Re_nums.begin() - 1), Re_nums.size() - 2);
    float Re_frac = (logf(Re) - logf(Re_nums[ri])) / (logf(Re_nums[ri + 1]) - logf(Re_nums[ri]));

    float v0 = values[ri][ai] + afrac * (values[ri][ai + 1] - values[ri][ai]);
    float v1 = values[ri + 1][ai] + afrac * (values[ri + 1][ai + 1] - values[ri + 1][ai]);
    return v0 + Re_frac * (v1 - v0);
}

// ============================================================================
// MAIN SETUP
// ============================================================================
void main_setup() {
    cout << unitbuf; cerr << unitbuf;
    cout << "=== INICIALIZANDO PARAMETROS ===" << endl;

    // --- PARÁMETROS FÍSICOS ---
    const float D_si = 1.0f;
    const float H_si = 1.0f;
    const float chord_si = 0.14f;
    const float U_inf = 1.0f;
    const float rpm = 36.3f;
    const int N_blades = 3;

    const float chord_pivot_ratio = 0.5f;
    const float AC_ratio = 0.25f;
    const float d_AC_BSC_si = (chord_pivot_ratio - AC_ratio) * chord_si;

    const float si_nu = 1.0e-6f;
    const float si_rho = 998.0f;
    const float R_si = D_si / 2.0f;
    const float si_omega = 2.0f * pi * (rpm / 60.0f); 
    const float si_period = 2.0f * pi / si_omega;

    // --- MALLA ---
    const float domain_x_si = 35.0f * D_si;
    const float domain_y_si = 20.0f * D_si;
    const uint3 lbm_N = resolution(float3(domain_x_si, domain_y_si, 0.0f), 1500u);

    // --- LBM SETUP ---
    const float lbm_u = 0.1f;
    units.set_m_kg_s((float)lbm_N.x, lbm_u, 1.0f, domain_x_si, U_inf, si_rho);

    const float lbm_nu = units.nu(si_nu);
    const float lbm_omega = units.omega(si_omega);
    const float lbm_R = units.x(R_si);
    const float chord_lbm = units.x(chord_si);
    const ulong lbm_T = units.t(5.0f * si_period);
    const ulong WARMUP_STEPS = 7000;


    LBM lbm(lbm_N, 1u, 1u, 1u, lbm_nu);

    // --- CONDICIONES DE FRONTERA ---
    parallel_for(lbm.get_N(), [&](ulong n) {
        lbm.rho[n] = 1.0f;
        lbm.u.x[n] = lbm_u; lbm.u.y[n] = 0.0f; lbm.u.z[n] = 0.0f;

        uint x, y, z; lbm.coordinates(n, x, y, z);
        if (x == 0u || x == lbm.get_Nx() - 1u || y == 0u || y == lbm.get_Ny() - 1u) {
            lbm.flags[n] = TYPE_E;
        }
        });

    lbm.rho.write_to_device();
    lbm.u.write_to_device();
    lbm.flags.write_to_device();

    // --- CARGA DE DATOS ---
    vector<float> alphas, Re_nums;
    vector<vector<float>> cls, cds, cms;
    if (!read_airfoil_data("airfoil_data.csv", alphas, Re_nums, cls, cds, cms)) return;

    // --- ARCHIVOS DE SALIDA ---
    ofstream torque_file("torque_data.txt");
    torque_file << "# Time\tTorque_Smooth" << endl;

    ofstream blade_file("blade_debug.csv");
    blade_file << "Time_s,Theta_deg,Phi_deg,Alpha_deg,Cl_3d,Cd_3d,Cn,Ct,V_mag,Torque_inst" << endl;

    // --- VARIABLES SIM ---
    vector<BladeState> blades(N_blades);
    const float alpha_smoothing_factor = 0.08f;
    const float3 center = float3(units.x(10.0f * D_si), lbm.center().y, 0.5f);

    float theta = 0.0f;
    deque<float> torque_output_buffer;
    double torque_sum = 0.0;
    long torque_samples = 0;
    const float wind_power_si = 0.5f * si_rho * (D_si * H_si) * pow(U_inf, 3.0f);

    cout << "=== STARTING MAIN LOOP ===" << endl;

    lbm.run(0u);

    while (lbm.get_t() < lbm_T) {
        parallel_for(lbm.get_N(), [&](ulong n) { lbm.F.x[n] = 0.0f; lbm.F.y[n] = 0.0f; });

        lbm.u.read_from_device();
        float torque_lbm_instant = 0.0f;

        for (int b = 0; b < N_blades; ++b) {
            float phi = theta + 2.0f * pi * b / N_blades;
            float phi_norm = fmod(phi, 2.0f * pi);
            if (phi_norm < 0) phi_norm += 2.0f * pi;

            float3 unit_r = float3(cos(phi), sin(phi), 0.0f);
            float3 unit_t = float3(-sin(phi), cos(phi), 0.0f);
            float3 pos_AC = center + lbm_R * unit_r;
            float3 blade_vel = lbm_omega * lbm_R * unit_t;

            float3 u_fluid = sample_velocity_lineAverage(lbm, pos_AC, 0.25f * chord_lbm, unit_t, unit_r, 60);
            float3 rel_vel = u_fluid - blade_vel;
            float V_mag = length(rel_vel); 

            if (V_mag < 0.001f) continue;

            float V_chord = dot(rel_vel, unit_t); 
            float V_normal = dot(rel_vel, unit_r);
            float alpha_rad = atan2(V_normal, -V_chord);

            // Corrección curvatura flujo
            alpha_rad += (si_omega * chord_si) / (4.0f * units.si_u(V_mag));
            float alpha_deg = degrees(alpha_rad); 

            float& alpha_f = blades[b].alpha_filtered;
            if (lbm.get_t() < 10) alpha_f = alpha_deg;
            alpha_f = alpha_f * (1.0f - alpha_smoothing_factor) + alpha_deg * alpha_smoothing_factor;

            float dt_si = units.si_t(1);
            float da_dt = (alpha_f - blades[b].alpha_prev) / dt_si;
            blades[b].alpha_prev = alpha_f;

            blades[b].da_dt_buffer.push_back(da_dt);
            if (blades[b].da_dt_buffer.size() > 10) blades[b].da_dt_buffer.pop_front();
            float da_dt_avg = 0.0f;
            for (float v : blades[b].da_dt_buffer) da_dt_avg += v;
            da_dt_avg /= max(1.0f, (float)blades[b].da_dt_buffer.size());

            float Re_local = units.si_Re(chord_si, units.si_u(V_mag), si_nu);

            // --- DYNAMIC STALL ---
            float alpha_dyn_deg = alpha_f;
            float pitch_term = (chord_si * radians(da_dt_avg)) / (2.0f * units.si_u(V_mag));
            float sqrt_pitch = sqrtf(fabs(pitch_term));
            float ds_shift = ((alpha_f * da_dt_avg > 0) ? 1.0f : 0.0f) * 6.0f * sqrt_pitch;
            alpha_dyn_deg += (da_dt_avg > 0) ? -ds_shift : ds_shift;

            float lookup_a = fabs(alpha_dyn_deg);
            float cl = interpolate_2d(alphas, Re_nums, cls, lookup_a, Re_local);
            float cd = interpolate_2d(alphas, Re_nums, cds, lookup_a, Re_local);
            float cm = interpolate_2d(alphas, Re_nums, cms, lookup_a, Re_local);
            if (alpha_f < 0) { cl = -cl; cm = -cm; }

            // --- ADDED MASS ---
            float da_dt_rad = radians(da_dt_avg);
            float current_alpha_rad = radians(alpha_f);
            float Un_si = units.si_u(V_mag) * sin(current_alpha_rad);
            float dUn_dt = (Un_si - blades[b].Un_prev) / dt_si;
            blades[b].Un_prev = Un_si;

            float V_rel_sq = max(0.0001f, units.si_u(V_mag) * units.si_u(V_mag));
            float C_n_am = -(pi * chord_si * dUn_dt) / (8.0f * V_rel_sq);
            float C_c_am = (pi * chord_si * da_dt_rad * Un_si) / (8.0f * V_rel_sq);

            float sin_a = sin(current_alpha_rad);
            float cos_a = cos(current_alpha_rad);
            cl += C_n_am * cos_a + C_c_am * sin_a;
            cd += C_n_am * sin_a - C_c_am * cos_a;

            // --- 3D Correction ---
            float AR = H_si / chord_si;
            float factor_3d = 1.0f / (1.0f + 2.0f / (AR * 0.9f));
            float cl_3d = cl * factor_3d;
            float cd_3d = cd + (cl_3d * cl_3d) / (pi * AR * 0.9f);

            float dyn_p = 0.5f * V_mag * V_mag * chord_lbm;
            float3 e_d = normalize(rel_vel); 
            float3 e_l = cross(float3(0, 0, 1), e_d); 
            float3 f_blade = dyn_p * (cl_3d * e_l + cd_3d * e_d);

            // Torque usando producto cruz (componente Z)
            float torque_contribution = cross(pos_AC - center, f_blade).z;
            torque_lbm_instant += torque_contribution;

            smear_force(lbm, pos_AC, -f_blade, 0.25f * chord_lbm);

            if (b == 0 && lbm.get_t() >= WARMUP_STEPS && lbm.get_t() % 20 == 0) {
                float Cn = cl_3d * cos_a + cd_3d * sin_a;
                float Ct = cl_3d * sin_a - cd_3d * cos_a;
                blade_file << units.si_t(lbm.get_t()) << ","
                    << degrees(theta) << "," << degrees(phi_norm) << ","
                    << alpha_f << "," << cl_3d << "," << cd_3d << ","
                    << Cn << "," << Ct << "," << units.si_u(V_mag) << ","
                    << units.si_M(torque_contribution) * (H_si / units.si_x(lbm_N.z)) << endl;
            }
        }

        lbm.F.write_to_device();
        lbm.run(1u);

        float theta_old = theta;
        theta = fmod(theta + lbm_omega, 2.0f * pi);
        if (theta_old > pi && theta < pi) torque_samples++; 

        if (lbm.get_t() >= WARMUP_STEPS) {
            float si_torque = units.si_M(torque_lbm_instant) * (H_si / units.si_x(lbm_N.z));
            torque_output_buffer.push_back(si_torque);
            if (torque_output_buffer.size() > 15) torque_output_buffer.pop_front();

            float t_smooth = accumulate(torque_output_buffer.begin(), torque_output_buffer.end(), 0.0f) / torque_output_buffer.size();
            torque_sum += si_torque;
            torque_samples++;

            if (lbm.get_t() % 20 == 0) torque_file << units.si_t(lbm.get_t()) << "\t" << t_smooth << endl;
            if (lbm.get_t() % 100 == 0) {
                double cp = (torque_sum / max(1L, torque_samples)) * si_omega / wind_power_si;
                cout << "Step: " << lbm.get_t() << " | T: " << fixed << setprecision(2) << t_smooth << " | CP: " << setprecision(3) << cp << endl;
            }
        }
        else if (lbm.get_t() % 500 == 0) {
            cout << "WARMUP: " << (100.0f * lbm.get_t() / WARMUP_STEPS) << "%" << endl;
        }
    }

    torque_file.close();
    blade_file.close();
}
```

<img width="776" height="476" alt="Image" src="https://github.com/user-attachments/assets/cef722ce-13bc-4a63-af38-aaeafda3118d" />

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

Uh oh!

ALM for VAWT Simulation Issues and GPU Optimization in FluidX3D #320

Metadata

Assignees

Labels

Projects

Milestone

Relationships

Development

ALM for VAWT Simulation Issues and GPU Optimization in FluidX3D #320

Description

Metadata

Metadata

Assignees

Labels

Projects

Milestone

Relationships

Development

Issue actions