From 7512416ff4f610338ae3bbc08eebceb6f4386a60 Mon Sep 17 00:00:00 2001
From: Audrey-777 <3043005175chy@gmail.com>
Date: Sat, 30 May 2026 21:04:22 +0800
Subject: [PATCH 01/12] refactor: prepare MD runner and ML buffers

---
 source/source_esolver/esolver_dp.cpp  | 46 +++++++++++++-----------
 source/source_esolver/esolver_dp.h    |  4 +++
 source/source_esolver/esolver_nep.cpp | 39 +++++++++++----------
 source/source_esolver/esolver_nep.h   |  5 ++-
 source/source_md/run_md.cpp           | 50 +++++++++++++++------------
 5 files changed, 82 insertions(+), 62 deletions(-)
diff --git a/source/source_esolver/esolver_dp.cpp b/source/source_esolver/esolver_dp.cpp
index 879193e668b..7eb2d9e1fab 100644
--- a/source/source_esolver/esolver_dp.cpp
+++ b/source/source_esolver/esolver_dp.cpp
@@ -36,6 +36,10 @@ void ESolver_DP::before_all_runners(UnitCell& ucell, const Input_para& inp)
     dp_potential = 0;
     dp_force.create(ucell.nat, 3);
     dp_virial.create(3, 3);
+    dp_cell.resize(9);
+    dp_coord.resize(3 * ucell.nat);
+    dp_model_force.clear();
+    dp_model_virial.clear();
 
     ModuleIO::CifParser::write(PARAM.globalv.global_out_dir + "STRU.cif", 
                                ucell, 
@@ -59,38 +63,38 @@ void ESolver_DP::runner(UnitCell& ucell, const int istep)
     ModuleBase::TITLE("ESolver_DP", "runner");
     ModuleBase::timer::start("ESolver_DP", "runner");
 
-    std::vector<double> cell(9, 0.0);
-    cell[0] = ucell.latvec.e11 * ucell.lat0_angstrom;
-    cell[1] = ucell.latvec.e12 * ucell.lat0_angstrom;
-    cell[2] = ucell.latvec.e13 * ucell.lat0_angstrom;
-    cell[3] = ucell.latvec.e21 * ucell.lat0_angstrom;
-    cell[4] = ucell.latvec.e22 * ucell.lat0_angstrom;
-    cell[5] = ucell.latvec.e23 * ucell.lat0_angstrom;
-    cell[6] = ucell.latvec.e31 * ucell.lat0_angstrom;
-    cell[7] = ucell.latvec.e32 * ucell.lat0_angstrom;
-    cell[8] = ucell.latvec.e33 * ucell.lat0_angstrom;
-
-    std::vector<double> coord(3 * ucell.nat, 0.0);
+    dp_cell[0] = ucell.latvec.e11 * ucell.lat0_angstrom;
+    dp_cell[1] = ucell.latvec.e12 * ucell.lat0_angstrom;
+    dp_cell[2] = ucell.latvec.e13 * ucell.lat0_angstrom;
+    dp_cell[3] = ucell.latvec.e21 * ucell.lat0_angstrom;
+    dp_cell[4] = ucell.latvec.e22 * ucell.lat0_angstrom;
+    dp_cell[5] = ucell.latvec.e23 * ucell.lat0_angstrom;
+    dp_cell[6] = ucell.latvec.e31 * ucell.lat0_angstrom;
+    dp_cell[7] = ucell.latvec.e32 * ucell.lat0_angstrom;
+    dp_cell[8] = ucell.latvec.e33 * ucell.lat0_angstrom;
+
+    dp_coord.resize(3 * ucell.nat);
     int iat = 0;
     for (int it = 0; it < ucell.ntype; ++it)
     {
         for (int ia = 0; ia < ucell.atoms[it].na; ++ia)
         {
-            coord[3 * iat] = ucell.atoms[it].tau[ia].x * ucell.lat0_angstrom;
-            coord[3 * iat + 1] = ucell.atoms[it].tau[ia].y * ucell.lat0_angstrom;
-            coord[3 * iat + 2] = ucell.atoms[it].tau[ia].z * ucell.lat0_angstrom;
+            dp_coord[3 * iat] = ucell.atoms[it].tau[ia].x * ucell.lat0_angstrom;
+            dp_coord[3 * iat + 1] = ucell.atoms[it].tau[ia].y * ucell.lat0_angstrom;
+            dp_coord[3 * iat + 2] = ucell.atoms[it].tau[ia].z * ucell.lat0_angstrom;
             iat++;
         }
     }
     assert(ucell.nat == iat);
 
 #ifdef __DPMD
-    std::vector<double> f, v;
     dp_potential = 0;
     dp_force.zero_out();
     dp_virial.zero_out();
+    dp_model_force.clear();
+    dp_model_virial.clear();
 
-    dp.compute(dp_potential, f, v, coord, atype, cell, fparam, aparam);
+    dp.compute(dp_potential, dp_model_force, dp_model_virial, dp_coord, atype, dp_cell, fparam, aparam);
 
     // rescale the energy, force, and stress
     const double fact_e = rescaling / ModuleBase::Ry_to_eV;
@@ -103,16 +107,16 @@ void ESolver_DP::runner(UnitCell& ucell, const int istep)
 
     for (int i = 0; i < ucell.nat; ++i)
     {
-        dp_force(i, 0) = f[3 * i] * fact_f;
-        dp_force(i, 1) = f[3 * i + 1] * fact_f;
-        dp_force(i, 2) = f[3 * i + 2] * fact_f;
+        dp_force(i, 0) = dp_model_force[3 * i] * fact_f;
+        dp_force(i, 1) = dp_model_force[3 * i + 1] * fact_f;
+        dp_force(i, 2) = dp_model_force[3 * i + 2] * fact_f;
     }
 
     for (int i = 0; i < 3; ++i)
     {
         for (int j = 0; j < 3; ++j)
         {
-            dp_virial(i, j) = v[3 * i + j] * fact_v;
+            dp_virial(i, j) = dp_model_virial[3 * i + j] * fact_v;
         }
     }
 #else
diff --git a/source/source_esolver/esolver_dp.h b/source/source_esolver/esolver_dp.h
index 405bae44461..52ddd869911 100644
--- a/source/source_esolver/esolver_dp.h
+++ b/source/source_esolver/esolver_dp.h
@@ -115,6 +115,10 @@ class ESolver_DP : public ESolver
     double dp_potential = 0.0;       ///< computed potential energy
     ModuleBase::matrix dp_force;     ///< computed atomic forces
     ModuleBase::matrix dp_virial;    ///< computed lattice virials
+    std::vector<double> dp_cell;     ///< DP cell buffer in Angstrom
+    std::vector<double> dp_coord;    ///< DP coordinate buffer in Angstrom
+    std::vector<double> dp_model_force;  ///< raw force buffer returned by DP
+    std::vector<double> dp_model_virial; ///< raw virial buffer returned by DP
 };
 
 } // namespace ModuleESolver
diff --git a/source/source_esolver/esolver_nep.cpp b/source/source_esolver/esolver_nep.cpp
index 8944776aaa6..757fb6bd8fc 100644
--- a/source/source_esolver/esolver_nep.cpp
+++ b/source/source_esolver/esolver_nep.cpp
@@ -23,6 +23,7 @@
 #include "source_io/module_output/output_log.h"
 #include "source_io/module_output/cif_io.h"
 
+#include <algorithm>
 #include <numeric>
 #include <unordered_map>
 
@@ -34,6 +35,9 @@ void ESolver_NEP::before_all_runners(UnitCell& ucell, const Input_para& inp)
     nep_force.create(ucell.nat, 3);
     nep_virial.create(3, 3);
     atype.resize(ucell.nat);
+    nep_cell.resize(9);
+    nep_coord.resize(3 * ucell.nat);
+    nep_virial_sum.resize(9);
     _e.resize(ucell.nat);
     _f.resize(3 * ucell.nat);
     _v.resize(9 * ucell.nat);
@@ -56,28 +60,27 @@ void ESolver_NEP::runner(UnitCell& ucell, const int istep)
 
     // note that NEP are column major, thus a transpose is needed
     // cell
-    std::vector<double> cell(9, 0.0);
-    cell[0] = ucell.latvec.e11 * ucell.lat0_angstrom;
-    cell[1] = ucell.latvec.e21 * ucell.lat0_angstrom;
-    cell[2] = ucell.latvec.e31 * ucell.lat0_angstrom;
-    cell[3] = ucell.latvec.e12 * ucell.lat0_angstrom;
-    cell[4] = ucell.latvec.e22 * ucell.lat0_angstrom;
-    cell[5] = ucell.latvec.e32 * ucell.lat0_angstrom;
-    cell[6] = ucell.latvec.e13 * ucell.lat0_angstrom;
-    cell[7] = ucell.latvec.e23 * ucell.lat0_angstrom;
-    cell[8] = ucell.latvec.e33 * ucell.lat0_angstrom;
+    nep_cell[0] = ucell.latvec.e11 * ucell.lat0_angstrom;
+    nep_cell[1] = ucell.latvec.e21 * ucell.lat0_angstrom;
+    nep_cell[2] = ucell.latvec.e31 * ucell.lat0_angstrom;
+    nep_cell[3] = ucell.latvec.e12 * ucell.lat0_angstrom;
+    nep_cell[4] = ucell.latvec.e22 * ucell.lat0_angstrom;
+    nep_cell[5] = ucell.latvec.e32 * ucell.lat0_angstrom;
+    nep_cell[6] = ucell.latvec.e13 * ucell.lat0_angstrom;
+    nep_cell[7] = ucell.latvec.e23 * ucell.lat0_angstrom;
+    nep_cell[8] = ucell.latvec.e33 * ucell.lat0_angstrom;
 
     // coord
-    std::vector<double> coord(3 * ucell.nat, 0.0);
+    nep_coord.resize(3 * ucell.nat);
     int iat = 0;
     const int nat = ucell.nat;
     for (int it = 0; it < ucell.ntype; ++it)
     {
         for (int ia = 0; ia < ucell.atoms[it].na; ++ia)
         {
-            coord[iat] = ucell.atoms[it].tau[ia].x * ucell.lat0_angstrom;
-            coord[iat + nat] = ucell.atoms[it].tau[ia].y * ucell.lat0_angstrom;
-            coord[iat + 2 * nat] = ucell.atoms[it].tau[ia].z * ucell.lat0_angstrom;
+            nep_coord[iat] = ucell.atoms[it].tau[ia].x * ucell.lat0_angstrom;
+            nep_coord[iat + nat] = ucell.atoms[it].tau[ia].y * ucell.lat0_angstrom;
+            nep_coord[iat + 2 * nat] = ucell.atoms[it].tau[ia].z * ucell.lat0_angstrom;
             iat++;
         }
     }
@@ -88,7 +91,7 @@ void ESolver_NEP::runner(UnitCell& ucell, const int istep)
     nep_force.zero_out();
     nep_virial.zero_out();
 
-    nep.compute(atype, cell, coord, _e, _f, _v);
+    nep.compute(atype, nep_cell, nep_coord, _e, _f, _v);
 
     // unit conversion
     const double fact_e = 1.0 / ModuleBase::Ry_to_eV;
@@ -110,13 +113,13 @@ void ESolver_NEP::runner(UnitCell& ucell, const int istep)
     }
 
     // virial
-    std::vector<double> v_sum(9, 0.0);
+    std::fill(nep_virial_sum.begin(), nep_virial_sum.end(), 0.0);
     for (int j = 0; j < 9; ++j)
     {
         for (int i = 0; i < nat; ++i)
         {
             int index = j * nat + i;
-            v_sum[j] += _v[index];
+            nep_virial_sum[j] += _v[index];
         }
     }
 
@@ -125,7 +128,7 @@ void ESolver_NEP::runner(UnitCell& ucell, const int istep)
     {
         for (int j = 0; j < 3; ++j)
         {
-            nep_virial(i, j) = v_sum[3 * i + j] * fact_v;
+            nep_virial(i, j) = nep_virial_sum[3 * i + j] * fact_v;
         }
     }
 #else
diff --git a/source/source_esolver/esolver_nep.h b/source/source_esolver/esolver_nep.h
index dfec17a83c2..24d14b2d3ee 100644
--- a/source/source_esolver/esolver_nep.h
+++ b/source/source_esolver/esolver_nep.h
@@ -98,6 +98,9 @@ class ESolver_NEP : public ESolver
     double nep_potential;                ///< computed potential energy
     ModuleBase::matrix nep_force;        ///< computed atomic forces
     ModuleBase::matrix nep_virial;       ///< computed lattice virials
+    std::vector<double> nep_cell;        ///< NEP cell buffer in Angstrom, column-major
+    std::vector<double> nep_coord;       ///< NEP coordinate buffer in Angstrom, column-major
+    std::vector<double> nep_virial_sum;  ///< summed per-atom virial components
     std::vector<double> _e;              ///< temporary storage for energy computation
     std::vector<double> _f;              ///< temporary storage for force computation
     std::vector<double> _v;              ///< temporary storage for virial computation
@@ -105,4 +108,4 @@ class ESolver_NEP : public ESolver
 
 } // namespace ModuleESolver
 
-#endif
\ No newline at end of file
+#endif
diff --git a/source/source_md/run_md.cpp b/source/source_md/run_md.cpp
index ef28e5c8975..b7aab6511a0 100644
--- a/source/source_md/run_md.cpp
+++ b/source/source_md/run_md.cpp
@@ -12,41 +12,48 @@
 #include "verlet.h"
 #include "source_cell/update_cell.h"
 #include "source_cell/print_cell.h"
-namespace Run_MD
-{
+#include <memory>
 
-void md_line(UnitCell& unit_in, ModuleESolver::ESolver* p_esolver, const Parameter& param_in)
+namespace
+{
+std::unique_ptr<MD_base> create_md_runner(const Parameter& param_in, UnitCell& unit_in)
 {
-    ModuleBase::TITLE("Run_MD", "md_line");
-    ModuleBase::timer::start("Run_MD", "md_line");
-
-    /// determine the md_type
-    MD_base* mdrun = nullptr;
     if (param_in.mdp.md_type == "fire")
     {
-        mdrun = new FIRE(param_in, unit_in);
+        return std::unique_ptr<MD_base>(new FIRE(param_in, unit_in));
     }
-    else if ((param_in.mdp.md_type == "nvt" && param_in.mdp.md_thermostat == "nhc") || param_in.mdp.md_type == "npt")
+    if ((param_in.mdp.md_type == "nvt" && param_in.mdp.md_thermostat == "nhc") || param_in.mdp.md_type == "npt")
     {
-        mdrun = new Nose_Hoover(param_in, unit_in);
+        return std::unique_ptr<MD_base>(new Nose_Hoover(param_in, unit_in));
     }
-    else if (param_in.mdp.md_type == "nve" || param_in.mdp.md_type == "nvt")
+    if (param_in.mdp.md_type == "nve" || param_in.mdp.md_type == "nvt")
     {
-        mdrun = new Verlet(param_in, unit_in);
+        return std::unique_ptr<MD_base>(new Verlet(param_in, unit_in));
     }
-    else if (param_in.mdp.md_type == "langevin")
+    if (param_in.mdp.md_type == "langevin")
     {
-        mdrun = new Langevin(param_in, unit_in);
+        return std::unique_ptr<MD_base>(new Langevin(param_in, unit_in));
     }
-    else if (param_in.mdp.md_type == "msst")
+    if (param_in.mdp.md_type == "msst")
     {
-        mdrun = new MSST(param_in, unit_in);
-    }
-    else
-    {
-        ModuleBase::WARNING_QUIT("md_line", "no such md_type!");
+        return std::unique_ptr<MD_base>(new MSST(param_in, unit_in));
     }
 
+    ModuleBase::WARNING_QUIT("md_line", "no such md_type!");
+    return nullptr;
+}
+} // namespace
+
+namespace Run_MD
+{
+
+void md_line(UnitCell& unit_in, ModuleESolver::ESolver* p_esolver, const Parameter& param_in)
+{
+    ModuleBase::TITLE("Run_MD", "md_line");
+    ModuleBase::timer::start("Run_MD", "md_line");
+
+    std::unique_ptr<MD_base> mdrun = create_md_runner(param_in, unit_in);
+
     /// md cycle, mohan update 2026-01-04, change '<=' to '<'
     while ((mdrun->step_ + mdrun->step_rst_) < param_in.mdp.md_nstep && !mdrun->stop)
     {
@@ -129,7 +136,6 @@ void md_line(UnitCell& unit_in, ModuleESolver::ESolver* p_esolver, const Paramet
         mdrun->step_++;
     }
 
-    delete mdrun;
     ModuleBase::timer::end("Run_MD", "md_line");
     return;
 }

From 1a608fd06072900739319e4121d1b75c71a67506 Mon Sep 17 00:00:00 2001
From: Audrey-777 <3043005175chy@gmail.com>
Date: Sat, 30 May 2026 21:04:47 +0800
Subject: [PATCH 02/12] refactor: extract MD statistics state helpers

---
 source/source_md/md_func.cpp     | 54 ++++++++++++++++++++++++--------
 source/source_md/md_func.h       | 18 +++++++++++
 source/source_md/md_statistics.h | 23 ++++++++++++++
 3 files changed, 82 insertions(+), 13 deletions(-)
 create mode 100644 source/source_md/md_statistics.h

diff --git a/source/source_md/md_func.cpp b/source/source_md/md_func.cpp
index 6bd1b60dd59..5089b1d8d64 100644
--- a/source/source_md/md_func.cpp
+++ b/source/source_md/md_func.cpp
@@ -52,6 +52,43 @@ double kinetic_energy(const int& natom, const ModuleBase::Vector3<double>* vel,
     return ke;
 }
 
+MDKineticState calc_kinetic_state(const int& natom,
+                                  const int& frozen_freedom,
+                                  const double* allmass,
+                                  const ModuleBase::Vector3<double>* vel)
+{
+    MDKineticState state;
+    if (3 * natom == frozen_freedom)
+    {
+        return state;
+    }
+
+    state.kinetic = kinetic_energy(natom, vel, allmass);
+    state.temperature = 2 * state.kinetic / (3 * natom - frozen_freedom);
+    return state;
+}
+
+MDStressState calc_stress_state(const int& natom,
+                                const double& omega,
+                                const ModuleBase::Vector3<double>* vel,
+                                const double* allmass,
+                                const ModuleBase::matrix& virial)
+{
+    MDStressState state;
+    temp_vector(natom, vel, allmass, state.temperature_tensor);
+    state.stress.create(3, 3);
+
+    for (int i = 0; i < 3; ++i)
+    {
+        for (int j = 0; j < 3; ++j)
+        {
+            state.stress(i, j) = virial(i, j) + state.temperature_tensor(i, j) / omega;
+        }
+    }
+
+    return state;
+}
+
 void compute_stress(const UnitCell& unit_in,
                     const ModuleBase::Vector3<double>* vel,
                     const double* allmass,
@@ -61,17 +98,7 @@ void compute_stress(const UnitCell& unit_in,
 {
     if (cal_stress)
     {
-        ModuleBase::matrix t_vector;
-
-        temp_vector(unit_in.nat, vel, allmass, t_vector);
-
-        for (int i = 0; i < 3; ++i)
-        {
-            for (int j = 0; j < 3; ++j)
-            {
-                stress(i, j) = virial(i, j) + t_vector(i, j) / unit_in.omega;
-            }
-        }
+        stress = calc_stress_state(unit_in.nat, unit_in.omega, vel, allmass, virial).stress;
     }
 
     return;
@@ -463,8 +490,9 @@ double current_temp(double& kinetic,
     }
     else
     {
-        kinetic = kinetic_energy(natom, vel, allmass);
-        return 2 * kinetic / (3 * natom - frozen_freedom);
+        const MDKineticState state = calc_kinetic_state(natom, frozen_freedom, allmass, vel);
+        kinetic = state.kinetic;
+        return state.temperature;
     }
 }
 
diff --git a/source/source_md/md_func.h b/source/source_md/md_func.h
index be433ffe4ac..51c4eb47d83 100644
--- a/source/source_md/md_func.h
+++ b/source/source_md/md_func.h
@@ -1,6 +1,7 @@
 #ifndef MD_FUNC_H
 #define MD_FUNC_H
 
+#include "md_statistics.h"
 #include "source_esolver/esolver.h"
 
 class Parameter;
@@ -117,6 +118,14 @@ void force_virial(ModuleESolver::ESolver* p_esolver,
  */
 double kinetic_energy(const int& natom, const ModuleBase::Vector3<double>* vel, const double* allmass);
 
+/**
+ * @brief calculate kinetic energy and temperature without writing caller-owned state
+ */
+MDKineticState calc_kinetic_state(const int& natom,
+                                  const int& frozen_freedom,
+                                  const double* allmass,
+                                  const ModuleBase::Vector3<double>* vel);
+
 /**
  * @brief calculate the total stress tensor
  *
@@ -134,6 +143,15 @@ void compute_stress(const UnitCell& unit_in,
                     const ModuleBase::matrix& virial,
                     ModuleBase::matrix& stress);
 
+/**
+ * @brief calculate stress and ionic temperature tensor without writing caller-owned state
+ */
+MDStressState calc_stress_state(const int& natom,
+                                const double& omega,
+                                const ModuleBase::Vector3<double>* vel,
+                                const double* allmass,
+                                const ModuleBase::matrix& virial);
+
 /**
  * @brief output the stress information
  *
diff --git a/source/source_md/md_statistics.h b/source/source_md/md_statistics.h
new file mode 100644
index 00000000000..e7bef175be5
--- /dev/null
+++ b/source/source_md/md_statistics.h
@@ -0,0 +1,23 @@
+#ifndef MD_STATISTICS_H
+#define MD_STATISTICS_H
+
+#include "source_base/matrix.h"
+
+namespace MD_func
+{
+
+struct MDKineticState
+{
+    double kinetic = 0.0;
+    double temperature = 0.0;
+};
+
+struct MDStressState
+{
+    ModuleBase::matrix stress;
+    ModuleBase::matrix temperature_tensor;
+};
+
+} // namespace MD_func
+
+#endif // MD_STATISTICS_H

From a0995059e7cb6b2eb1f849692e4c51d5fc2885bf Mon Sep 17 00:00:00 2001
From: Audrey-777 <3043005175chy@gmail.com>
Date: Sat, 30 May 2026 21:05:04 +0800
Subject: [PATCH 03/12] refactor: introduce MD test fixtures

---
 source/source_md/test/CMakeLists.txt    |   1 -
 source/source_md/test/fire_test.cpp     |  30 +------
 source/source_md/test/langevin_test.cpp |  28 +-----
 source/source_md/test/lj_pot_test.cpp   |  47 +++-------
 source/source_md/test/md_func_test.cpp  |  42 +--------
 source/source_md/test/md_test_fixture.h | 110 ++++++++++++++++++++++++
 source/source_md/test/msst_test.cpp     |  30 +------
 source/source_md/test/nhchain_test.cpp  |  31 ++-----
 source/source_md/test/verlet_test.cpp   |  27 +-----
 9 files changed, 142 insertions(+), 204 deletions(-)
 create mode 100644 source/source_md/test/md_test_fixture.h

diff --git a/source/source_md/test/CMakeLists.txt b/source/source_md/test/CMakeLists.txt
index c4e3a1c2d2f..682bd682b49 100644
--- a/source/source_md/test/CMakeLists.txt
+++ b/source/source_md/test/CMakeLists.txt
@@ -37,7 +37,6 @@ list(APPEND depend_files
   ../../source_base/realarray.cpp
   ../../source_base/complexarray.cpp
   ../../source_base/complexmatrix.cpp
-  ../../source_base/global_variable.cpp
   ../../source_base/libm/branred.cpp
   ../../source_base/libm/sincos.cpp
   ../../source_base/math_integral.cpp
diff --git a/source/source_md/test/fire_test.cpp b/source/source_md/test/fire_test.cpp
index 3b294da46ac..e52ae4cbecd 100644
--- a/source/source_md/test/fire_test.cpp
+++ b/source/source_md/test/fire_test.cpp
@@ -5,9 +5,8 @@
 #undef private
 #define private public
 #define protected public
-#include "source_esolver/esolver_lj.h"
 #include "source_md/fire.h"
-#include "setcell.h"
+#include "md_test_fixture.h"
 #define doublethreshold 1e-12
 
 /************************************************
@@ -35,31 +34,8 @@
  *     - output MD information such as energy, temperature, and pressure
  */
 
-class FIREtest : public testing::Test
+class FIREtest : public MdIntegratorFixture<FIRE>
 {
-  protected:
-    MD_base* mdrun;
-    UnitCell ucell;
-    Parameter param_in;
-    ModuleESolver::ESolver* p_esolver;
-
-    void SetUp()
-    {
-        Setcell::setupcell(ucell);
-        Setcell::parameters(param_in.input);
-
-        p_esolver = new ModuleESolver::ESolver_LJ();
-        p_esolver->before_all_runners(ucell, param_in.inp);
-
-        mdrun = new FIRE(param_in, ucell);
-        mdrun->setup(p_esolver, PARAM.sys.global_readin_dir);
-    }
-
-    void TearDown()
-    {
-        delete mdrun;
-        delete p_esolver;
-    }
 };
 
 TEST_F(FIREtest, Setup)
@@ -167,7 +143,7 @@ TEST_F(FIREtest, Restart)
     mdrun->restart(PARAM.sys.global_readin_dir);
     remove("Restart_md.txt");
 
-    FIRE* fire = dynamic_cast<FIRE*>(mdrun);
+    FIRE* fire = dynamic_cast<FIRE*>(mdrun.get());
     EXPECT_EQ(mdrun->step_rst_, 3);
     EXPECT_EQ(fire->alpha, 0.1);
     EXPECT_EQ(fire->negative_count, 0);
diff --git a/source/source_md/test/langevin_test.cpp b/source/source_md/test/langevin_test.cpp
index 69df605b153..65d462a86da 100644
--- a/source/source_md/test/langevin_test.cpp
+++ b/source/source_md/test/langevin_test.cpp
@@ -5,9 +5,8 @@
 #undef private
 #define private public
 #define protected public
-#include "source_esolver/esolver_lj.h"
 #include "source_md/langevin.h"
-#include "setcell.h"
+#include "md_test_fixture.h"
 #define doublethreshold 1e-12
 
 /************************************************
@@ -35,31 +34,8 @@
  *     - output MD information such as energy, temperature, and pressure
  */
 
-class Langevin_test : public testing::Test
+class Langevin_test : public MdIntegratorFixture<Langevin>
 {
-  protected:
-    MD_base* mdrun;
-    UnitCell ucell;
-    Parameter param_in;
-    ModuleESolver::ESolver* p_esolver;
-
-    void SetUp()
-    {
-        Setcell::setupcell(ucell);
-        Setcell::parameters(param_in.input);
-
-        p_esolver = new ModuleESolver::ESolver_LJ();
-        p_esolver->before_all_runners(ucell, param_in.inp);
-
-        mdrun = new Langevin(param_in, ucell);
-        mdrun->setup(p_esolver, PARAM.sys.global_readin_dir);
-    }
-
-    void TearDown()
-    {
-        delete mdrun;
-        delete p_esolver;
-    }
 };
 
 TEST_F(Langevin_test, setup)
diff --git a/source/source_md/test/lj_pot_test.cpp b/source/source_md/test/lj_pot_test.cpp
index 64c0b52fe6b..99cec432b56 100644
--- a/source/source_md/test/lj_pot_test.cpp
+++ b/source/source_md/test/lj_pot_test.cpp
@@ -1,9 +1,9 @@
 #include "gtest/gtest.h"
 #define private public
 #include "source_io/module_parameter/parameter.h"
+#include "md_test_fixture.h"
 #include "source_esolver/esolver_lj.h"
 #include "source_md/md_func.h"
-#include "setcell.h"
 #undef private
 #define doublethreshold 1e-12
 
@@ -17,46 +17,23 @@
  *     - calculate energy, force, virial for lj pot
  */
 
-class LJ_pot_test : public testing::Test
+class LJ_pot_test : public LjPotTestFixture
 {
-  protected:
-    ModuleBase::Vector3<double>* force;
-    ModuleBase::matrix stress;
-    double potential;
-    int natom;
-    UnitCell ucell;
-    Input_para input;
-
-    void SetUp()
-    {
-        Setcell::setupcell(ucell);
-
-        natom = ucell.nat;
-        force = new ModuleBase::Vector3<double>[natom];
-        stress.create(3, 3);
-
-        Setcell::parameters(input);
-    }
-
-    void TearDown()
-    {
-        delete[] force;
-    }
 };
 
 TEST_F(LJ_pot_test, potential)
 {
-    ModuleESolver::ESolver* p_esolver = new ModuleESolver::ESolver_LJ();
+    std::unique_ptr<ModuleESolver::ESolver> p_esolver(new ModuleESolver::ESolver_LJ());
     p_esolver->before_all_runners(ucell, input);
-    MD_func::force_virial(p_esolver, 0, ucell, potential, force, true, stress);
+    MD_func::force_virial(p_esolver.get(), 0, ucell, potential, force, true, stress);
     EXPECT_NEAR(potential, -0.011957818623534381, doublethreshold);
 }
 
 TEST_F(LJ_pot_test, force)
 {
-    ModuleESolver::ESolver* p_esolver = new ModuleESolver::ESolver_LJ();
+    std::unique_ptr<ModuleESolver::ESolver> p_esolver(new ModuleESolver::ESolver_LJ());
     p_esolver->before_all_runners(ucell, input);
-    MD_func::force_virial(p_esolver, 0, ucell, potential, force, true, stress);
+    MD_func::force_virial(p_esolver.get(), 0, ucell, potential, force, true, stress);
     EXPECT_NEAR(force[0].x, 0.00049817733089377704, doublethreshold);
     EXPECT_NEAR(force[0].y, 0.00082237246837022328, doublethreshold);
     EXPECT_NEAR(force[0].z, -3.0493186101154812e-20, doublethreshold);
@@ -73,9 +50,9 @@ TEST_F(LJ_pot_test, force)
 
 TEST_F(LJ_pot_test, stress)
 {
-    ModuleESolver::ESolver* p_esolver = new ModuleESolver::ESolver_LJ();
+    std::unique_ptr<ModuleESolver::ESolver> p_esolver(new ModuleESolver::ESolver_LJ());
     p_esolver->before_all_runners(ucell, input);
-    MD_func::force_virial(p_esolver, 0, ucell, potential, force, true, stress);
+    MD_func::force_virial(p_esolver.get(), 0, ucell, potential, force, true, stress);
     EXPECT_NEAR(stress(0, 0), 8.0360222227631859e-07, doublethreshold);
     EXPECT_NEAR(stress(0, 1), 1.7207745586539077e-07, doublethreshold);
     EXPECT_NEAR(stress(0, 2), 0, doublethreshold);
@@ -89,7 +66,7 @@ TEST_F(LJ_pot_test, stress)
 
 TEST_F(LJ_pot_test, RcutSearchRadius)
 {
-    ModuleESolver::ESolver_LJ* p_esolver = new ModuleESolver::ESolver_LJ();
+    std::unique_ptr<ModuleESolver::ESolver_LJ> p_esolver(new ModuleESolver::ESolver_LJ());
     ucell.ntype = 2;
     std::vector<double> rcut = {3.0};
     p_esolver->rcut_search_radius(ucell.ntype, rcut);
@@ -114,7 +91,7 @@ TEST_F(LJ_pot_test, RcutSearchRadius)
 
 TEST_F(LJ_pot_test, SetC6C12)
 {
-    ModuleESolver::ESolver_LJ* p_esolver = new ModuleESolver::ESolver_LJ();
+    std::unique_ptr<ModuleESolver::ESolver_LJ> p_esolver(new ModuleESolver::ESolver_LJ());
     ucell.ntype = 2;
 
     // no rule
@@ -187,7 +164,7 @@ TEST_F(LJ_pot_test, SetC6C12)
 
 TEST_F(LJ_pot_test, CalEnShift)
 {
-    ModuleESolver::ESolver_LJ* p_esolver = new ModuleESolver::ESolver_LJ();
+    std::unique_ptr<ModuleESolver::ESolver_LJ> p_esolver(new ModuleESolver::ESolver_LJ());
     ucell.ntype = 2;
 
     std::vector<double> rcut = {3.0};
@@ -214,4 +191,4 @@ TEST_F(LJ_pot_test, CalEnShift)
     EXPECT_NEAR(p_esolver->en_shift(0, 1), -3.303688865319793e-07, doublethreshold);
     EXPECT_NEAR(p_esolver->en_shift(1, 0), -3.303688865319793e-07, doublethreshold);
     EXPECT_NEAR(p_esolver->en_shift(1, 1), -5.6443326024140752e-06, doublethreshold);
-}
\ No newline at end of file
+}
diff --git a/source/source_md/test/md_func_test.cpp b/source/source_md/test/md_func_test.cpp
index eb9ce57a5f9..a9bae026fa8 100644
--- a/source/source_md/test/md_func_test.cpp
+++ b/source/source_md/test/md_func_test.cpp
@@ -5,9 +5,8 @@
 #undef private
 #define private public
 #define protected public
-#include "source_esolver/esolver_lj.h"
+#include "md_test_fixture.h"
 #include "source_md/md_func.h"
-#include "setcell.h"
 
 #define doublethreshold 1e-12
 /************************************************
@@ -50,45 +49,8 @@
  *     - test the current_md_info function with an incorrect file path
  */
 
-class MD_func_test : public testing::Test
+class MD_func_test : public MdFuncTestFixture
 {
-  protected:
-    UnitCell ucell;
-    double* allmass;                    // atom mass
-    ModuleBase::Vector3<double>* pos;   // atom position
-    ModuleBase::Vector3<double>* vel;   // atom velocity
-    ModuleBase::Vector3<int>* ionmbl;   // atom is frozen or not
-    ModuleBase::Vector3<double>* force; // atom force
-    ModuleBase::matrix virial;          // virial for this lattice
-    ModuleBase::matrix stress;          // stress for this lattice
-    double potential;                   // potential energy
-    int natom;                          // atom number
-    double temperature;                 // temperature
-    int frozen_freedom;                 // frozen_freedom
-    Parameter param_in;
-
-    void SetUp()
-    {
-        Setcell::setupcell(ucell);
-        Setcell::parameters(param_in.input);
-        natom = ucell.nat;
-        allmass = new double[natom];
-        pos = new ModuleBase::Vector3<double>[natom];
-        ionmbl = new ModuleBase::Vector3<int>[natom];
-        vel = new ModuleBase::Vector3<double>[natom];
-        force = new ModuleBase::Vector3<double>[natom];
-        stress.create(3, 3);
-        virial.create(3, 3);
-    }
-
-    void TearDown()
-    {
-        delete[] allmass;
-        delete[] pos;
-        delete[] vel;
-        delete[] ionmbl;
-        delete[] force;
-    }
 };
 
 TEST_F(MD_func_test, gaussrand)
diff --git a/source/source_md/test/md_test_fixture.h b/source/source_md/test/md_test_fixture.h
new file mode 100644
index 00000000000..3ad80c23638
--- /dev/null
+++ b/source/source_md/test/md_test_fixture.h
@@ -0,0 +1,110 @@
+#ifndef MD_TEST_FIXTURE_H
+#define MD_TEST_FIXTURE_H
+
+#include "gtest/gtest.h"
+#include "source_esolver/esolver_lj.h"
+#include "source_md/md_base.h"
+#include "setcell.h"
+
+#include <memory>
+#include <vector>
+
+class MdTestBase : public testing::Test
+{
+  protected:
+    UnitCell ucell;
+    Parameter param_in;
+    std::unique_ptr<ModuleESolver::ESolver> p_esolver;
+
+    void SetUp() override
+    {
+        Setcell::setupcell(ucell);
+        Setcell::parameters(param_in.input);
+
+        p_esolver.reset(new ModuleESolver::ESolver_LJ());
+        p_esolver->before_all_runners(ucell, param_in.inp);
+    }
+};
+
+template <class Integrator>
+class MdIntegratorFixture : public MdTestBase
+{
+  protected:
+    std::unique_ptr<MD_base> mdrun;
+
+    void SetUp() override
+    {
+        MdTestBase::SetUp();
+        mdrun.reset(new Integrator(param_in, ucell));
+        mdrun->setup(p_esolver.get(), PARAM.sys.global_readin_dir);
+    }
+};
+
+class MdFuncTestFixture : public testing::Test
+{
+  protected:
+    UnitCell ucell;
+    std::vector<double> allmass_store;
+    std::vector<ModuleBase::Vector3<double>> pos_store;
+    std::vector<ModuleBase::Vector3<double>> vel_store;
+    std::vector<ModuleBase::Vector3<int>> ionmbl_store;
+    std::vector<ModuleBase::Vector3<double>> force_store;
+    double* allmass = nullptr;
+    ModuleBase::Vector3<double>* pos = nullptr;
+    ModuleBase::Vector3<double>* vel = nullptr;
+    ModuleBase::Vector3<int>* ionmbl = nullptr;
+    ModuleBase::Vector3<double>* force = nullptr;
+    ModuleBase::matrix virial;
+    ModuleBase::matrix stress;
+    double potential = 0.0;
+    int natom = 0;
+    double temperature = 0.0;
+    int frozen_freedom = 0;
+    Parameter param_in;
+
+    void SetUp() override
+    {
+        Setcell::setupcell(ucell);
+        Setcell::parameters(param_in.input);
+        natom = ucell.nat;
+
+        allmass_store.resize(natom);
+        pos_store.resize(natom);
+        vel_store.resize(natom);
+        ionmbl_store.resize(natom);
+        force_store.resize(natom);
+        allmass = allmass_store.data();
+        pos = pos_store.data();
+        vel = vel_store.data();
+        ionmbl = ionmbl_store.data();
+        force = force_store.data();
+        stress.create(3, 3);
+        virial.create(3, 3);
+    }
+};
+
+class LjPotTestFixture : public testing::Test
+{
+  protected:
+    std::vector<ModuleBase::Vector3<double>> force_store;
+    ModuleBase::Vector3<double>* force = nullptr;
+    ModuleBase::matrix stress;
+    double potential = 0.0;
+    int natom = 0;
+    UnitCell ucell;
+    Input_para input;
+
+    void SetUp() override
+    {
+        Setcell::setupcell(ucell);
+
+        natom = ucell.nat;
+        force_store.resize(natom);
+        force = force_store.data();
+        stress.create(3, 3);
+
+        Setcell::parameters(input);
+    }
+};
+
+#endif // MD_TEST_FIXTURE_H
diff --git a/source/source_md/test/msst_test.cpp b/source/source_md/test/msst_test.cpp
index 7d0fd8054d1..8b3982be73b 100644
--- a/source/source_md/test/msst_test.cpp
+++ b/source/source_md/test/msst_test.cpp
@@ -5,9 +5,8 @@
 #undef private
 #define private public
 #define protected public
-#include "source_esolver/esolver_lj.h"
 #include "source_md/msst.h"
-#include "setcell.h"
+#include "md_test_fixture.h"
 #define doublethreshold 1e-12
 
 /************************************************
@@ -35,31 +34,8 @@
  *     - output MD information such as energy, temperature, and pressure
  */
 
-class MSST_test : public testing::Test
+class MSST_test : public MdIntegratorFixture<MSST>
 {
-  protected:
-    MD_base* mdrun;
-    UnitCell ucell;
-    Parameter param_in;
-    ModuleESolver::ESolver* p_esolver;
-
-    void SetUp()
-    {
-        Setcell::setupcell(ucell);
-        Setcell::parameters(param_in.input);
-
-        p_esolver = new ModuleESolver::ESolver_LJ();
-        p_esolver->before_all_runners(ucell, param_in.inp);
-
-        mdrun = new MSST(param_in, ucell);
-        mdrun->setup(p_esolver, PARAM.sys.global_readin_dir);
-    }
-
-    void TearDown()
-    {
-        delete mdrun;
-        delete p_esolver;
-    }
 };
 
 TEST_F(MSST_test, setup)
@@ -208,7 +184,7 @@ TEST_F(MSST_test, restart)
     mdrun->restart(PARAM.sys.global_readin_dir);
     remove("Restart_md.txt");
 
-    MSST* msst = dynamic_cast<MSST*>(mdrun);
+    MSST* msst = dynamic_cast<MSST*>(mdrun.get());
     EXPECT_EQ(mdrun->step_rst_, 3);
     EXPECT_EQ(msst->omega[mdrun->mdp.msst_direction], -0.00977662);
     EXPECT_EQ(msst->e0, -0.00768262);
diff --git a/source/source_md/test/nhchain_test.cpp b/source/source_md/test/nhchain_test.cpp
index 647df0a730d..4f7a4244011 100644
--- a/source/source_md/test/nhchain_test.cpp
+++ b/source/source_md/test/nhchain_test.cpp
@@ -5,9 +5,8 @@
 #undef private
 #define private public
 #define protected public
-#include "source_esolver/esolver_lj.h"
 #include "source_md/nhchain.h"
-#include "setcell.h"
+#include "md_test_fixture.h"
 #define doublethreshold 1e-12
 /************************************************
  *  unit test of functions in nhchain.h
@@ -33,34 +32,20 @@
  *   - Nose_Hoover::print_md
  *     - output MD information such as energy, temperature, and pressure
  */
-class NHC_test : public testing::Test
+class NHC_test : public MdTestBase
 {
   protected:
-    MD_base* mdrun;
-    UnitCell ucell;
-    Parameter param_in;
-    ModuleESolver::ESolver* p_esolver;
+    std::unique_ptr<MD_base> mdrun;
 
-    void SetUp()
+    void SetUp() override
     {
-        Setcell::setupcell(ucell);
-        Setcell::parameters(param_in.input);
-
-        p_esolver = new ModuleESolver::ESolver_LJ();
-        p_esolver->before_all_runners(ucell, param_in.inp);
-
+        MdTestBase::SetUp();
         param_in.input.mdp.md_type = "npt";
         param_in.input.mdp.md_pmode = "tri";
         param_in.input.mdp.md_pfirst = 1;
         param_in.input.mdp.md_plast = 1;
-        mdrun = new Nose_Hoover(param_in, ucell);
-        mdrun->setup(p_esolver, PARAM.sys.global_readin_dir);
-    }
-
-    void TearDown()
-    {
-        delete mdrun;
-        delete p_esolver;
+        mdrun.reset(new Nose_Hoover(param_in, ucell));
+        mdrun->setup(p_esolver.get(), PARAM.sys.global_readin_dir);
     }
 };
 
@@ -179,7 +164,7 @@ TEST_F(NHC_test, restart)
     mdrun->restart(PARAM.sys.global_readin_dir);
     remove("Restart_md.txt");
 
-    Nose_Hoover* nhc = dynamic_cast<Nose_Hoover*>(mdrun);
+    Nose_Hoover* nhc = dynamic_cast<Nose_Hoover*>(mdrun.get());
     EXPECT_EQ(mdrun->step_rst_, 3);
     EXPECT_EQ(mdrun->mdp.md_tchain, 4);
     EXPECT_EQ(mdrun->mdp.md_pchain, 4);
diff --git a/source/source_md/test/verlet_test.cpp b/source/source_md/test/verlet_test.cpp
index 8f2c00f74f3..b16cd522756 100644
--- a/source/source_md/test/verlet_test.cpp
+++ b/source/source_md/test/verlet_test.cpp
@@ -5,9 +5,8 @@
 #undef private
 #define private public
 #define protected public
-#include "source_esolver/esolver_lj.h"
 #include "source_md/verlet.h"
-#include "setcell.h"
+#include "md_test_fixture.h"
 #define doublethreshold 1e-12
 
 
@@ -36,30 +35,8 @@
  *     - output MD information such as energy, temperature, and pressure
  */
 
-class Verlet_test : public testing::Test
+class Verlet_test : public MdIntegratorFixture<Verlet>
 {
-  protected:
-    MD_base* mdrun;
-    UnitCell ucell;
-    Parameter param_in;
-    ModuleESolver::ESolver* p_esolver;
-
-    void SetUp()
-    {
-        Setcell::setupcell(ucell);
-        Setcell::parameters(param_in.input);
-
-        p_esolver = new ModuleESolver::ESolver_LJ();
-        p_esolver->before_all_runners(ucell, param_in.inp);
-
-        mdrun = new Verlet(param_in, ucell);
-        mdrun->setup(p_esolver, PARAM.sys.global_readin_dir);
-    }
-
-    void TearDown()
-    {
-        delete mdrun;
-    }
 };
 
 TEST_F(Verlet_test, setup)

From 6b545c7f9426b21cf0cadcf34d444daea0737902 Mon Sep 17 00:00:00 2001
From: Audrey-777 <3043005175chy@gmail.com>
Date: Sat, 30 May 2026 21:05:32 +0800
Subject: [PATCH 04/12] docs: record MD pre-parallel refactor

---
 Results/0530_md_pre_parallel_refactor.md | 70 ++++++++++++++++++++++++
 1 file changed, 70 insertions(+)
 create mode 100644 Results/0530_md_pre_parallel_refactor.md

diff --git a/Results/0530_md_pre_parallel_refactor.md b/Results/0530_md_pre_parallel_refactor.md
new file mode 100644
index 00000000000..08562dcae6e
--- /dev/null
+++ b/Results/0530_md_pre_parallel_refactor.md
@@ -0,0 +1,70 @@
+# ABACUS MD 并行优化前置重构记录
+
+## 目标边界
+
+本轮重构只调整代码结构，为后续正式 OpenMP/CUDA 并行优化降低改动风险；不引入并行代码，不改变 MD 数值算法、输入输出格式和执行流程。
+
+## 重构点清单
+
+| 编号 | 重构点 | 涉及文件 | 原因 | 行为影响 |
+| --- | --- | --- | --- | --- |
+| R1 | 抽取 MD 积分器创建逻辑 | `source/source_md/run_md.cpp` | 将 `md_type` 到具体积分器类的选择从 `md_line()` 主循环中分离，减少后续修改主循环时的生命周期风险。 | 使用 `std::unique_ptr<MD_base>` 替代手动 `new/delete`，积分器选择条件和 MD 主循环顺序不变。 |
+| R2 | 复用 DP 接口层临时缓冲区 | `source/source_esolver/esolver_dp.h`、`source/source_esolver/esolver_dp.cpp` | DP runner 每步重复创建 `cell`、`coord`、`f`、`v` 容器；将其改为对象成员，为后续数据布局和接口层并行优化做准备。 | 缓冲区生命周期从函数局部变为 `ESolver_DP` 成员；每步仍按原顺序填充坐标、调用 `dp.compute()`、回填能量/力/Virial。 |
+| R3 | 复用 NEP 接口层临时缓冲区 | `source/source_esolver/esolver_nep.h`、`source/source_esolver/esolver_nep.cpp` | NEP runner 每步重复创建 `cell`、`coord`、`v_sum`；成员缓存便于后续对坐标转换、力回填和 Virial 求和做并行化。 | 缓冲区生命周期从函数局部变为 `ESolver_NEP` 成员；NEP column-major 数据布局和数值转换公式不变。 |
+| R4 | 拆出 MD 统计纯计算状态 | `source/source_md/md_statistics.h`、`source/source_md/md_func.h`、`source/source_md/md_func.cpp` | 动能、温度、应力后续会涉及 reduction 和线程一致性验证；先拆出无副作用的结果结构，保留旧接口作为包装器。 | 新增 `MDKineticState`、`MDStressState`、`calc_kinetic_state()`、`calc_stress_state()`；`current_temp()` 和 `compute_stress()` 的外部接口和返回结果不变。 |
+| R5 | 建立 MD 测试夹具 | `source/source_md/test/md_test_fixture.h` 及 MD 测试文件 | 多个测试重复创建 `UnitCell`、`Parameter`、`ESolver_LJ`、积分器和数组；抽成夹具后，后续增加线程一致性测试和 benchmark 辅助测试更直接。 | 测试断言主体保持不变；用 RAII 管理测试对象，消除手动释放和遗漏释放风险。 |
+| R6 | 移除测试 CMake 重复依赖 | `source/source_md/test/CMakeLists.txt` | `global_variable.cpp` 在 `depend_files` 中重复出现，删除重复项减少测试目标链接输入噪声。 | 不改变测试目标语义。 |
+
+## 保留旧接口的约束
+
+- `Run_MD::md_line()` 的公开函数签名不变。
+- `MD_func::current_temp()`、`MD_func::compute_stress()`、`MD_func::temp_vector()` 等旧接口保留。
+- DP/NEP 的 `before_all_runners()`、`runner()`、`cal_energy()`、`cal_force()`、`cal_stress()` 接口不变。
+- MD 测试中的数值断言未改写。
+
+## 未做事项
+
+- 未添加 `#pragma omp`。
+- 未调整 MPI 广播、MD 时间步顺序、温控/压控算法。
+- 未修改 DPMD/NEP 外部库调用语义。
+- 未加入 CUDA kernel。
+- 未把规划文件 `Planners/` 纳入提交。
+
+## 验证记录
+
+生产代码构建：
+
+```bash
+cmake -S . -B build-prod -DBUILD_TESTING=OFF -DENABLE_MPI=OFF -DENABLE_LCAO=OFF -DUSE_ELPA=OFF
+cmake --build build-prod -j2
+```
+
+结果：`abacus_pw_ser` 构建成功。
+
+MPI 测试配置：
+
+```bash
+cmake -S . -B build-mpi-test -DBUILD_TESTING=ON
+```
+
+结果：配置成功。普通沙箱下 OpenMPI 探测会打印 `opal_ifinit: socket() failed with errno=1`，因此该配置在允许 MPI 探测的环境中运行。
+
+MD 测试目标构建：
+
+```bash
+cmake --build build-mpi-test --target MODULE_MD_LJ_pot MODULE_MD_func MODULE_MD_fire MODULE_MD_verlet MODULE_MD_nhc MODULE_MD_msst MODULE_MD_lgv -j2
+```
+
+结果：7 个 MD 测试目标均构建成功。
+
+MD 单元测试：
+
+```bash
+ctest --test-dir build-mpi-test -R 'MODULE_MD' --output-on-failure
+```
+
+结果：7/7 通过。
+
+## 回溯方式
+
+本轮修改按重构主题提交，若后续发现问题，可用 `git log --oneline` 找到对应提交，再用 `git revert <commit>` 回退单个重构点。

From ca3dcb13c4b4f3131526552d783c6074f98979bd Mon Sep 17 00:00:00 2001
From: Audrey-777 <3043005175chy@gmail.com>
Date: Sun, 31 May 2026 11:46:48 +0800
Subject: [PATCH 05/12] optimize: add OpenMP to MD base loops and NEP interface

---
 source/source_esolver/esolver_nep.cpp | 60 ++++++++++++++++++++-------
 source/source_esolver/esolver_nep.h   |  2 +
 source/source_md/md_base.cpp          |  8 +++-
 source/source_md/md_func.cpp          | 49 ++++++++++++++++++----
 4 files changed, 92 insertions(+), 27 deletions(-)

diff --git a/source/source_esolver/esolver_nep.cpp b/source/source_esolver/esolver_nep.cpp
index 757fb6bd8fc..9e57a5c4cda 100644
--- a/source/source_esolver/esolver_nep.cpp
+++ b/source/source_esolver/esolver_nep.cpp
@@ -24,7 +24,6 @@
 #include "source_io/module_output/cif_io.h"
 
 #include <algorithm>
-#include <numeric>
 #include <unordered_map>
 
 using namespace ModuleESolver;
@@ -41,6 +40,20 @@ void ESolver_NEP::before_all_runners(UnitCell& ucell, const Input_para& inp)
     _e.resize(ucell.nat);
     _f.resize(3 * ucell.nat);
     _v.resize(9 * ucell.nat);
+    atom_type_index.resize(ucell.nat);
+    atom_local_index.resize(ucell.nat);
+
+    int iat = 0;
+    for (int it = 0; it < ucell.ntype; ++it)
+    {
+        for (int ia = 0; ia < ucell.atoms[it].na; ++ia)
+        {
+            atom_type_index[iat] = it;
+            atom_local_index[iat] = ia;
+            ++iat;
+        }
+    }
+    assert(ucell.nat == iat);
 
     ModuleIO::CifParser::write(PARAM.globalv.global_out_dir + "STRU.cif", 
                                ucell, 
@@ -72,19 +85,16 @@ void ESolver_NEP::runner(UnitCell& ucell, const int istep)
 
     // coord
     nep_coord.resize(3 * ucell.nat);
-    int iat = 0;
     const int nat = ucell.nat;
-    for (int it = 0; it < ucell.ntype; ++it)
+#pragma omp parallel for schedule(static) if (nat >= 256)
+    for (int iat = 0; iat < nat; ++iat)
     {
-        for (int ia = 0; ia < ucell.atoms[it].na; ++ia)
-        {
-            nep_coord[iat] = ucell.atoms[it].tau[ia].x * ucell.lat0_angstrom;
-            nep_coord[iat + nat] = ucell.atoms[it].tau[ia].y * ucell.lat0_angstrom;
-            nep_coord[iat + 2 * nat] = ucell.atoms[it].tau[ia].z * ucell.lat0_angstrom;
-            iat++;
-        }
+        const int it = atom_type_index[iat];
+        const int ia = atom_local_index[iat];
+        nep_coord[iat] = ucell.atoms[it].tau[ia].x * ucell.lat0_angstrom;
+        nep_coord[iat + nat] = ucell.atoms[it].tau[ia].y * ucell.lat0_angstrom;
+        nep_coord[iat + 2 * nat] = ucell.atoms[it].tau[ia].z * ucell.lat0_angstrom;
     }
-    assert(ucell.nat == iat);
 
 #ifdef __NEP
     nep_potential = 0.0;
@@ -100,11 +110,18 @@ void ESolver_NEP::runner(UnitCell& ucell, const int istep)
 
 
     // potential energy
-    nep_potential = fact_e * std::accumulate(_e.begin(), _e.end(), 0.0) ;
+    double energy_sum = 0.0;
+#pragma omp parallel for reduction(+:energy_sum) schedule(static) if (nat >= 256)
+    for (int i = 0; i < nat; ++i)
+    {
+        energy_sum += _e[i];
+    }
+    nep_potential = fact_e * energy_sum;
     GlobalV::ofs_running << " #TOTAL ENERGY# " << std::setprecision(11) << nep_potential * ModuleBase::Ry_to_eV << " eV"
                          << std::endl;
     
     // forces
+#pragma omp parallel for schedule(static) if (nat >= 256)
     for (int i = 0; i < nat; ++i)
     {
         nep_force(i, 0) = _f[i] * fact_f;
@@ -114,12 +131,23 @@ void ESolver_NEP::runner(UnitCell& ucell, const int istep)
 
     // virial
     std::fill(nep_virial_sum.begin(), nep_virial_sum.end(), 0.0);
-    for (int j = 0; j < 9; ++j)
+#pragma omp parallel if (nat >= 256)
     {
-        for (int i = 0; i < nat; ++i)
+        double local_virial_sum[9] = {0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0};
+
+#pragma omp for schedule(static)
+        for (int index = 0; index < 9 * nat; ++index)
+        {
+            const int j = index / nat;
+            local_virial_sum[j] += _v[index];
+        }
+
+#pragma omp critical
         {
-            int index = j * nat + i;
-            nep_virial_sum[j] += _v[index];
+            for (int j = 0; j < 9; ++j)
+            {
+                nep_virial_sum[j] += local_virial_sum[j];
+            }
         }
     }
 
diff --git a/source/source_esolver/esolver_nep.h b/source/source_esolver/esolver_nep.h
index 24d14b2d3ee..bcbd658c319 100644
--- a/source/source_esolver/esolver_nep.h
+++ b/source/source_esolver/esolver_nep.h
@@ -95,6 +95,8 @@ class ESolver_NEP : public ESolver
 
     std::string nep_file;                ///< directory of NEP model file
     std::vector<int> atype = {};         ///< atom type mapping for NEP model
+    std::vector<int> atom_type_index;     ///< global atom index to UnitCell atom type
+    std::vector<int> atom_local_index;    ///< global atom index to local index inside atom type
     double nep_potential;                ///< computed potential energy
     ModuleBase::matrix nep_force;        ///< computed atomic forces
     ModuleBase::matrix nep_virial;       ///< computed lattice virials
diff --git a/source/source_md/md_base.cpp b/source/source_md/md_base.cpp
index 390e1c2b082..f87fca9d2ee 100644
--- a/source/source_md/md_base.cpp
+++ b/source/source_md/md_base.cpp
@@ -96,7 +96,9 @@ void MD_base::update_pos()
 {
     if (my_rank == 0)
     {
-        for (int i = 0; i < ucell.nat; ++i)
+        const int natom = ucell.nat;
+#pragma omp parallel for schedule(static) if (natom >= 256)
+        for (int i = 0; i < natom; ++i)
         {
             for (int k = 0; k < 3; ++k)
             {
@@ -127,7 +129,9 @@ void MD_base::update_vel(const ModuleBase::Vector3<double>* force)
 {
     if (my_rank == 0)
     {
-        for (int i = 0; i < ucell.nat; ++i)
+        const int natom = ucell.nat;
+#pragma omp parallel for schedule(static) if (natom >= 256)
+        for (int i = 0; i < natom; ++i)
         {
             for (int k = 0; k < 3; ++k)
             {
diff --git a/source/source_md/md_func.cpp b/source/source_md/md_func.cpp
index 5089b1d8d64..3c7526cd9c2 100644
--- a/source/source_md/md_func.cpp
+++ b/source/source_md/md_func.cpp
@@ -44,6 +44,7 @@ double kinetic_energy(const int& natom, const ModuleBase::Vector3<double>* vel,
 {
     double ke = 0;
 
+#pragma omp parallel for reduction(+:ke) schedule(static) if (natom >= 256)
     for (int ion = 0; ion < natom; ++ion)
     {
         ke += 0.5 * allmass[ion] * vel[ion].norm2();
@@ -300,7 +301,9 @@ void force_virial(ModuleESolver::ESolver* p_esolver,
     force_temp *= 0.5;
     virial *= 0.5;
 
-    for (int i = 0; i < unit_in.nat; ++i)
+    const int natom = unit_in.nat;
+#pragma omp parallel for schedule(static) if (natom >= 256)
+    for (int i = 0; i < natom; ++i)
     {
         for (int j = 0; j < 3; ++j)
         {
@@ -503,16 +506,44 @@ void temp_vector(const int& natom,
 {
     t_vector.create(3, 3);
 
+    double t00 = 0.0;
+    double t01 = 0.0;
+    double t02 = 0.0;
+    double t10 = 0.0;
+    double t11 = 0.0;
+    double t12 = 0.0;
+    double t20 = 0.0;
+    double t21 = 0.0;
+    double t22 = 0.0;
+
+#pragma omp parallel for reduction(+:t00, t01, t02, t10, t11, t12, t20, t21, t22) schedule(static) if (natom >= 256)
     for (int ion = 0; ion < natom; ++ion)
     {
-        for (int i = 0; i < 3; ++i)
-        {
-            for (int j = 0; j < 3; ++j)
-            {
-                t_vector(i, j) += allmass[ion] * vel[ion][i] * vel[ion][j];
-            }
-        }
-    }
+        const double mass = allmass[ion];
+        const double vx = vel[ion].x;
+        const double vy = vel[ion].y;
+        const double vz = vel[ion].z;
+
+        t00 += mass * vx * vx;
+        t01 += mass * vx * vy;
+        t02 += mass * vx * vz;
+        t10 += mass * vy * vx;
+        t11 += mass * vy * vy;
+        t12 += mass * vy * vz;
+        t20 += mass * vz * vx;
+        t21 += mass * vz * vy;
+        t22 += mass * vz * vz;
+    }
+
+    t_vector(0, 0) = t00;
+    t_vector(0, 1) = t01;
+    t_vector(0, 2) = t02;
+    t_vector(1, 0) = t10;
+    t_vector(1, 1) = t11;
+    t_vector(1, 2) = t12;
+    t_vector(2, 0) = t20;
+    t_vector(2, 1) = t21;
+    t_vector(2, 2) = t22;
 
     return;
 }

From 19117e05efb5439400bfa8738d15902b1747d0a4 Mon Sep 17 00:00:00 2001
From: Audrey-777 <3043005175chy@gmail.com>
Date: Sun, 31 May 2026 11:47:54 +0800
Subject: [PATCH 06/12] docs: record OpenMP NEP and MD base changes

---
 Planners/0531/openmp_nep_basic_changes.md | 54 +++++++++++++++++++++++
 1 file changed, 54 insertions(+)
 create mode 100644 Planners/0531/openmp_nep_basic_changes.md

diff --git a/Planners/0531/openmp_nep_basic_changes.md b/Planners/0531/openmp_nep_basic_changes.md
new file mode 100644
index 00000000000..81885adce20
--- /dev/null
+++ b/Planners/0531/openmp_nep_basic_changes.md
@@ -0,0 +1,54 @@
+# OpenMP 基本部分与 NEP 优化修改说明
+
+日期：2026-05-31
+
+本次根据 `Planners/0531/md_parallel_optimization_plan.md`，只实现了收益较明确、数据依赖简单的 OpenMP 优化点；暂未处理随机数相关循环、NHC/MSST 扩展项、DPMD 和 LJ。
+
+## 修改范围
+
+### 基本 MD 循环
+
+- `source/source_md/md_base.cpp`
+  - `MD_base::update_pos()`：对 rank 0 上的逐原子位置增量计算加入 `#pragma omp parallel for schedule(static)`。
+  - `MD_base::update_vel()`：对 rank 0 上的逐原子速度半步更新加入 OpenMP 并行。
+  - MPI 广播和 `unitcell::update_pos_taud()` 顺序保持不变。
+
+- `source/source_md/md_func.cpp`
+  - `kinetic_energy()`：使用 OpenMP `reduction(+:ke)` 并行动能求和。
+  - `force_virial()`：并行 `force_temp` 到 `force` 的逐原子回填。
+  - `temp_vector()`：用 9 个标量 reduction 并行温度张量累加，并显式写回 3x3 矩阵，避免依赖 `matrix::create()` 是否清零。
+
+### NEP 接口层
+
+- `source/source_esolver/esolver_nep.h`
+  - 新增 `atom_type_index` 和 `atom_local_index` 缓存，用于把全局原子序号映射到 `UnitCell` 的元素类型和类型内局部序号。
+
+- `source/source_esolver/esolver_nep.cpp`
+  - `before_all_runners()`：初始化全局原子到 `UnitCell` 存储位置的索引缓存。
+  - `runner()`：
+    - 并行 NEP 坐标缓冲区填充；
+    - 使用 OpenMP reduction 并行每原子能量求和；
+    - 并行 NEP 力回填和单位转换；
+    - 使用线程局部 9 分量数组并行 per-atom virial 求和。
+
+## 并行策略
+
+- 所有新增循环使用 `schedule(static)`。
+- 对短循环使用 `if (nat >= 256)` 或同类阈值，降低小体系线程启动开销。
+- 不改变 MD 时间步顺序、MPI 广播位置、NEP 外部库调用边界。
+- 浮点归约可能带来末位差异，后续测试应使用容差比较。
+
+## 未执行项
+
+- 按要求未运行测试。
+- 未并行 `rand_vel()`、Anderson thermostat、Langevin 随机力等随机数相关循环。
+- 未改 DPMD、NHC/MSST 扩展优化、LJ benchmark 优化。
+
+## 回溯方式
+
+本次修改已按 git commit 分开记录：
+
+- 代码优化提交：`optimize: add OpenMP to MD base loops and NEP interface`
+- 文档提交：`docs: record OpenMP NEP and MD base changes`
+
+如需回滚，可对对应提交执行 `git revert <commit>`。

From 5f60d472caa1f6c1ef09a9e8264fcb2cac449d87 Mon Sep 17 00:00:00 2001
From: Audrey-777 <3043005175chy@gmail.com>
Date: Tue, 2 Jun 2026 12:15:07 +0800
Subject: [PATCH 07/12] docs: add MD OpenMP planning and benchmark results

---
 Planners/0530/0530_refractor.md               | 179 ++++
 .../0531/md_parallel_optimization_plan.md     | 599 +++++++++++
 Planners/0531/openmp_nep_basic_changes.md     |  54 -
 Planners/06_md.md                             | 960 ++++++++++++++++++
 Planners/Project_plan.md                      | 388 +++++++
 Results/0530_md_pre_parallel_refactor.md      |  12 +-
 Results/0531_openmp_nep_basic_changes.md      | 209 ++++
 Test/openmp_nep_basic_benchmark.cpp           | 611 +++++++++++
 Test/results/openmp_nep_basic_benchmark.csv   |  37 +
 Test/results/run_1.csv                        |  10 +
 Test/results/run_2.csv                        |  10 +
 Test/results/run_4.csv                        |  10 +
 Test/results/run_8.csv                        |  10 +
 Test/run_openmp_nep_basic_benchmark.sh        |  41 +
 source/source_esolver/esolver_nep.cpp         |  47 +-
 source/source_md/test/md_test_fixture.h       |   1 +
 16 files changed, 3096 insertions(+), 82 deletions(-)
 create mode 100644 Planners/0530/0530_refractor.md
 create mode 100644 Planners/0531/md_parallel_optimization_plan.md
 delete mode 100644 Planners/0531/openmp_nep_basic_changes.md
 create mode 100644 Planners/06_md.md
 create mode 100644 Planners/Project_plan.md
 create mode 100644 Results/0531_openmp_nep_basic_changes.md
 create mode 100644 Test/openmp_nep_basic_benchmark.cpp
 create mode 100644 Test/results/openmp_nep_basic_benchmark.csv
 create mode 100644 Test/results/run_1.csv
 create mode 100644 Test/results/run_2.csv
 create mode 100644 Test/results/run_4.csv
 create mode 100644 Test/results/run_8.csv
 create mode 100755 Test/run_openmp_nep_basic_benchmark.sh

diff --git a/Planners/0530/0530_refractor.md b/Planners/0530/0530_refractor.md
new file mode 100644
index 00000000000..09c9dc79340
--- /dev/null
+++ b/Planners/0530/0530_refractor.md
@@ -0,0 +1,179 @@
+# ABACUS 重构 PR —— 三人分工方案
+
+## 零冲突保证
+
+四个重构点修改的文件集合经逐文件核对，**完全不相交**。三人可在不同分支并行工作，不会产生 git merge 冲突。
+
+---
+
+## 分工总览
+
+| 人   | 重构编号 | 内容                           | 涉及文件数   | 难度  | 分支名                           |
+| ---- | -------- | ------------------------------ | ------------ | ----- | -------------------------------- |
+| A    | #5 + #9  | 积分器工厂 + DP/NEP 缓冲区复用 | 6            | 低-中 | `refactor/md-factory-and-buffer` |
+| B    | #7       | 统计纯函数                     | 6（+1 新建） | 中    | `refactor/md-pure-statistics`    |
+| C    | #12      | 测试夹具                       | 8（+1 新建） | 低    | `refactor/md-test-fixtures`      |
+
+---
+
+## 人 A：`#5 积分器工厂` + `#9 缓冲区复用`
+
+| 文件                                    | 操作                                  | 参考章节（`代码修改与重构.md`） |
+| --------------------------------------- | ------------------------------------- | ------------------------------- |
+| `source/source_md/run_md.cpp`           | 修改 — 抽取工厂函数 + unique_ptr      | §3.3                            |
+| `source/source_md/run_md.h`             | 修改 — 前向声明 MD_base               | §3.4                            |
+| `source/source_esolver/esolver_dp.h`    | 修改 — 新增 cell/coord/f/v 缓冲区成员 | §2.3                            |
+| `source/source_esolver/esolver_dp.cpp`  | 修改 — 复用成员缓冲区                 | §2.4                            |
+| `source/source_esolver/esolver_nep.h`   | 修改 — 新增 cell/coord 缓冲区成员     | §2.5                            |
+| `source/source_esolver/esolver_nep.cpp` | 修改 — 复用成员缓冲区                 | §2.6                            |
+
+**提交信息**：
+
+```
+refactor: extract MD integrator factory and reuse DP/NEP buffers
+
+- Replace manual new/delete in md_line() with factory function + unique_ptr
+- Move DP/NEP per-step cell/coord/f/v allocations to member buffers
+  allocated once in before_all_runners()
+
+Co-Authored-By: ... <...>
+```
+
+**建议两个 commit**：#5 和 #9 分别提交，共用一个分支。
+
+---
+
+## 人 B：`#7 抽取温度、动能、应力统计为纯计算组件`
+
+| 文件                               | 操作                                    | 参考章节（`代码修改与重构.md`） |
+| ---------------------------------- | --------------------------------------- | ------------------------------- |
+| `source/source_md/md_statistics.h` | **新建**                                | §4.3                            |
+| `source/source_md/md_func.h`       | 修改 — 新增纯函数声明                   | §4.4                            |
+| `source/source_md/md_func.cpp`     | 修改 — 新增纯函数实现，旧接口改为包装器 | §4.5                            |
+| `source/source_md/nhchain.cpp`     | 修改 — 调用点改用纯函数                 | §4.6                            |
+| `source/source_md/msst.cpp`        | 修改 — 调用点改用纯函数                 | §4.7                            |
+| `source/source_md/fire.cpp`        | 可选 — `calc_fire_projection` 纯函数    | §4.8                            |
+
+**提交信息**：
+
+```
+refactor: extract MD statistics as pure functions
+
+- Add md_statistics.h with MDKineticState, MDStressState, FIREProjection
+- Add calc_kinetic_state() and calc_stress_state() pure functions
+- Old interfaces (current_temp, compute_stress) kept as wrappers
+- Update nhchain.cpp and msst.cpp call sites
+
+Co-Authored-By: ... <...>
+```
+
+**注意事项**：
+
+1. `calc_stress_state` 中 `state.t_vector.create(3,3)` 之后直接做 `+=`，需确认 `create` 是否清零。如果不清零，需在 create 后手动把 t_vector 元素置零。
+2. `fire.cpp` 的修改是可选的，稳妥起见可以先不改，留在后续 PR。
+
+---
+
+## 人 C：`#12 建立测试夹具/benchmark 夹具`
+
+| 文件                                      | 操作                                           | 参考章节（`代码修改与重构.md`） |
+| ----------------------------------------- | ---------------------------------------------- | ------------------------------- |
+| `source/source_md/test/md_test_fixture.h` | **新建**                                       | §1.3                            |
+| `source/source_md/test/verlet_test.cpp`   | 修改 — 继承 MdIntegratorFixture\<Verlet\>      | §1.4                            |
+| `source/source_md/test/fire_test.cpp`     | 修改 — 继承 MdIntegratorFixture\<FIRE\>        | §1.5                            |
+| `source/source_md/test/langevin_test.cpp` | 修改 — 继承 MdIntegratorFixture\<Langevin\>    | §1.6                            |
+| `source/source_md/test/msst_test.cpp`     | 修改 — 继承 MdIntegratorFixture\<MSST\>        | §1.7                            |
+| `source/source_md/test/nhchain_test.cpp`  | 修改 — 继承 MdTestBase，手动创建 Nose_Hoover   | §1.8                            |
+| `source/source_md/test/md_func_test.cpp`  | 修改 — 继承 MdFuncTestFixture，数组改用 vector | §1.9                            |
+| `source/source_md/test/lj_pot_test.cpp`   | 修改 — 继承 LjPotTestFixture                   | §1.10                           |
+| `source/source_md/test/CMakeLists.txt`    | 修改 — 移除重复的 `global_variable.cpp` 行     | §1.11                           |
+
+**提交信息**：
+
+```
+refactor: introduce MD test fixtures to reduce duplication
+
+- Add md_test_fixture.h with MdTestBase, MdIntegratorFixture<T>,
+  MdFuncTestFixture, LjPotTestFixture
+- Replace manual new/delete in all MD test SetUp/TearDown with RAII
+- Remove duplicate dependency in test CMakeLists.txt
+
+Co-Authored-By: ... <...>
+```
+
+**注意事项**：
+
+1. `nhchain_test.cpp` 的 NHC 测试没有继承 `MdIntegratorFixture<Nose_Hoover>`，而是直接继承 `MdTestBase` 再手动创建积分器，因为 NHC 需要在 SetUp 中额外设置 NPT 参数。这在 `代码修改与重构.md` §1.8 的代码里已经体现了。
+2. 各测试的 TEST_F 主体代码**完全不变**，只有 `dynamic_cast<XXX*>(mdrun)` 改成 `dynamic_cast<XXX*>(mdrun.get())`。
+3. 做完后运行 `ctest --test-dir build -R 'MODULE_MD' --output-on-failure` 确认所有测试通过。
+
+---
+
+## Git 操作流程
+
+### 前置准备（一人做）
+
+```bash
+# 在 GitHub 网页上 Fork https://github.com/deepmodeling/abacus-develop
+# 然后：
+git clone https://github.com/<fork-user>/abacus-develop.git abacus-pr
+cd abacus-pr
+git remote add upstream https://github.com/deepmodeling/abacus-develop.git
+```
+
+三人可以 clone 同一个 fork 到各自目录，或共用同一个 clone 开不同分支。
+
+### 各自操作（三人完全并行）
+
+```bash
+cd abacus-pr
+
+# 1. 从最新 main 出发
+git checkout main
+git pull upstream main
+
+# 2. 创建独立分支
+git checkout -b <你的分支名>
+
+# 3. 修改文件（参考上方分工表和"代码修改与重构.md"对应章节）
+
+# 4. 构建验证
+cmake -S . -B build -DBUILD_TESTING=ON
+cmake --build build -j$(nproc)
+
+# 5. 运行测试
+ctest --test-dir build -R 'MODULE_MD' --output-on-failure
+
+# 6. 提交并推送
+git add <修改的文件>
+git commit -m "<提交信息>"
+git push -u origin <你的分支名>
+```
+
+### 在 GitHub 创建 PR
+
+推送后在 GitHub 网页上：
+
+- 进入你的 fork 页面
+- 点 "Compare & pull request"
+- base 选 `deepmodeling/abacus-develop` 的 `main` 分支
+- head 选你刚推的分支
+- PR 标题用提交信息的第一行
+- 正文写清楚改动目的和范围
+
+---
+
+## 建议 PR 顺序
+
+1. **C 先提** — 测试夹具最安全，reviewer 最容易通过
+2. **B 接着提** — 统计纯函数依赖旧接口包装器兼容
+3. **A 最后提** — 工厂和缓冲区分属两个独立改动，合在一个 PR 也可以
+
+三人也可以同时提（文件不冲突），但分先后能避免 reviewer 困惑。
+
+---
+
+## 额外可做（进度快时）
+
+- **#11 FIRE 参数覆盖 bugfix**：`fire.cpp` 构造函数里 `force_thr = 1e-3` 覆盖了用户输入。这是一行修复，可以分给人 B 顺便做，单独一个 commit。只改 `fire.cpp`，不与其他人的改动冲突。
+- **#8 ESolver 结果与日志分离**：改 `esolver_dp.cpp`、`esolver_nep.cpp`，和人 A 的 #9 有同一文件冲突，不建议今晚做。
\ No newline at end of file
diff --git a/Planners/0531/md_parallel_optimization_plan.md b/Planners/0531/md_parallel_optimization_plan.md
new file mode 100644
index 00000000000..d2142a55295
--- /dev/null
+++ b/Planners/0531/md_parallel_optimization_plan.md
@@ -0,0 +1,599 @@
+# ABACUS MD 模块并行优化方案
+
+日期：2026-05-31
+范围：以 `source/source_md` 为核心，覆盖其直接关联的 `source/source_esolver`、`source/source_cell`、构建与测试代码。
+原则：优先选择实现容易、收益明确、正确性风险低的并行点；OpenMP 优先，MPI 只作为兼容与后续扩展方向；本阶段不安排 CUDA 编程。
+
+---
+
+## 一、相对既有项目计划的调整
+
+本方案以 `Planners/06_md.md` 的总任务为目标约束，以 `Planners/Project_plan.md` 为基础，但根据当前代码实际做如下调整：
+
+| 调整项 | `Project_plan.md` 中的表述 | 本方案调整 | 原因 |
+| --- | --- | --- | --- |
+| CUDA | 作为有余力时的扩展尝试 | 本阶段明确暂缓，不列入实现主线 | 用户要求暂时不考虑 CUDA；且当前易加速点主要是 CPU 侧数据转换和归约 |
+| 势函数专项 | DPMD 或 NEP 二选一深入 | 建议优先 NEP 接口层，DPMD 作为同类同步优化对象 | NEP 当前有能量 `accumulate`、力回填、virial 求和三类典型循环，OpenMP 收益更直接；DPMD 主要是坐标与力回填，virial 仅 3x3 |
+| Langevin/Andersen | 原计划把 Langevin 作为候选阅读对象 | 本方案暂不并行随机数生成循环，只保留阻尼力/后处理改造的预研 | `std::rand()` 和 `MD_func::gaussrand()` 依赖全局状态与调用顺序，直接并行会改变随机序列并引入数据竞争 |
+| MPI | 课程说明中包含 MPI 层级 | 本方案不做 MD 域分解，只保证 OpenMP 与现有 MPI 广播兼容 | 当前 MD 积分器在 `my_rank == 0` 上更新再 `MPI_Bcast`，引入 MPI 原子分解会改变数据所有权，工作量和风险明显高于本阶段收益 |
+| 测试 | 建立测试与 benchmark 框架 | 先复用已有 `MODULE_MD_*` 单测并增加线程一致性脚本，再做完整 benchmark | 当前仓库已有 MD 单测入口，先复用成本最低 |
+| 新增候选 | 原计划未强调 LJ 势 | 将 LJ 作为可选 P2 优化和 benchmark 参考 | LJ 不依赖 DPMD/NEP 外部库，适合在没有模型文件时评估 MD 主循环和 OpenMP 正确性 |
+
+---
+
+## 二、当前代码事实与并行边界
+
+MD 主循环在 `source/source_md/run_md.cpp` 中完成。`Run_MD::md_line()` 根据 `md_type` 创建 `FIRE`、`Nose_Hoover`、`Verlet`、`Langevin`、`MSST`，每步依次执行 `first_half()`、`MD_func::force_virial()`、`second_half()`、`compute_stress()`、`current_temp()`、输出与 restart。关键依赖顺序在 `run_md.cpp:57-136`，不建议改变。
+
+积分器公共状态在 `MD_base`。`MD_base::update_pos()` 的位置增量循环位于 `md_base.cpp:95-120`，`MD_base::update_vel()` 的速度半步循环位于 `md_base.cpp:126-144`。两者当前只在 `my_rank == 0` 上串行更新，然后广播给所有 MPI rank。OpenMP 应只包住 rank 0 内部循环，广播和 `unitcell::update_pos_taud()` 保持原顺序。
+
+统计与力转换集中在 `MD_func`。`kinetic_energy()` 位于 `md_func.cpp:43-52`，`temp_vector()` 位于 `md_func.cpp:499-518`，`force_virial()` 中 `force_temp` 到 `Vector3` 的回填位于 `md_func.cpp:275-313`。这些都是 OpenMP 的低风险入口。
+
+DPMD 接口在 `source/source_esolver/esolver_dp.cpp`。每步包括 cell 写入、坐标转换、外部 `dp.compute()`、能量/力/virial 单位转换。坐标转换在 `esolver_dp.cpp:76-88`，力回填在 `esolver_dp.cpp:108-113`。
+
+NEP 接口在 `source/source_esolver/esolver_nep.cpp`。坐标转换在 `esolver_nep.cpp:73-87`，能量求和在 `esolver_nep.cpp:102-104`，力回填在 `esolver_nep.cpp:107-113`，virial 求和在 `esolver_nep.cpp:115-133`。这是机器学习势函数接口层最值得优先处理的区域。
+
+随机数相关循环需要保守处理。`Verlet::apply_thermostat()` 的 Anderson 分支在 `verlet.cpp:74-97` 使用 `std::rand()` 和 `MD_func::gaussrand()`；`Langevin::post_force()` 在 `langevin.cpp:86-107` 中每原子调用 `std::rand()`；`MD_func::rand_vel()` 在 `md_func.cpp:157-211` 中依赖串行高斯随机数。这些循环本阶段不直接 OpenMP 化。
+
+---
+
+## 三、优化目标
+
+1. 在不改变 MD 主循环和物理算法的前提下，提高大原子数体系中积分器、统计归约、势函数接口层数据转换的 CPU 利用率。
+2. 保持 OpenMP 关闭时行为完全兼容；OpenMP 开启时与现有 MPI 广播逻辑兼容。
+3. 对确定性计算路径做到多线程结果在数值容差内一致；对随机数路径不改变随机序列。
+4. 优先完成能被已有单元测试覆盖的改动，再推进依赖外部 DPMD/NEP 库的测试。
+
+---
+
+## 四、优先级总览
+
+| 优先级 | 优化点 | 文件/函数 | OpenMP 策略 | 收益判断 | 风险 |
+| --- | --- | --- | --- | --- | --- |
+| P0 | 基础积分器位置更新 | `MD_base::update_pos()` | `parallel for schedule(static)`，仅 rank 0 | 每步必经，所有 MD 类型共享 | 低 |
+| P0 | 基础积分器速度更新 | `MD_base::update_vel()` | `parallel for schedule(static)`，仅 rank 0 | 每步至少两次，所有 MD 类型共享 | 低 |
+| P0 | 动能与温度归约 | `MD_func::kinetic_energy()` | `reduction(+:ke)` | `current_temp()` 多处调用 | 低，浮点归约有微小顺序差 |
+| P0 | 温度张量归约 | `MD_func::temp_vector()` | 9 个局部标量 reduction 或线程局部 3x3 后合并 | NPT/MSST stress 计算依赖 | 中低 |
+| P0 | 力矩阵回填 | `MD_func::force_virial()` | `parallel for` 回填 `force[i][j]` | 每步势函数后必经 | 低 |
+| P1 | Verlet 温控缩放 | `Verlet::thermalize()` | `parallel for` 缩放 `vel[i]` | NVT rescaling/berendsen 低成本提速 | 低 |
+| P1 | FIRE 归约和速度更新 | `FIRE::check_force()`、`FIRE::check_fire()` | `max`、`+` reduction + `parallel for` | 弛豫任务常用；循环清晰 | 中低 |
+| P1 | NEP 接口层 | `ESolver_NEP::runner()` | 坐标/力回填并行，能量/virial reduction | 机器学习势主线，收益明确 | 中 |
+| P1 | DPMD 接口层 | `ESolver_DP::runner()` | 坐标/力回填并行 | 与 NEP 同类，可同步完成 | 中低 |
+| P2 | NHC 原子速度缩放 | `particle_thermo()`、`vel_baro()` | 串行计算 scale/factor，原子循环并行 | NVT/NPT 常用 | 中 |
+| P2 | MSST 原子循环 | `vel_sum()`、`rescale()`、`propagate_vel()` | `reduction` + `parallel for` | 特定场景提速 | 中 |
+| P2 | LJ 势参考优化 | `ESolver_LJ::runner()` | 原子邻居循环并行，能量/virial reduction | 无外部库 benchmark | 中高 |
+| 暂缓 | 随机数循环 | `rand_vel()`、Anderson、Langevin | 不直接并行 | 避免改变随机序列和线程安全问题 | 高 |
+
+---
+
+## 五、详细优化方案
+
+### 5.1 OpenMP 接入与通用写法
+
+仓库顶层 `CMakeLists.txt` 已有 `USE_OPENMP` 选项，默认开启，并在 `CMakeLists.txt:398-400` 查找和链接 `OpenMP::OpenMP_CXX`。测试框架 `cmake/Testing.cmake` 也会给单测目标链接 OpenMP。因此不需要新增全局构建选项，只需在代码中使用标准 pragma。
+
+建议写法：
+
+```cpp
+#pragma omp parallel for schedule(static) if (natom >= 256)
+for (int i = 0; i < natom; ++i)
+{
+    ...
+}
+```
+
+实现要求：
+
+1. 不新增必须依赖 OpenMP API 的逻辑；只使用 pragma 时，未开启 OpenMP 也可编译为串行。
+2. 对短循环增加 `if (natom >= 256)` 或同类阈值，避免小体系线程启动开销。
+3. 默认使用 `schedule(static)`，保持可重复性和缓存局部性。
+4. 所有 reduction 接受浮点求和顺序差异，测试使用绝对/相对容差，不要求 bitwise 相同。
+5. 不在已经可能被外部库 OpenMP 并行的区域包大范围 `parallel`，避免嵌套并行过度订阅。
+
+### 5.2 P0：`MD_base::update_pos()` 并行化
+
+位置：`source/source_md/md_base.cpp:95-120`
+
+当前逻辑：
+
+1. rank 0 遍历所有原子；
+2. 根据 `ionmbl[i][k]` 计算速度导致的位置增量；
+3. 用 `ucell.GT` 转为直接坐标增量；
+4. MPI 广播 `pos`；
+5. 调用 `unitcell::update_pos_taud()` 更新 `UnitCell` 中的原子位置。
+
+并行方案：
+
+```cpp
+if (my_rank == 0)
+{
+    const int nat = ucell.nat;
+    #pragma omp parallel for schedule(static) if (nat >= 256)
+    for (int i = 0; i < nat; ++i)
+    {
+        ...
+    }
+}
+```
+
+正确性依据：
+
+- 每次迭代只写 `pos[i]`，读取 `vel[i]`、`ionmbl[i]`、`md_dt`、`ucell.lat0`、`ucell.GT`。
+- 不改变 MPI 广播位置；所有 rank 仍收到 rank 0 的完整 `pos`。
+- 不并行 `unitcell::update_pos_taud()`，因为它修改 `ucell.atoms`，应先保持原串行语义。
+
+预期收益：所有 MD 类型每步至少调用一次，原子数大时可获得稳定但有限的加速；对机器学习势主导的总耗时贡献不一定最大，但风险最低。
+
+### 5.3 P0：`MD_base::update_vel()` 并行化
+
+位置：`source/source_md/md_base.cpp:126-144`
+
+并行方案与 `update_pos()` 类似：
+
+- rank 0 内部对 `i` 并行；
+- 每个线程只更新 `vel[i]`；
+- `force[i]`、`allmass[i]`、`ionmbl[i]` 只读；
+- 保持 `MPI_Bcast(vel, ...)` 在并行区之后。
+
+注意点：
+
+- `update_vel()` 每个常规时间步通常调用两次，是最值得先做的共享积分器优化。
+- 固定自由度 `ionmbl[i][k] == 0` 保持不更新。
+- 不要把 `MPI_Bcast` 放入 OpenMP 并行区。
+
+### 5.4 P0：MD 统计归约并行化
+
+#### 5.4.1 `kinetic_energy()`
+
+位置：`source/source_md/md_func.cpp:43-52`
+
+并行方案：
+
+```cpp
+double ke = 0.0;
+#pragma omp parallel for reduction(+:ke) schedule(static) if (natom >= 256)
+for (int ion = 0; ion < natom; ++ion)
+{
+    ke += 0.5 * allmass[ion] * vel[ion].norm2();
+}
+```
+
+调用链：
+
+- `calc_kinetic_state()`；
+- `current_temp()`；
+- `Verlet::apply_thermostat()`；
+- `Nose_Hoover::first_half()`、`second_half()`；
+- `MSST::second_half()`；
+- `MD_base::print_md()`。
+
+#### 5.4.2 `temp_vector()`
+
+位置：`source/source_md/md_func.cpp:499-518`
+
+当前 `t_vector(i,j) += ...` 会对同一个 3x3 矩阵累加，不能直接 `parallel for` 写共享矩阵。
+
+建议方案：使用 9 个局部 reduction 标量，最后写回矩阵。
+
+```cpp
+double t00 = 0.0, t01 = 0.0, ... , t22 = 0.0;
+#pragma omp parallel for reduction(+:t00,t01,t02,t10,t11,t12,t20,t21,t22) schedule(static) if (natom >= 256)
+for (int ion = 0; ion < natom; ++ion)
+{
+    const double m = allmass[ion];
+    ...
+}
+t_vector(0,0) = t00;
+...
+```
+
+正确性注意：
+
+- `t_vector.create(3, 3)` 后不要依赖其是否清零，最终应显式赋值 9 个元素。
+- 这个改动会影响 `calc_stress_state()` 和 NPT/MSST 压力相关路径，测试应覆盖 `MODULE_MD_nhc`、`MODULE_MD_msst`。
+
+### 5.5 P0：`force_virial()` 力回填并行化
+
+位置：`source/source_md/md_func.cpp:275-313`
+
+`p_esolver->cal_force()` 返回 `ModuleBase::matrix force_temp` 后，当前串行拷贝到 `ModuleBase::Vector3<double>* force`。可并行化 `md_func.cpp:303-309` 的回填循环：
+
+```cpp
+const int nat = unit_in.nat;
+#pragma omp parallel for schedule(static) if (nat >= 256)
+for (int i = 0; i < nat; ++i)
+{
+    force[i][0] = force_temp(i, 0);
+    force[i][1] = force_temp(i, 1);
+    force[i][2] = force_temp(i, 2);
+}
+```
+
+保持不变的部分：
+
+- `p_esolver->runner()`、`cal_energy()`、`cal_force()`、`cal_stress()` 的调用顺序；
+- `potential *= 0.5`、`force_temp *= 0.5`、`virial *= 0.5` 的单位转换顺序。
+
+可选后续：若 `ModuleBase::matrix::operator*=(double)` 内部仍是串行，可单独评估是否替换为并行缩放。但这属于共享基础类型优化，先不扩散范围。
+
+### 5.6 P1：Verlet 温控缩放并行化
+
+位置：`source/source_md/verlet.cpp:110-126`
+
+`Verlet::thermalize()` 在 rescaling、rescale_v、berendsen 温控中对所有原子速度乘同一 `fac`。这是独立写 `vel[i]` 的典型循环，可直接 `parallel for`。
+
+不并行的部分：
+
+- `Verlet::apply_thermostat()` 的 Anderson 分支，位置 `verlet.cpp:74-97`，因为它调用 `std::rand()` 和 `MD_func::gaussrand()`。
+
+### 5.7 P1：FIRE 归约与更新并行化
+
+位置：
+
+- `source/source_md/fire.cpp:156-176`，`check_force()`
+- `source/source_md/fire.cpp:180-236`，`check_fire()`
+
+建议拆分：
+
+1. `check_force()` 使用 `reduction(max:max_force)` 获取最大力，再赋给成员 `max`。
+2. `check_fire()` 第一段对 `P`、`sumforce`、`normvel` 使用 `reduction(+:P,sumforce,normvel)`。
+3. `sumforce` 和 `normvel` 开方后，第二段速度更新使用 `parallel for`。
+4. `P <= 0` 分支中的速度清零也使用 `parallel for`。
+
+注意点：
+
+- `md_dt`、`alpha`、`negative_count` 的更新必须保持串行，在归约之后执行。
+- 若 `sumforce == 0`，原代码已有潜在除零风险，本方案不顺手改算法，只建议在实现时加测试覆盖或保留原行为。
+
+### 5.8 P1：NEP 接口层 OpenMP 优化
+
+位置：`source/source_esolver/esolver_nep.cpp:56-137`
+
+NEP 是本方案建议优先做的机器学习势函数方向。优化点包括坐标转换、能量求和、力回填和 virial 求和。
+
+#### 5.8.1 坐标转换
+
+当前坐标转换使用 `iat++` 串行编号，不能直接 `parallel for`。建议在 `before_all_runners()` 中构建一次扁平索引：
+
+- `atom_type_of_iat[iat] = it`
+- `atom_local_index_of_iat[iat] = ia`
+
+之后每步：
+
+```cpp
+const int nat = ucell.nat;
+#pragma omp parallel for schedule(static) if (nat >= 256)
+for (int iat = 0; iat < nat; ++iat)
+{
+    const int it = atom_type_of_iat[iat];
+    const int ia = atom_local_index_of_iat[iat];
+    nep_coord[iat] = ucell.atoms[it].tau[ia].x * ucell.lat0_angstrom;
+    nep_coord[iat + nat] = ucell.atoms[it].tau[ia].y * ucell.lat0_angstrom;
+    nep_coord[iat + 2 * nat] = ucell.atoms[it].tau[ia].z * ucell.lat0_angstrom;
+}
+```
+
+这个小索引也可用于 DPMD，避免多个接口重复手写 `iat++` 并行变体。
+
+#### 5.8.2 能量求和
+
+当前 `std::accumulate(_e.begin(), _e.end(), 0.0)` 位于 `esolver_nep.cpp:102-104`。改为 OpenMP reduction：
+
+```cpp
+double e_sum = 0.0;
+#pragma omp parallel for reduction(+:e_sum) schedule(static) if (nat >= 256)
+for (int i = 0; i < nat; ++i)
+{
+    e_sum += _e[i];
+}
+nep_potential = fact_e * e_sum;
+```
+
+注意：多线程浮点归约会与串行 `std::accumulate` 有末位差异，测试使用容差。
+
+#### 5.8.3 力回填
+
+位置：`esolver_nep.cpp:107-113`
+
+每个原子只写 `nep_force(i, 0..2)`，可直接 `parallel for`。
+
+#### 5.8.4 Virial 求和
+
+位置：`esolver_nep.cpp:115-133`
+
+当前外层 9 个分量、内层 nat 累加。建议写成 9 个分量各自 reduction，或者对 `j` 串行、`i` 并行 reduction：
+
+```cpp
+for (int j = 0; j < 9; ++j)
+{
+    double sum = 0.0;
+    #pragma omp parallel for reduction(+:sum) schedule(static) if (nat >= 256)
+    for (int i = 0; i < nat; ++i)
+    {
+        sum += _v[j * nat + i];
+    }
+    nep_virial_sum[j] = sum;
+}
+```
+
+如果担心 9 次 parallel region 开销，可使用线程局部 `double local[9]` 后合并，但实现复杂度略高。第一版建议用简单写法，之后根据 profile 决定是否合并并行区。
+
+### 5.9 P1：DPMD 接口层 OpenMP 优化
+
+位置：`source/source_esolver/esolver_dp.cpp:61-125`
+
+可复用 NEP 的扁平原子索引思路。
+
+优化点：
+
+1. `esolver_dp.cpp:76-88` 坐标转换并行化；
+2. `esolver_dp.cpp:108-113` `dp_model_force` 到 `dp_force` 的单位转换和回填并行化；
+3. `dp_virial` 只有 3x3，不需要 OpenMP；
+4. `type_map()` 位于 `esolver_dp.cpp:162-208`，只在初始化执行，不作为并行重点。
+
+注意：
+
+- 不进入 DeePMD-kit 的 `dp.compute()` 内部；外部库可能已有自己的线程策略。
+- 如果 DeePMD-kit 使用 OpenMP，需要 benchmark `OMP_NUM_THREADS` 对 ABACUS 外层和库内层的影响，避免线程过度订阅。
+
+### 5.10 P2：Nose-Hoover 原子循环并行化
+
+位置：
+
+- `Nose_Hoover::particle_thermo()` 的速度缩放，`nhchain.cpp:555-559`
+- `Nose_Hoover::vel_baro()` 的速度缩放与剪切修正，`nhchain.cpp:690-720`
+- `Nose_Hoover::update_baro()` 间接依赖 `MD_func::temp_vector()`，`nhchain.cpp:643-688`
+
+方案：
+
+1. thermostat chain 自身的 `eta`、`v_eta`、`g_eta` 积分循环保持串行，因为链变量有递推依赖。
+2. 计算出最终 `scale` 后，对所有 `vel[i] *= scale` 使用 `parallel for`。
+3. `vel_baro()` 中 `factor[3]` 先串行计算，然后对 `i` 并行；每个原子只写自己的速度。
+4. 不改 `update_volume()`，因为它修改晶格元素和调用 `unitcell::setup_cell_after_vc()`，顺序依赖强。
+
+风险控制：
+
+- NHC/NPT 物理量对数值误差敏感，必须运行 `MODULE_MD_nhc` 多线程一致性测试。
+- 先合入 P0/P1 后再做本项，便于定位误差来源。
+
+### 5.11 P2：MSST 原子循环并行化
+
+位置：
+
+- `MSST::vel_sum()`，`msst.cpp:239-249`
+- `MSST::rescale()` 中速度缩放，`msst.cpp:251-269`
+- `MSST::propagate_vel()`，`msst.cpp:272-309`
+
+方案：
+
+1. `vel_sum()` 使用 `reduction(+:vsum)`。
+2. `rescale()` 中晶胞修改和 `setup_cell_after_vc()` 保持串行，只并行其后的 `vel[i][sd] *= dilation[sd]`。
+3. `propagate_vel()` 在 rank 0 内对原子并行，`const_C`、`const_D`、`expd` 都是线程局部变量，结束后保留原 MPI 广播。
+
+风险：
+
+- MSST 依赖 `vsum`、体积、应力和速度传播顺序；只改每原子独立部分。
+- 先跑 `MODULE_MD_msst`，再跑短 MD 案例比较 `MD_dump`。
+
+### 5.12 P2：LJ 势作为参考 benchmark 的可选并行化
+
+位置：`source/source_esolver/esolver_lj.cpp:58-159`
+
+LJ 不属于机器学习势，但适合作为不依赖外部模型的 benchmark。当前 `runner()` 中按原子和邻居遍历，累加 `lj_potential` 和 `lj_virial`，并写 `lj_force(index, *)`。
+
+可行方案：
+
+1. 建立扁平原子索引，避免 `index++` 串行依赖。
+2. 每个原子 `index` 的 `lj_force(index, *)` 只由该原子邻居循环写入，可并行。
+3. `lj_potential` 用 reduction。
+4. `lj_virial` 使用 9 个局部 reduction 标量，替代共享 `LJ_virial()` 直接累加。
+
+暂不作为主线原因：
+
+- `NeighborSearch` 构建本身可能是主要耗时之一；
+- 力是否依赖完整双计数邻居表需要测试确认；
+- 该项与机器学习势函数目标相关性弱于 NEP/DPMD 接口层。
+
+---
+
+## 六、暂缓或不建议并行的点
+
+1. `MD_func::gaussrand()`：使用静态变量 `v1`、`v2`、`S`、`phase`，不是线程安全函数。
+2. `MD_func::rand_vel()`：初始化速度需要可重复随机序列、总动量归零和温度重标定，直接并行会改变结果。
+3. `Verlet::apply_thermostat()` Anderson 分支：随机碰撞选择和高斯随机数均依赖串行 RNG。
+4. `Langevin::post_force()`：每原子随机力调用 `std::rand()`，直接并行有数据竞争和序列改变。
+5. `Run_MD::md_line()` 时间步循环：MD 时间推进具有严格前后依赖，不能并行时间步。
+6. `unitcell::setup_cell_after_vc()`、`unitcell::update_pos_taud()`：涉及 `UnitCell` 结构更新，先保持串行。
+7. DPMD/NEP 核心推理：位于外部库，本阶段只优化 ABACUS 接口层。
+
+---
+
+## 七、实现顺序与分支划分
+
+### PR 1：MD 公共循环与统计归约
+
+建议分支：`opt/md-openmp-base`
+
+修改文件：
+
+- `source/source_md/md_base.cpp`
+- `source/source_md/md_func.cpp`
+- `source/source_md/verlet.cpp`
+- `source/source_md/fire.cpp`
+
+内容：
+
+1. 并行 `update_pos()`、`update_vel()`；
+2. 并行 `kinetic_energy()`、`temp_vector()`；
+3. 并行 `force_virial()` 的力回填；
+4. 并行 `Verlet::thermalize()`；
+5. 并行 FIRE 的 max/sum reduction 和速度更新。
+
+验证：
+
+```bash
+cmake --build build -j
+OMP_NUM_THREADS=1 ctest --test-dir build -R 'MODULE_MD_(func|verlet|fire)' --output-on-failure
+OMP_NUM_THREADS=4 ctest --test-dir build -R 'MODULE_MD_(func|verlet|fire)' --output-on-failure
+```
+
+### PR 2：NEP/DPMD 接口层 OpenMP
+
+建议分支：`opt/md-ml-interface-openmp`
+
+修改文件：
+
+- `source/source_esolver/esolver_nep.h`
+- `source/source_esolver/esolver_nep.cpp`
+- `source/source_esolver/esolver_dp.h`
+- `source/source_esolver/esolver_dp.cpp`
+
+内容：
+
+1. 增加扁平原子索引缓存；
+2. 并行 NEP/DPMD 坐标转换；
+3. 并行 NEP 能量和 virial 求和；
+4. 并行 NEP/DPMD 力回填。
+
+验证：
+
+- 编译不带 `__NEP`/`__DPMD` 时仍通过；
+- 有模型文件时运行 1、2、4、8 线程短 MD；
+- 比较总能、最大力差、virial/stress 差。
+
+### PR 3：NHC/MSST 与可选 LJ benchmark
+
+建议分支：`opt/md-openmp-extended`
+
+修改文件：
+
+- `source/source_md/nhchain.cpp`
+- `source/source_md/msst.cpp`
+- 可选：`source/source_esolver/esolver_lj.cpp`
+
+内容：
+
+1. NHC velocity scale 和 barostat velocity update；
+2. MSST velocity sum、rescale、propagate_vel；
+3. 可选 LJ 原子邻居循环并行化。
+
+验证：
+
+```bash
+OMP_NUM_THREADS=1 ctest --test-dir build -R 'MODULE_MD_(nhc|msst|LJ_pot)' --output-on-failure
+OMP_NUM_THREADS=4 ctest --test-dir build -R 'MODULE_MD_(nhc|msst|LJ_pot)' --output-on-failure
+```
+
+---
+
+## 八、测试与 benchmark 方案
+
+### 8.1 单元测试
+
+优先使用现有 MD 单元测试：
+
+- `MODULE_MD_func`
+- `MODULE_MD_verlet`
+- `MODULE_MD_fire`
+- `MODULE_MD_nhc`
+- `MODULE_MD_msst`
+- `MODULE_MD_lgv`
+- `MODULE_MD_LJ_pot`
+
+每个 PR 至少运行：
+
+```bash
+for t in 1 2 4 8; do
+  OMP_NUM_THREADS=$t OMP_PROC_BIND=close OMP_PLACES=cores \
+  ctest --test-dir build -R 'MODULE_MD' --output-on-failure
+done
+```
+
+`MODULE_MD_lgv` 虽然本阶段不并行 Langevin 随机数循环，但仍要跑，确保 P0 公共循环没有破坏 Langevin 流程。
+
+### 8.2 线程一致性检查
+
+建议新增一个轻量脚本，例如 `tools/benchmark/md_openmp_check.sh` 或放在课程报告目录中，自动执行：
+
+1. 使用 `OMP_NUM_THREADS=1` 生成基准输出；
+2. 使用 `OMP_NUM_THREADS=2/4/8` 生成对比输出；
+3. 提取 `#TOTAL ENERGY#`、`Energy (Ry)`、`Temperature (K)`、最大力；
+4. 用容差比较：
+   - 能量相对误差 `<= 1e-10`；
+   - 力最大绝对误差 `<= 1e-10` 到 `1e-8`，按势函数精度调整；
+   - 温度相对误差 `<= 1e-10`；
+5. 对随机数相关测试只检查程序稳定和物理量有限，不要求跨线程 bitwise 相同。
+
+### 8.3 性能 benchmark
+
+建议至少三类规模：
+
+| 规模 | 原子数 | 用途 |
+| --- | --- | --- |
+| small | 100-500 | 检查线程开销，验证阈值不会拖慢小体系 |
+| medium | 2,000-10,000 | 主要性能对比 |
+| large | 20,000+ | 观察内存带宽和并行效率 |
+
+线程数：
+
+- `OMP_NUM_THREADS=1`
+- `2`
+- `4`
+- `8`
+- 机器允许时 `16`
+
+统计指标：
+
+- `MD_base::update_pos`
+- `MD_base::update_vel`
+- `MD_func::force_virial`
+- `ESolver_NEP::runner` 或 `ESolver_DP::runner`
+- 完整 MD 每步平均耗时
+- 加速比 `S_p = T_1 / T_p`
+- 并行效率 `E_p = S_p / p`
+
+ABACUS 已有 `ModuleBase::timer`，第一版可直接利用已有 timer 输出；如果粒度不够，再给 NEP/DPMD 接口层局部循环增加临时 timer，最终提交前移除过细的临时输出。
+
+---
+
+## 九、正确性风险与处理
+
+| 风险 | 影响 | 处理 |
+| --- | --- | --- |
+| 浮点归约顺序变化 | 能量、温度、stress 末位差异 | 测试用容差；报告说明非 bitwise 差异来源 |
+| 小体系线程开销 | 性能变慢 | `if (nat >= 256)` 阈值；benchmark 后调整 |
+| 外部库也使用 OpenMP | 线程过度订阅 | benchmark 记录外部库线程设置；必要时文档建议控制库内线程 |
+| MPI + OpenMP 混合 | rank 0 线程计算后广播 | 不改变广播位置；运行 MPI 单测 |
+| 随机数线程安全 | 数据竞争、随机序列改变 | 本阶段不并行随机数循环 |
+| 可变胞更新顺序 | NPT/MSST 结果漂移 | 只并行原子独立循环，不并行晶胞更新函数 |
+
+---
+
+## 十、预期收益
+
+P0 的积分器和统计优化属于“每步必经”的低风险收益。对于 ML 势主导的大体系，总耗时加速可能不如势函数核心明显，但可以减少 ABACUS 接口层和后处理的串行尾部，提升多线程下的整体伸缩。
+
+P1 的 NEP/DPMD 接口层优化更贴合机器学习分子动力学目标。NEP 中能量、力、virial 后处理均与原子数线性相关，且每步调用一次；大体系下预期比单纯积分器循环更有可见收益。
+
+P2 的 NHC/MSST/LJ 优化用于扩展覆盖面和 benchmark 支撑。建议在 P0/P1 稳定后再做，避免一次性改动过多导致数值误差难定位。
+
+---
+
+## 十一、最终交付物建议
+
+1. 代码改动 PR：
+   - `opt/md-openmp-base`
+   - `opt/md-ml-interface-openmp`
+   - `opt/md-openmp-extended`
+2. 测试脚本：
+   - 多线程单测脚本；
+   - 短 MD 输出一致性检查脚本；
+   - benchmark 数据收集脚本。
+3. 报告数据：
+   - 每个优化点的改动说明；
+   - 1/2/4/8 线程耗时表；
+   - 加速比和并行效率；
+   - 正确性误差表；
+   - 暂缓随机数和 CUDA 的理由。
+
+本方案的主线应先完成 P0 和 NEP/DPMD P1。若时间有限，至少交付 `MD_base`、`MD_func`、`Verlet/FIRE`、`ESolver_NEP` 四块；这些点覆盖 MD 主循环、统计归约和机器学习势接口层，且最符合“容易实现、提效显著、OpenMP 优先”的要求。
diff --git a/Planners/0531/openmp_nep_basic_changes.md b/Planners/0531/openmp_nep_basic_changes.md
deleted file mode 100644
index 81885adce20..00000000000
--- a/Planners/0531/openmp_nep_basic_changes.md
+++ /dev/null
@@ -1,54 +0,0 @@
-# OpenMP 基本部分与 NEP 优化修改说明
-
-日期：2026-05-31
-
-本次根据 `Planners/0531/md_parallel_optimization_plan.md`，只实现了收益较明确、数据依赖简单的 OpenMP 优化点；暂未处理随机数相关循环、NHC/MSST 扩展项、DPMD 和 LJ。
-
-## 修改范围
-
-### 基本 MD 循环
-
-- `source/source_md/md_base.cpp`
-  - `MD_base::update_pos()`：对 rank 0 上的逐原子位置增量计算加入 `#pragma omp parallel for schedule(static)`。
-  - `MD_base::update_vel()`：对 rank 0 上的逐原子速度半步更新加入 OpenMP 并行。
-  - MPI 广播和 `unitcell::update_pos_taud()` 顺序保持不变。
-
-- `source/source_md/md_func.cpp`
-  - `kinetic_energy()`：使用 OpenMP `reduction(+:ke)` 并行动能求和。
-  - `force_virial()`：并行 `force_temp` 到 `force` 的逐原子回填。
-  - `temp_vector()`：用 9 个标量 reduction 并行温度张量累加，并显式写回 3x3 矩阵，避免依赖 `matrix::create()` 是否清零。
-
-### NEP 接口层
-
-- `source/source_esolver/esolver_nep.h`
-  - 新增 `atom_type_index` 和 `atom_local_index` 缓存，用于把全局原子序号映射到 `UnitCell` 的元素类型和类型内局部序号。
-
-- `source/source_esolver/esolver_nep.cpp`
-  - `before_all_runners()`：初始化全局原子到 `UnitCell` 存储位置的索引缓存。
-  - `runner()`：
-    - 并行 NEP 坐标缓冲区填充；
-    - 使用 OpenMP reduction 并行每原子能量求和；
-    - 并行 NEP 力回填和单位转换；
-    - 使用线程局部 9 分量数组并行 per-atom virial 求和。
-
-## 并行策略
-
-- 所有新增循环使用 `schedule(static)`。
-- 对短循环使用 `if (nat >= 256)` 或同类阈值，降低小体系线程启动开销。
-- 不改变 MD 时间步顺序、MPI 广播位置、NEP 外部库调用边界。
-- 浮点归约可能带来末位差异，后续测试应使用容差比较。
-
-## 未执行项
-
-- 按要求未运行测试。
-- 未并行 `rand_vel()`、Anderson thermostat、Langevin 随机力等随机数相关循环。
-- 未改 DPMD、NHC/MSST 扩展优化、LJ benchmark 优化。
-
-## 回溯方式
-
-本次修改已按 git commit 分开记录：
-
-- 代码优化提交：`optimize: add OpenMP to MD base loops and NEP interface`
-- 文档提交：`docs: record OpenMP NEP and MD base changes`
-
-如需回滚，可对对应提交执行 `git revert <commit>`。
diff --git a/Planners/06_md.md b/Planners/06_md.md
new file mode 100644
index 00000000000..c1d410a3b83
--- /dev/null
+++ b/Planners/06_md.md
@@ -0,0 +1,960 @@
+# 	机器学习分子动力学的并行优化
+
+## 大作业说明
+
+---
+
+## 一、背景介绍
+
+### 0.1 分子动力学基础
+
+#### 0.1.1 什么是分子动力学（MD）？
+
+**分子动力学**是一种基于经典力学的计算机模拟方法，用于研究原子和分子的运动。
+
+**核心思想**：
+
+- 通过数值求解牛顿运动方程，模拟原子和分子的运动轨迹
+- 计算体系的能量、温度、压力等宏观性质
+- 研究材料的结构、动力学和热力学性质
+
+**ABACUS中的MD功能**：
+
+- 支持多种积分器（Verlet、Velocity-Verlet等）
+- 支持多种温度和压力控制方法
+- 支持周期性边界条件
+- 可以与第一性原理计算结合，实现AIMD（从头算分子动力学）
+
+#### 0.1.2 机器学习势函数
+
+传统的分子动力学模拟中，原子间相互作用通常使用经验势函数描述。随着机器学习的发展，基于数据驱动的机器学习势函数逐渐成为研究热点。
+
+**常用的机器学习势函数**：
+
+1. **DPMD（Deep Potential Molecular Dynamics）**
+   - 基于深度学习的势函数模型
+   - 由深度神经网络表示原子间相互作用
+   - 可以达到接近第一性原理的精度
+
+2. **NEP（Neuroevolution Potential，神经演化势）**
+   - 由 Fan Zheyong 团队于 2021 年提出
+   - 专为大规模、高效分子动力学模拟设计
+   - 核心优势：逼近 DFT 精度 + 极快计算速度 + GPU 原生加速
+
+**机器学习势函数的优势**：
+
+- 精度接近第一性原理计算
+- 计算速度比第一性原理计算快几个数量级
+- 可以处理复杂的原子间相互作用
+
+#### 0.1.3 分子动力学中的并行计算
+
+分子动力学模拟的计算量与原子数N成正比，对于大规模系统（N > 10^4），串行计算往往无法在合理时间内完成。因此，并行计算成为分子动力学模拟的关键技术。
+
+**并行计算的层次**：
+
+1. **MPI（Message Passing Interface）**：进程级并行，适用于分布式内存系统
+2. **OpenMP（Open Multi-Processing）**：线程级并行，适用于共享内存系统
+3. **CUDA**：GPU加速，适用于大规模并行计算
+
+**ABACUS中的并行策略**：
+
+- 原子并行：将原子分配到不同进程/线程
+- 时间步并行：使用预测-校正方法并行计算不同时间步
+- 势函数计算并行：并行计算原子间相互作用
+
+---
+
+### 1.1 问题由来
+
+在ABACUS的分子动力学模块中，随着体系规模的增大，计算效率成为制约模拟规模和时间的主要因素。具体表现为：
+
+1. **计算瓶颈**：
+   - 势函数计算（特别是机器学习势函数）成为主要计算瓶颈
+   - 随着原子数增加，计算时间呈非线性增长
+   - 传统的串行计算无法处理大规模系统
+
+2. **并行效率问题**：
+   - 现有并行实现对机器学习势函数的支持不足
+   - 不同并行层次（MPI/OpenMP/CUDA）的协同优化不够
+   - 数据通信开销较大，特别是在GPU加速时
+
+3. **扩展性问题**：
+   - 随着进程/线程数增加，并行效率下降
+   - 内存使用效率低，限制了可模拟的体系规模
+
+4. **框架设计问题**：
+   - 现有代码结构对机器学习势函数的集成不够灵活
+   - 缺乏统一的并行接口和抽象
+   - 单元测试覆盖不足
+
+### 1.2 现有代码结构
+
+#### 1.2.1 分子动力学核心模块
+
+ABACUS的分子动力学功能主要由以下模块组成：
+
+```
+source/source_md/
+├── md.cpp                # 分子动力学主入口
+├── md.h                  # 分子动力学类定义
+├── integrator/           # 积分器实现
+│   ├── verlet.cpp        # Verlet积分器
+│   ├── velocity_verlet.cpp # Velocity-Verlet积分器
+│   └── ...
+├── thermostat/           # 温度控制器
+│   ├── nose_hoover.cpp   # Nose-Hoover热浴
+│   ├── langevin.cpp      # Langevin热浴
+│   └── ...
+├── barostat/             # 压力控制器
+│   ├── berendsen.cpp     # Berendsen压浴
+│   ├── andersen.cpp      # Andersen压浴
+│   └── ...
+└── potential/            # 势函数实现
+    ├── classical/        # 经典势函数
+    ├── ml/               # 机器学习势函数
+    │   ├── dpmd.cpp      # DPMD势函数
+    │   ├── nep.cpp       # NEP势函数
+    │   └── ...
+    └── ...
+```
+
+#### 1.2.2 分子动力学主类
+
+```cpp
+// source/source_md/md.h
+
+class MD
+{
+public:
+    // 积分器
+    Integrator* integrator = nullptr;
+    // 温度控制器
+    Thermostat* thermostat = nullptr;
+    // 压力控制器
+    Barostat* barostat = nullptr;
+    // 势函数
+    Potential* potential = nullptr;
+
+    // 模拟参数
+    int nstep = 0;          // 总步数
+    double dt = 0.0;        // 时间步长
+    double temperature = 0.0; // 目标温度
+    double pressure = 0.0;   // 目标压力
+
+    // 原子数据
+    int natom = 0;          // 原子数
+    double** pos = nullptr;  // 原子位置
+    double** vel = nullptr;  // 原子速度
+    double** force = nullptr; // 原子受力
+    double* mass = nullptr;  // 原子质量
+
+    // 并行相关
+    int nproc = 1;          // 进程数
+    int myrank = 0;         // 当前进程rank
+    MPI_Comm comm = MPI_COMM_WORLD;
+};
+```
+
+#### 1.2.3 势函数接口
+
+```cpp
+// source/source_md/potential/potential.h
+
+class Potential
+{
+public:
+    virtual ~Potential() = default;
+
+    // 计算能量和力
+    virtual double calculate_energy_and_force(
+        const int natom,
+        const double** pos,
+        double** force) = 0;
+
+    // 初始化势函数
+    virtual void initialize() = 0;
+
+    // 并行设置
+    virtual void set_parallel(int nproc, int myrank, MPI_Comm comm) = 0;
+};
+
+// 机器学习势函数基类
+class MLPotential : public Potential
+{
+public:
+    // 加载模型
+    virtual void load_model(const std::string& model_path) = 0;
+
+    // 设置并行计算参数
+    virtual void set_parallel_params(int nthread, bool use_gpu) = 0;
+};
+```
+
+#### 1.2.4 现有并行实现
+
+目前ABACUS的分子动力学模块的并行实现主要基于MPI：
+
+```cpp
+// source/source_md/md.cpp:247-278
+void MD::run()
+{
+    // 原子分配
+    int natom_local = natom / nproc;
+    int natom_remainder = natom % nproc;
+    int start_atom = myrank * natom_local + std::min(myrank, natom_remainder);
+    int end_atom = start_atom + natom_local + (myrank < natom_remainder ? 1 : 0);
+
+    // 时间步循环
+    for (int istep = 0; istep < nstep; ++istep)
+    {
+        // 计算力
+        double energy = potential->calculate_energy_and_force(
+            natom, pos, force);
+
+        // 积分器更新
+        integrator->update(natom, pos, vel, force, mass, dt);
+
+        // 温度控制
+        thermostat->control_temperature(natom, vel, mass, temperature, dt);
+
+        // 压力控制
+        barostat->control_pressure(natom, pos, vel, force, mass, pressure, dt);
+
+        // 输出
+        if (myrank == 0 && istep % output_freq == 0)
+        {
+            // 输出结果
+        }
+    }
+}
+```
+
+**现有并行实现的问题**：
+
+1. **原子分配简单**：采用简单的均匀分配，没有考虑原子间相互作用的局部性
+2. **通信开销大**：每次计算力时需要全局通信
+3. **并行层次单一**：主要依赖MPI，没有充分利用OpenMP和GPU
+4. **机器学习势函数支持不足**：现有并行实现对DPMD和NEP等机器学习势函数的优化不够
+
+### 1.3 机器学习势函数的并行挑战
+
+#### 1.3.1 DPMD势函数
+
+DPMD（Deep Potential Molecular Dynamics）是一种基于深度学习的势函数，其并行计算面临以下挑战：
+
+1. **计算特性**：
+   - 神经网络前向传播计算量较大
+   - 需要计算原子环境特征（如原子密度）
+   - 计算复杂度与原子数和环境半径相关
+
+2. **并行挑战**：
+   - 原子环境计算的局部性
+   - 神经网络计算的并行化
+   - GPU加速的内存管理
+
+#### 1.3.2 NEP势函数
+
+NEP（Neural Equilibrium Potential）是一种专为凝聚态物理系统设计的神经网络势函数，其并行计算面临以下挑战：
+
+1. **计算特性**：
+   - 基于原子密度的表示
+   - 支持多元素系统
+   - 计算效率高，适合大规模系统
+
+2. **并行挑战**：
+   - 密度计算的并行化
+   - 多元素系统的处理
+   - 内存访问模式的优化
+
+---
+
+## 二、建议可以做的事情（共 8 题）
+
+### 题目 1：原子并行的负载均衡优化
+
+**难度**：⭐⭐
+
+#### 题目描述
+
+优化ABACUS分子动力学模块中的原子并行策略，实现负载均衡的原子分配。
+
+#### 现有代码位置
+
+- `source/source_md/md.cpp:247-278` - 主循环
+- `source/source_md/md.cpp:143-186` - 原子分配
+
+#### 具体要求
+
+1. **负载分析**
+   - 分析不同原子的计算负载
+   - 考虑原子环境的复杂度
+   - 评估现有分配策略的负载均衡性
+
+2. **负载均衡实现**
+   - 实现基于负载的动态原子分配
+   - 考虑空间局部性，减少通信开销
+   - 支持自适应调整
+
+3. **性能测试**
+   - 测试不同体系规模下的负载均衡效果
+   - 对比优化前后的并行效率
+   - 分析加速比
+
+4. **代码质量**
+   - 遵循项目代码规范
+   - 添加必要的注释
+   - 确保与现有代码兼容
+
+5. **单元测试要求**
+   - 编写单元测试验证负载均衡的正确性
+   - 测试不同分配策略的效果
+   - 验证原子分配的边界情况
+
+6. **代码重构（加分项）**
+   - 将原子分配策略抽象为独立的接口
+   - 实现多种分配策略（均匀分配、负载均衡分配等）
+   - 使用策略模式支持不同分配算法
+
+---
+
+### 题目 2：OpenMP 多线程加速势函数计算
+
+**难度**：⭐⭐
+
+#### 题目描述
+
+优化势函数计算的OpenMP并行实现，提高计算效率。
+
+#### 现有代码位置
+
+- `source/source_md/potential/ml/dpmd.cpp` - DPMD势函数
+- `source/source_md/potential/ml/nep.cpp` - NEP势函数
+
+#### 具体要求
+
+1. **并行化分析**
+   - 分析势函数计算中的并行机会
+   - 识别计算热点
+   - 评估数据依赖关系
+
+2. **OpenMP实现**
+   - 使用`#pragma omp parallel for`实现并行计算
+   - 处理线程私有变量和归约操作
+   - 优化内存访问模式
+
+3. **性能测试**
+   - 测试不同线程数下的加速比
+   - 分析并行效率
+   - 对比优化前后的性能
+
+4. **正确性验证**
+   - 确保并行计算结果与串行一致
+   - 处理边界情况
+
+5. **单元测试要求**
+   - 编写单元测试验证OpenMP并行的正确性
+   - 测试不同线程数下的结果一致性
+   - 验证线程安全
+
+6. **代码重构（加分项）**
+   - 将并行计算逻辑抽象为独立的并行计算管理器
+   - 实现线程池管理
+   - 支持动态线程数调整
+
+---
+
+### 题目 3：CUDA 加速机器学习势函数
+
+**难度**：⭐⭐⭐
+
+#### 题目描述
+
+实现机器学习势函数（DPMD和NEP）的GPU加速，利用CUDA提高计算效率。
+
+#### 现有代码位置
+
+- `source/source_md/potential/ml/dpmd.cpp` - DPMD势函数
+- `source/source_md/potential/ml/nep.cpp` - NEP势函数
+
+#### 具体要求
+
+1. **GPU加速分析**
+   - 分析势函数计算的GPU加速可行性
+   - 识别适合GPU加速的计算部分
+   - 评估内存传输开销
+
+2. **CUDA实现**
+   - 实现GPU版本的势函数计算
+   - 优化内存访问模式
+   - 使用CUDA流实现计算与数据传输重叠
+
+3. **性能测试**
+   - 对比CPU和GPU版本的性能
+   - 分析不同体系规模下的加速比
+   - 评估内存传输开销
+
+4. **兼容性**
+   - 保持与现有代码的接口兼容
+   - 支持CPU/GPU自动切换
+
+5. **单元测试要求**
+   - 编写单元测试验证GPU计算的正确性
+   - 对比CPU和GPU版本的结果一致性
+   - 测试不同GPU设备的兼容性
+
+6. **代码重构（加分项）**
+   - 将计算设备抽象为独立的接口
+   - 实现设备选择策略
+   - 支持多GPU并行
+
+---
+
+### 题目 4：MPI-OpenMP-CUDA 混合并行架构
+
+**难度**：⭐⭐⭐⭐
+
+#### 题目描述
+
+设计并实现MPI-OpenMP-CUDA混合并行架构，充分利用不同层次的并行性。
+
+#### 题目背景
+
+单一层次的并行往往无法充分利用现代计算硬件的性能。混合并行架构可以同时利用分布式内存（MPI）、共享内存（OpenMP）和GPU加速，实现更高的并行效率。
+
+#### 具体要求
+
+1. **架构设计**
+   - 设计三层并行架构：MPI进程级、OpenMP线程级、CUDA GPU级
+   - 定义各层次的职责和通信方式
+   - 优化数据分配和通信策略
+
+2. **实现方案**
+   - 修改MD类支持混合并行
+   - 实现不同层次的并行初始化和管理
+   - 优化数据传输和同步
+
+3. **性能测试**
+   - 测试不同并行配置下的性能
+   - 分析各层次并行的贡献
+   - 寻找最优并行配置
+
+4. **兼容性**
+   - 保持与现有代码的接口兼容
+   - 支持不同并行层次的组合
+
+5. **单元测试要求**
+   - 编写单元测试验证混合并行的正确性
+   - 测试不同并行配置的结果一致性
+   - 验证边界情况
+
+6. **代码重构（加分项）**
+   - 将并行架构抽象为独立的并行管理器
+   - 实现并行策略自动选择
+   - 支持运行时并行配置调整
+
+---
+
+### 题目 5：DPMD 势函数的并行优化
+
+**难度**：⭐⭐⭐
+
+#### 题目描述
+
+针对DPMD（Deep Potential Molecular Dynamics）势函数，实现高效的并行计算。
+
+#### 现有代码位置
+
+- `source/source_md/potential/ml/dpmd.cpp` - DPMD势函数实现
+
+#### 具体要求
+
+1. **DPMD计算分析**
+   - 分析DPMD势函数的计算流程
+   - 识别计算热点
+   - 评估并行化潜力
+
+2. **并行实现**
+   - 优化原子环境计算的并行性
+   - 实现神经网络前向传播的并行计算
+   - 优化内存访问模式
+
+3. **性能测试**
+   - 对比优化前后的性能
+   - 测试不同体系规模下的加速比
+   - 分析并行效率
+
+4. **正确性验证**
+   - 确保并行计算结果与串行一致
+   - 处理边界情况
+
+5. **单元测试要求**
+   - 编写单元测试验证DPMD并行计算的正确性
+   - 测试不同并行配置的结果一致性
+   - 验证模型加载和计算的正确性
+
+6. **代码重构（加分项）**
+   - 将DPMD计算抽象为独立的计算单元
+   - 实现批处理计算提高GPU利用率
+   - 支持模型参数的自动优化
+
+---
+
+### 题目 6：NEP 势函数的并行优化
+
+**难度**：⭐⭐⭐
+
+#### 题目描述
+
+针对NEP（Neural Equilibrium Potential）势函数，实现高效的并行计算。
+
+#### 现有代码位置
+
+- `source/source_md/potential/ml/nep.cpp` - NEP势函数实现
+
+#### 具体要求
+
+1. **NEP计算分析**
+   - 分析NEP势函数的计算流程
+   - 识别计算热点
+   - 评估并行化潜力
+
+2. **并行实现**
+   - 优化原子密度计算的并行性
+   - 实现多元素系统的并行处理
+   - 优化内存访问模式
+
+3. **性能测试**
+   - 对比优化前后的性能
+   - 测试不同体系规模下的加速比
+   - 分析并行效率
+
+4. **正确性验证**
+   - 确保并行计算结果与串行一致
+   - 处理边界情况
+
+5. **单元测试要求**
+   - 编写单元测试验证NEP并行计算的正确性
+   - 测试不同并行配置的结果一致性
+   - 验证多元素系统的计算正确性
+
+6. **代码重构（加分项）**
+   - 将NEP计算抽象为独立的计算单元
+   - 实现密度计算的GPU加速
+   - 支持自适应密度截断半径
+
+---
+
+### 题目 7：分子动力学积分器的并行优化
+
+**难度**：⭐⭐
+
+#### 题目描述
+
+优化分子动力学积分器的并行实现，提高计算效率。
+
+#### 现有代码位置
+
+- `source/source_md/integrator/verlet.cpp` - Verlet积分器
+- `source/source_md/integrator/velocity_verlet.cpp` - Velocity-Verlet积分器
+
+#### 具体要求
+
+1. **积分器分析**
+   - 分析积分器的计算流程
+   - 识别并行机会
+   - 评估数据依赖关系
+
+2. **并行实现**
+   - 实现积分器的OpenMP并行
+   - 优化内存访问模式
+   - 处理边界情况
+
+3. **性能测试**
+   - 对比优化前后的性能
+   - 测试不同积分器的并行效率
+   - 分析加速比
+
+4. **正确性验证**
+   - 确保并行计算结果与串行一致
+   - 验证能量守恒
+
+5. **单元测试要求**
+   - 编写单元测试验证积分器并行计算的正确性
+   - 测试不同时间步长的结果一致性
+   - 验证能量守恒和动量守恒
+
+6. **代码重构（加分项）**
+   - 将积分器抽象为独立的接口
+   - 实现多种积分器的并行版本
+   - 支持自适应时间步长
+
+---
+
+### 题目 8：分子动力学单元测试框架
+
+**难度**：⭐⭐
+
+#### 题目描述
+
+设计并实现分子动力学模块的单元测试框架，确保代码质量和功能正确性。
+
+#### 题目背景
+
+现有的分子动力学模块缺乏全面的单元测试，这使得代码修改和优化存在风险。建立一个完善的单元测试框架对于保证代码质量至关重要。
+
+#### 具体要求
+
+1. **测试框架设计**
+   - 设计适合分子动力学模块的单元测试框架
+   - 定义测试用例和测试方法
+   - 实现测试结果的自动验证
+
+2. **测试用例实现**
+   - 编写势函数计算的测试用例
+   - 编写积分器的测试用例
+   - 编写并行计算的测试用例
+
+3. **测试覆盖**
+   - 确保关键功能的测试覆盖
+   - 测试边界情况和异常处理
+   - 验证不同并行配置的正确性
+
+4. **性能测试**
+   - 实现性能基准测试
+   - 监控优化效果
+   - 提供性能分析工具
+
+5. **集成与自动化**
+   - 集成到CI/CD流程
+   - 实现测试的自动化运行
+   - 提供测试报告生成
+
+6. **代码重构（加分项）**
+   - 将测试框架抽象为独立的模块
+   - 实现测试数据的自动生成
+   - 支持测试结果的可视化
+
+---
+
+## 三、开放性题目（重点）
+
+### 开放题目 1：机器学习分子动力学的统一并行框架 ⭐⭐⭐⭐
+
+**建议分值**：20-40 分
+
+#### 题目描述
+
+设计并实现一个统一的并行框架，支持不同类型的机器学习势函数（DPMD、NEP等）的高效并行计算。
+
+#### 设计目标
+
+1. **统一接口**
+   - 为不同机器学习势函数提供统一的并行接口
+   - 支持动态切换不同势函数
+   - 提供一致的并行配置方式
+
+2. **多级并行支持**
+   - 支持MPI、OpenMP、CUDA混合并行
+   - 自动选择最优并行策略
+   - 支持异构计算环境
+
+3. **性能优化**
+   - 实现计算与通信的重叠
+   - 优化内存访问模式
+   - 支持批处理计算
+
+4. **可扩展性**
+   - 易于添加新的机器学习势函数
+   - 支持未来硬件架构
+   - 提供插件机制
+
+#### 参考架构
+
+```
+MLMDParallelFramework
+├── ParallelManager (抽象基类)
+│   ├── MPIManager          // MPI并行管理
+│   ├── OpenMPManager       // OpenMP并行管理
+│   ├── CUDAManager         // CUDA并行管理
+│   └── HybridManager       // 混合并行管理
+│
+├── PotentialEvaluator (抽象基类)
+│   ├── DPMDEvaluator       // DPMD势函数评估
+│   ├── NEPEvaluator        // NEP势函数评估
+│   └── ...
+│
+├── DataManager            // 数据管理
+├── PerformanceMonitor     // 性能监控
+└── AutoTuner              // 自动调优
+```
+
+#### 重点关注
+
+| 完成度   | 分值                                | 要求 |
+| -------- | ----------------------------------- | ---- |
+| 基础框架 | 实现统一接口和至少 2 种并行管理器   |      |
+| 完整实现 | 实现 4 种以上并行策略，包含自动选择 |      |
+| GPU 支持 | 实现 GPU 加速的数据处理             |      |
+| 性能达标 | 实测性能提升 3 倍以上               |      |
+| 优秀实现 | 性能提升 5 倍以上，代码质量优秀     |      |
+
+---
+
+### 开放题目 2：分子动力学与第一性原理计算的无缝集成 ⭐⭐⭐⭐
+
+#### 题目描述
+
+设计并实现分子动力学与第一性原理计算的无缝集成，支持混合模拟（如AIMD与机器学习势函数的结合）。
+
+#### 研究问题
+
+1. **混合模拟策略**
+   - 设计AIMD与机器学习势函数的混合模拟方案
+   - 实现不同势函数之间的平滑切换
+   - 优化计算资源分配
+
+2. **并行优化**
+   - 实现混合模拟的并行计算
+   - 优化第一性原理计算与分子动力学的协同
+   - 减少不同计算模式切换的开销
+
+3. **验证与测试**
+   - 验证混合模拟的正确性
+   - 测试不同混合策略的性能
+   - 分析精度与效率的权衡
+
+#### 参考研究方向
+
+```cpp
+// 混合模拟示例
+class HybridMD {
+private:
+    // 第一性原理计算引擎
+    FirstPrinciplesEngine* fp_engine;
+    // 机器学习势函数
+    MLPotential* ml_potential;
+    // 切换策略
+    SwitchingStrategy* switch_strategy;
+
+public:
+    // 运行混合模拟
+    void run() {
+        for (int istep = 0; istep < nstep; ++istep) {
+            // 根据切换策略选择计算引擎
+            if (switch_strategy->should_use_fp(istep)) {
+                // 使用第一性原理计算
+                fp_engine->calculate_energy_and_force(...);
+                // 更新机器学习势函数模型
+                ml_potential->update_model(...);
+            } else {
+                // 使用机器学习势函数
+                ml_potential->calculate_energy_and_force(...);
+            }
+            // 积分器更新
+            integrator->update(...);
+        }
+    }
+};
+```
+
+---
+
+## 四、测试环境与基准数据
+
+### 4.1 推荐测试体系
+
+| 体系   | 原子数 | 类型     | 推荐测试规模 |
+| ------ | ------ | -------- | ------------ |
+| 水     | 64     | 分子     | 初级测试     |
+| 硅     | 128    | 晶体     | 基准测试     |
+| 铝     | 256    | 金属     | 性能测试     |
+| 蛋白质 | 512    | 生物分子 | 大规模测试   |
+
+### 4.2 性能基准
+
+| 优化项         | 当前时间 | 目标时间 | 最低加速比 |
+| -------------- | -------- | -------- | ---------- |
+| 原子并行       | T₁       | T₁/4     | 4x (4进程) |
+| OpenMP 并行    | T₂       | T₂/4     | 4x (4线程) |
+| CUDA 加速      | T₃       | T₃/10    | 10x        |
+| 混合并行       | T₄       | T₄/20    | 20x        |
+| 机器学习势函数 | T₅       | T₅/5     | 5x         |
+
+### 4.3 测试脚本参考
+
+```bash
+#!/bin/bash
+# benchmark_md.sh - 分子动力学性能测试
+
+export OMP_NUM_THREADS=8
+export MKL_NUM_THREADS=8
+
+for nproc in 1 2 4 8 16; do
+    for nthread in 1 2 4 8; do
+        echo "Testing: nproc=$nproc, nthread=$nthread"
+        export OMP_NUM_THREADS=$nthread
+        mpirun -np $nproc ./abacus INPUT > log_p${nproc}_t${nthread}.out 2>&1
+        grep "MD step" log_p${nproc}_t${nthread}.out | tail -1
+    done
+done
+
+# GPU测试
+if [ -n "$CUDA_VISIBLE_DEVICES" ]; then
+    echo "Testing with GPU"
+    mpirun -np 1 ./abacus INPUT_gpu > log_gpu.out 2>&1
+    grep "MD step" log_gpu.out | tail -1
+fi
+```
+
+---
+
+## 五、代码规范与提交流程
+
+### 5.1 代码规范
+
+1. **命名规范**
+   - 遵循项目现有的命名风格
+   - 新增函数需添加文档注释
+
+2. **模块化设计**
+   - 独立功能封装为独立函数/类
+   - 便于单元测试
+
+3. **错误处理**
+   - 检查所有 MPI 调用返回值
+   - 妥善处理异常情况
+
+4. **并行代码规范**
+   - 明确并行区域和同步点
+   - 避免死锁和竞争条件
+   - 注释并行策略和通信模式
+
+### 5.2 提交流程
+
+#### 5.2.1 推荐方式：GitHub Pull Request ⭐
+
+为了更好地模拟真实软件开发流程，我们**强烈推荐**使用 GitHub 进行代码提交和协作。具体方式如下：
+
+1. **Fork 仓库**
+
+   - Fork ABACUS deepmodeling仓库到你自己的 GitHub 账户
+   - 地址：`https://github.com/deepmodeling/abacus-develop`
+
+2. **创建分支**
+
+   ```bash
+   git checkout -b feature/ml-md-parallel-optimization
+   ```
+
+3. **少量多次提交**
+
+   ```bash
+   # 每次完成一个小功能就提交
+   git add source/source_md/
+   git commit -m "Add OpenMP parallelization for MD potential calculation"
+   git push origin feature/ml-md-parallel-optimization
+   ```
+
+4. **提交 Pull Request**
+
+   - 在 GitHub 上创建 Pull Request
+   - 描述你做了哪些优化
+   - 请求代码 Review
+
+#### 5.2.2 提交策略
+
+| 原则         | 说明                                           |
+| ------------ | ---------------------------------------------- |
+| **少量多次** | 每完成一个小功能就提交，不要等到最后一次性提交 |
+| **问题导向** | 每个 PR 解决一个具体问题                       |
+| **文档完善** | PR 描述中说明解决了什么瓶颈、预期性能提升      |
+| **可验证**   | 提交时附带测试结果或性能数据                   |
+
+#### 5.2.3 代码接受标准
+
+**你的代码被官方仓库接受将获得额外加分**：
+
+| 🌟 代码被 merged | PR 被接受并合并到主分支 |
+| 🌟 代码可运行 | 通过基本编译和测试 |
+
+#### 5.2.4 评分原则
+
+> **核心原则：以实际解决问题的质量和数量作为评价标准**
+
+- 代码不被接受也可以获得分数，取决于工作量和完成质量
+- 重点关注：是否真正解决了实际问题、是否有创新性、代码是否健壮
+- 不以"是否被接受"作为唯一标准
+
+---
+
+### 5.3 报告格式要求
+
+```latex
+\documentclass[12pt,a4paper]{article}
+
+\title{机器学习分子动力学并行优化}
+\author{姓名}
+\date{\today}
+
+\begin{document}
+\maketitle
+
+\section{引言}
+% 描述问题背景和优化目标
+
+\section{现有代码分析}
+% 分析当前实现的瓶颈
+
+\section{优化方案}
+% 描述实现的优化方法
+
+\section{性能测试}
+% 包含测试结果和图表
+
+\section{结论}
+% 总结优化效果和心得
+
+\end{document}
+```
+
+---
+
+## 六、参考资料
+
+### 6.1 代码位置索引
+
+| 文件        | 路径                                     | 说明               |
+| ----------- | ---------------------------------------- | ------------------ |
+| MD 主类     | `source/source_md/md.h`                  | 分子动力学主类定义 |
+| MD 实现     | `source/source_md/md.cpp`                | 分子动力学实现     |
+| 势函数接口  | `source/source_md/potential/potential.h` | 势函数基类         |
+| DPMD 势函数 | `source/source_md/potential/ml/dpmd.cpp` | DPMD势函数实现     |
+| NEP 势函数  | `source/source_md/potential/ml/nep.cpp`  | NEP势函数实现      |
+| 积分器      | `source/source_md/integrator/`           | 积分器实现         |
+
+### 6.2 推荐阅读
+
+1. **分子动力学**：《Molecular Dynamics: Theory and Practice》- D. C. Rapaport
+2. **并行计算**：《Parallel Programming with MPI》- P. S. Pacheco
+3. **机器学习势函数**：
+   - DPMD: "Deep Potential Molecular Dynamics: a scalable model with the accuracy of quantum mechanics" - Zhang et al.
+   - NEP: "Neural Equilibrium Propagation for Molecular Dynamics" - Wang et al.
+4. **CUDA编程**：《Professional CUDA C Programming》- J. Cheng et al.
+
+---
+
+## 七、致谢
+
+本大作业题目设计参考了以下资源：
+
+1. ABACUS 软件源代码 (https://github.com/abacusmodeling/abacus-develop)
+2. DeepMD-kit 项目 (https://github.com/deepmodeling/deepmd-kit)
+3. NEP 势函数实现 (https://github.com/brucefan1983/NEP)
+4. 《Computational Molecular Dynamics: Challenges, Methods, Ideas》- D. Frenkel
+
+---
+
+**最后更新**：2026-04-21
+
+**版本**：v1.0
\ No newline at end of file
diff --git a/Planners/Project_plan.md b/Planners/Project_plan.md
new file mode 100644
index 00000000000..28da08d2170
--- /dev/null
+++ b/Planners/Project_plan.md
@@ -0,0 +1,388 @@
+# 机器学习分子动力学模块并行优化项目计划书
+
+## 一、项目背景与拟选方向
+
+本项目围绕 ABACUS 分子动力学模块的并行优化展开。分子动力学通过数值求解牛顿运动方程，根据原子位置计算势能和受力，再通过积分器更新原子的位置与速度。当前仓库中 MD 主循环位于 `source/source_md/run_md.cpp`，基础积分算子位于 `source/source_md/md_base.cpp`，并由 `Verlet`、`Nose_Hoover`、`Langevin`、`MSST` 和 `FIRE` 等类实现不同动力学流程；DPMD、NEP 等机器学习势函数通过 `source/source_esolver` 中的 ESolver 接口接入。课程说明中指出，机器学习势函数能够在接近第一性原理精度的同时显著提升计算效率，但势函数接口开销、并行效率、扩展性和测试覆盖仍是需要优化的问题。
+
+本项目拟按以下顺序推进：
+
+1. 完成分子动力学积分器的 OpenMP 并行优化；
+2. 建立与积分器优化配套的正确性测试和性能基准测试；
+3. 完成机器学习势函数 ABACUS 接口层的 OpenMP 并行优化；
+4. 在 DPMD 或 NEP 中选择一个方向进行专项深入优化；
+5. 有余力时尝试机器学习势函数局部计算的 CUDA 加速。
+
+整体路线为：先完成积分器优化与测试闭环，再进入势函数 OpenMP 优化，随后选择 DPMD 或 NEP 深入分析，最后尝试 CUDA 局部加速。
+
+------
+
+## 二、项目目标
+
+本项目目标是完成一条较完整的科学计算软件优化流程：阅读 ABACUS 分子动力学模块代码，理解 MD 主循环、积分器、势函数接口和机器学习势函数接入方式；在此基础上，对积分器原子循环、ABACUS 侧势函数数据转换、力和 Virial 后处理等部分进行并行优化；最后通过测试脚本验证计算正确性，并用 benchmark 数据评价优化效果。
+
+| 目标                   | 主要内容                                                     |
+| ---------------------- | ------------------------------------------------------------ |
+| 积分器并行优化         | 对 `MD_base`/`Verlet` 中按原子更新位置、速度及相关统计量的循环进行 OpenMP 并行 |
+| 测试与性能框架         | 建立正确性测试、线程一致性测试和 benchmark 脚本              |
+| 势函数 OpenMP 优化     | 分析 ABACUS 侧 DPMD 或 NEP 接口中的坐标转换、单位转换和结果后处理循环，完成局部 OpenMP 加速 |
+| 机器学习势函数专项优化 | 选择 DPMD 或 NEP 之一，深入分析 ABACUS 接口层与外部库调用边界上的并行瓶颈 |
+| CUDA 扩展尝试          | 对可独立抽取的局部转换或后处理计算尝试 GPU 加速，并与 CPU/OpenMP 版本对比 |
+
+------
+
+## 三、技术路线
+
+分子动力学的核心流程可以概括为：
+
+当前位置
+→ 计算势能与受力
+→ 积分器更新位置和速度
+→ 温度/压力控制
+→ 输出结果
+→ 进入下一时间步。
+
+其中，原子受力满足：
+
+$$
+\mathbf{F}_i = -\nabla_{\mathbf{r}_i}U.
+$$
+
+以当前 `Verlet` 类采用的 Velocity-Verlet 半步形式为例，速度和位置更新可写为：
+
+$$
+\mathbf{v}_i(t+\Delta t/2)
+=
+\mathbf{v}_i(t)
++
+\frac{\mathbf{F}_i(t)}{2m_i}\Delta t,
+$$
+
+$$
+\mathbf{r}_i(t+\Delta t)
+=
+\mathbf{r}_i(t)
++
+\mathbf{v}_i(t+\Delta t/2)\Delta t .
+$$
+
+这类计算在代码中通常表现为按原子编号循环。对于不同原子之间相互独立或数据依赖较弱的循环，可以采用 OpenMP 进行线程级并行。
+
+本项目主要使用以下并行与测试方法：
+
+| 技术方法              | 用途                                   |
+| --------------------- | -------------------------------------- |
+| OpenMP `parallel for` | 并行化积分器和势函数中的原子循环       |
+| `private` 或局部变量  | 处理线程私有临时变量                   |
+| `reduction`           | 处理总能量、动能、温度等全局累加量     |
+| `OMP_NUM_THREADS`     | 控制不同线程数下的运行实验             |
+| 运行时间统计          | 计算加速比和并行效率                   |
+| CUDA kernel           | 有余力时尝试加速机器学习势函数局部计算 |
+
+性能评价指标为：
+$$
+S_p = \frac{T_1}{T_p},
+$$
+
+$$
+E_p = \frac{S_p}{p},
+$$
+
+其中 $T_1$ 表示单线程或原始版本运行时间，$T_p$ 表示 $p$ 线程运行时间，$S_p$ 为加速比，$E_p$ 为并行效率。
+
+------
+
+## 四、代码阅读与修改范围
+
+本项目重点关注当前仓库中与 MD 主循环、积分器、ESolver 势函数接口、DPMD/NEP 接入、晶胞更新和输入参数相关的代码。代码阅读与修改范围以下列当前实际存在的文件为准。
+
+| 类别               | 实际相关文件                                                 | 文件信息与阅读/修改重点                                      |
+| ------------------ | ------------------------------------------------------------ | ------------------------------------------------------------ |
+| MD 主流程          | `source/source_md/run_md.cpp`、`source/source_md/run_md.h`   | `Run_MD::md_line()` 根据 `md_type` 选择 `FIRE`、`Nose_Hoover`、`Verlet`、`Langevin`、`MSST`，并组织 `setup()`、`first_half()`、`MD_func::force_virial()`、`second_half()`、`compute_stress()`、输出与 restart。用于理解完整时间步依赖，不作为主要并行修改点。 |
+| MD 公共基类        | `source/source_md/md_base.cpp`、`source/source_md/md_base.h` | 保存质量、速度、位置增量、力、应力、Virial 等公共状态；`update_pos()` 和 `update_vel()` 是积分器共用的原子循环，目前采用主进程计算后广播，是积分器 OpenMP 优化的首要修改对象。 |
+| Verlet 积分器      | `source/source_md/verlet.cpp`、`source/source_md/verlet.h`   | 实现 NVE/NVT Velocity-Verlet 流程；`first_half()` 调用半步速度和位置更新，`second_half()` 调用半步速度和简单温控器。重点阅读温控器 `thermalize()`、`apply_thermostat()` 中的原子循环与随机数逻辑。 |
+| Nose-Hoover 积分器 | `source/source_md/nhchain.cpp`、`source/source_md/nhchain.h` | 实现 NVT/NPT Nose-Hoover Chain、barostat、`update_baro()`、`vel_baro()`、`update_volume()` 等流程。作为可变胞和压力控制依赖阅读对象，避免积分器优化破坏速度、应力和晶胞更新顺序。 |
+| Langevin 积分器    | `source/source_md/langevin.cpp`、`source/source_md/langevin.h` | 在真实力基础上构造阻尼力和随机力 `total_force`；`post_force()` 是按原子循环，但涉及随机数和 MPI 广播，作为并行化风险较高的候选对象。 |
+| MSST 积分器        | `source/source_md/msst.cpp`、`source/source_md/msst.h`       | 实现冲击波约束动力学，包含 `propagate_vel()`、`propagate_voldot()`、`rescale()` 和晶胞重建。主要用于理解可变胞场景下 `update_pos()` 优化的边界条件。 |
+| FIRE 弛豫算法      | `source/source_md/fire.cpp`、`source/source_md/fire.h`       | 基于 `v·F` 调整速度、时间步长和收敛状态；`check_force()`、`check_fire()` 含原子循环与全局归约，是可选并行优化对象。 |
+| MD 工具函数        | `source/source_md/md_func.cpp`、`source/source_md/md_func.h` | 提供初始速度、动能、温度、应力、`force_virial()`、输出等工具函数；其中 `kinetic_energy()`、`current_temp()`、`temp_vector()`、`dump_info()`、`get_mass_mbl()` 等循环需要作为测试与后续归约优化参考。 |
+| MD 单元测试        | `source/source_md/test/CMakeLists.txt`、`source/source_md/test/verlet_test.cpp`、`source/source_md/test/nhchain_test.cpp`、`source/source_md/test/langevin_test.cpp`、`source/source_md/test/msst_test.cpp`、`source/source_md/test/fire_test.cpp`、`source/source_md/test/md_func_test.cpp`、`source/source_md/test/lj_pot_test.cpp`、`source/source_md/test/setcell.h` | 当前仓库已有 MD 相关单元测试，可作为修改后的正确性验证入口，并补充线程一致性或性能测试脚本。 |
+| ESolver 总入口     | `source/source_esolver/esolver.cpp`、`source/source_esolver/esolver.h` | `determine_type()` 与 `init_esolver()` 根据 `esolver_type` 创建 `ESolver_DP`、`ESolver_NEP`、`ESolver_LJ` 等对象，是 MD 与势函数实现之间的接口入口。 |
+| NEP 接口           | `source/source_esolver/esolver_nep.cpp`、`source/source_esolver/esolver_nep.h` | ABACUS 侧负责 type map、坐标/晶格数组转换、调用 `nep.compute()`、能量/力/Virial 单位转换和后处理；核心 NEP 推理在外部 NEP 库中，ABACUS 侧主要优化坐标转换、力后处理和 Virial 求和。 |
+| DPMD 接口          | `source/source_esolver/esolver_dp.cpp`、`source/source_esolver/esolver_dp.h` | ABACUS 侧负责读取 `dp_rescaling`、`dp_fparam`、`dp_aparam`、type map、坐标/晶格转换、调用 `dp.compute()`、力和 Virial 后处理；核心 DeePMD 推理在外部 DeePMD-kit 中。 |
+| LJ 参考势          | `source/source_esolver/esolver_lj.cpp`、`source/source_esolver/esolver_lj.h` | 当前仓库包含 LJ 势函数和对应 MD 测试，可用于不依赖外部 DPMD/NEP 库的 MD 流程正确性与性能基准。 |
+| 晶胞与位置更新依赖 | `source/source_cell/update_cell.cpp`、`source/source_cell/update_cell.h`、`source/source_cell/unitcell.cpp`、`source/source_cell/unitcell.h`、`source/source_cell/unitcell_data.h` | `MD_base::update_pos()`、NPT/MSST、restart 输出依赖这些函数更新直接坐标、速度和晶胞派生量。并行修改 MD 位置循环时必须保持 `UnitCell` 更新顺序。 |
+| MD 输入参数        | `source/source_io/module_parameter/md_parameter.h`、`source/source_io/module_parameter/input_parameter.h`、`source/source_io/module_parameter/parameter.h`、`source/source_io/module_parameter/read_input_item_md.cpp` | 定义并读取 `md_type`、`md_thermostat`、`md_dt`、`md_nstep`、`pot_file`、`dp_rescaling` 等参数。用于确定测试输入和 DPMD/NEP 编译运行条件。 |
+| 构建配置           | `source/source_md/CMakeLists.txt`、`source/source_esolver/CMakeLists.txt`、`source/source_cell/CMakeLists.txt`、`source/source_io/module_parameter/CMakeLists.txt`、`CMakeLists.txt` | 用于确认新增 OpenMP 代码、测试文件和 DPMD/NEP 编译宏 `__DPMD`、`__NEP` 的构建关系。 |
+| 外部测试脚本       | 自建脚本                                                     | 在已有单元测试基础上，自动设置 `OMP_NUM_THREADS`、运行小体系 MD、比较关键输出并统计耗时。 |
+
+代码阅读重点包括：
+
+1. MD 主循环的执行顺序；
+2. `MD_func::force_virial()` 调用 ESolver 的位置；
+3. `MD_base::update_pos()`、`MD_base::update_vel()` 及各派生积分器的调用顺序；
+4. 按原子编号循环的核心代码；
+5. 每个循环中的读写关系；
+6. 全局累加变量的处理方式；
+7. 线程并行可能涉及的数据竞争；
+8. MPI 广播、随机数、restart 和可变胞更新对并行修改的限制；
+9. 可以用于正确性验证的输出量。
+
+------
+
+## 五、实现内容
+
+### 5.1 积分器 OpenMP 并行优化
+
+积分器部分作为项目的基础实现模块。计划阅读 `MD_base`、`Verlet` 及其它派生积分器代码，找到位置、速度、力后处理和统计量计算中的原子循环，并对适合并行的循环加入 OpenMP。
+
+主要工作包括：
+
+1. 梳理积分器计算流程；
+2. 找到 `update_pos()`、`update_vel()` 和相关统计函数中的核心原子循环；
+3. 分析循环迭代间的数据依赖；
+4. 使用 OpenMP 并行化原子循环；
+5. 对动能、温度、最大力等统计量使用归约处理；
+6. 保持原有函数接口和调用方式；
+7. 记录修改位置和优化逻辑。
+
+对应测试包括：
+
+| 测试内容       | 说明                               |
+| -------------- | ---------------------------------- |
+| 编译测试       | 确认修改后代码可以正常编译         |
+| 运行测试       | 使用相同输入运行原始版本和并行版本 |
+| 结果一致性测试 | 比较位置、速度、能量等关键输出     |
+| 多线程测试     | 测试 1、2、4、8 线程               |
+| 性能测试       | 记录不同线程数下运行时间           |
+| 加速比分析     | 计算加速比和并行效率               |
+
+------
+
+### 5.2 测试与性能基准框架
+
+测试框架贯穿整个项目，用于验证积分器优化、势函数优化和 CUDA 扩展结果。
+
+主要工作包括：
+
+1. 编写自动运行脚本；
+2. 自动设置不同线程数；
+3. 保存不同配置下的输出日志；
+4. 提取运行时间；
+5. 比较串行和并行结果；
+6. 生成性能数据表；
+7. 支持后续势函数和 CUDA 测试复用。
+
+测试脚本重点支持以下配置：
+
+| 配置               | 含义                       |
+| ------------------ | -------------------------- |
+| 单线程运行         | 作为基准结果               |
+| 多线程 OpenMP 运行 | 测试线程级并行效果         |
+| 不同体系规模运行   | 观察原子数变化对性能的影响 |
+| CPU / GPU 对比     | 用于 CUDA 扩展阶段         |
+
+------
+
+### 5.3 机器学习势函数 OpenMP 并行优化
+
+势函数部分是项目的核心优化内容。当前 ABACUS 仓库中的 DPMD 和 NEP 核心推理分别由外部 DeePMD-kit 和 NEP 库完成；ABACUS 侧主要负责 type map、坐标/晶格格式转换、单位转换、力和 Virial 后处理。计划在 DPMD 或 NEP 接口中选择一个主要对象，识别 ABACUS 侧以原子为单位的热点循环，并进行 OpenMP 并行优化。
+
+主要工作包括：
+
+1. 阅读 ESolver 势函数接口；
+2. 选择 DPMD 或 NEP 作为主要分析对象；
+3. 梳理 ABACUS 侧数据准备、外部库调用和结果回填流程；
+4. 识别坐标转换、力后处理、Virial 求和等热点循环；
+5. 分析共享变量和数组写入关系；
+6. 使用 OpenMP 并行化合适的循环；
+7. 对 NEP 能量或 Virial 求和等累加量使用 reduction；
+8. 使用测试框架验证不同线程数下结果一致性；
+9. 记录局部运行时间和完整 MD 运行时间。
+
+测试内容包括：
+
+| 测试内容   | 说明                             |
+| ---------- | -------------------------------- |
+| 能量一致性 | 比较串行与并行版本势能结果       |
+| 力一致性   | 比较不同原子的受力结果           |
+| 线程一致性 | 测试 1、2、4、8 线程输出是否稳定 |
+| 局部耗时   | 统计势函数计算部分耗时           |
+| 整体耗时   | 统计完整 MD 运行时间             |
+| 性能分析   | 计算加速比和并行效率             |
+
+------
+
+### 5.4 DPMD 或 NEP 专项深入优化
+
+在完成势函数 OpenMP 基础优化后，选择 DPMD 或 NEP 之一进行深入优化。若不修改外部库，专项优化范围主要限定在 ABACUS 接口层和调用边界。
+
+若选择 DPMD，重点分析：
+
+1. DeePMD type map 和输入参数传递；
+2. `coord`、`cell`、`atype` 的组织方式；
+3. `dp.compute()` 前后的单位转换和结果回填；
+4. 临时数组与内存访问模式；
+5. 不同体系规模下的性能表现。
+
+若选择 NEP，重点分析：
+
+1. NEP type map 和多元素系统处理；
+2. `coord`、`cell`、`_e`、`_f`、`_v` 的组织方式；
+3. 每原子能量与 Virial 求和；
+4. 数组访问顺序和局部缓存；
+5. 不同体系规模下的性能表现。
+
+GPUMD 的 NEP 文档说明，NEP 使用描述符向量表示原子局部环境，并通过简单前馈神经网络表示单个原子的 site energy；其径向和角向描述符都涉及对邻居原子的求和，这为理解 NEP 中的原子循环和并行机会提供了参考。([GPUMD](https://gpumd.org/potentials/nep.html))
+
+------
+
+### 5.5 CUDA 加速机器学习势函数的扩展尝试
+
+CUDA 部分作为项目扩展内容。计划在已经完成 OpenMP 分析的基础上，选择 ABACUS 接口层中可独立抽取的局部计算环节尝试 GPU 加速；若要加速 DPMD/NEP 核心神经网络或描述符计算，需要进入对应外部库，不作为本项目基本目标。
+
+候选对象包括：
+
+1. 坐标数组格式转换；
+2. 力数组后处理；
+3. NEP Virial 求和；
+4. 单位缩放；
+5. 大量原子上的相同结构循环。
+
+主要工作包括：
+
+1. 分析候选计算是否适合 GPU 并行；
+2. 设计 CPU 与 GPU 数据传输方式；
+3. 编写局部 CUDA kernel；
+4. 对比 CPU/OpenMP 和 CUDA 版本结果；
+5. 测试不同体系规模下的运行时间；
+6. 分析 GPU 加速中的数据传输开销。
+
+CUDA 部分的目标是完成一个局部、可测试、可对比的 GPU 加速尝试，为报告提供机器学习势函数异构并行优化的扩展内容。
+
+------
+
+## 六、测试体系与实验设计
+
+本项目采用由小到大的测试体系，先验证正确性，再进行性能分析。课程说明中建议的测试体系包括水、硅、铝和蛋白质等规模由小到大的体系。
+
+| 测试体系   | 原子数     | 用途                     |
+| ---------- | ---------- | ------------------------ |
+| 小型体系   | 几十个原子 | 快速编译和运行验证       |
+| 水分子体系 | 64         | 初级正确性测试           |
+| 硅晶体体系 | 128        | 基准性能测试             |
+| 铝体系     | 256        | 多线程性能测试           |
+| 更大体系   | 512 以上   | 深入优化与 CUDA 扩展测试 |
+
+实验安排如下：
+
+| 实验       | 内容                                          | 目的                        |
+| ---------- | --------------------------------------------- | --------------------------- |
+| 基准实验   | 运行原始版本                                  | 获得 baseline 时间          |
+| 积分器实验 | 对积分器进行 OpenMP 优化                      | 测试基础并行效果            |
+| 积分器测试 | 比较串行和并行结果                            | 验证位置、速度、能量一致性  |
+| 势函数实验 | 对 DPMD 或 NEP 接口层热点循环进行 OpenMP 优化 | 测试接口层优化效果          |
+| 势函数测试 | 比较能量和力                                  | 验证势函数并行正确性        |
+| 专项实验   | 深入分析 DPMD 或 NEP                          | 展示机器学习势函数优化效果  |
+| CUDA 实验  | 对局部计算尝试 GPU 加速                       | 比较 CPU/OpenMP 与 GPU 性能 |
+
+性能结果记录表如下：
+
+| 模块          | 体系     | 原子数 | 线程/设备 | 运行时间 / s | 加速比 | 并行效率 |
+| ------------- | -------- | ------ | --------- | ------------ | ------ | -------- |
+| 积分器        | 水       | 64     | 1 线程    | 待测         | 1.00   | 1.00     |
+| 积分器        | 水       | 64     | 4 线程    | 待测         | 待测   | 待测     |
+| 势函数        | 硅       | 128    | 1 线程    | 待测         | 1.00   | 1.00     |
+| 势函数        | 硅       | 128    | 4 线程    | 待测         | 待测   | 待测     |
+| DPMD / NEP    | 铝       | 256    | 8 线程    | 待测         | 待测   | 待测     |
+| CUDA 局部计算 | 更大体系 | 512+   | GPU       | 待测         | 待测   | 待测     |
+
+------
+
+## 七、成员分工
+
+本项目采用“共同推进基础流程 + 模块主责分工”的方式。
+
+| 成员             | 主责方向             | 主要任务                                                     |
+| ---------------- | -------------------- | ------------------------------------------------------------ |
+| 成员 A（陈怀瑜） | 积分器并行优化       | 阅读积分器代码，完成 OpenMP 并行修改，记录优化逻辑和局部性能结果 |
+| 成员 B（胡婧瑶） | 机器学习势函数优化   | 阅读 DPMD 或 NEP 接口代码，完成热点分析、OpenMP 优化和专项深入优化 |
+| 成员 C（李嘉宁） | 测试框架与 CUDA 扩展 | 编写测试脚本和 benchmark 工具，整理性能数据；有余力时负责 CUDA 局部加速尝试 |
+
+协作安排如下：
+
+1. 三人共同完成 ABACUS 编译、baseline 运行和 MD 主流程阅读；
+2. 成员 A 优先推进积分器并行，成员 C 同步完成积分器测试脚本；
+3. 成员 B 同步阅读势函数代码，随后推进势函数 OpenMP 优化；
+4. 成员 C 的测试框架服务于积分器、势函数和 CUDA 三个阶段；
+5. 成员 B 与成员 C 共同确定 CUDA 候选计算环节；
+6. 所有代码修改通过 Git 分支管理，合并前完成编译和基本运行测试；
+7. 最终报告由三人共同整理，成员 C 汇总性能数据，成员 A 和 B 分别完成对应模块说明。
+
+------
+
+## 八、进度安排
+
+结合课程大作业从计划书、算法文档、测试总结、代码修改、优化报告到最终报告的整体节奏，本项目按六个阶段推进。
+
+| 阶段                                 | 时间范围               | 主要任务                                                     | 阶段成果                             |
+| ------------------------------------ | ---------------------- | ------------------------------------------------------------ | ------------------------------------ |
+| 阶段一：项目调研与环境准备           | 第 10 周前后           | 明确选题方向，阅读课程说明，配置代码环境，编译并运行 ABACUS 最小 MD 示例 | 项目计划书，baseline 运行记录        |
+| 阶段二：代码结构与算法流程梳理       | 第 11 至 13 周前半     | 阅读 `run_md.cpp`、`md_base.cpp`、各积分器和 ESolver 势函数接口，画出 MD 主流程，确定积分器与势函数优化位置 | 算法流程文档，代码入口说明           |
+| 阶段三：积分器 OpenMP 优化与测试     | 第 13 周后半至第 14 周 | 完成 `MD_base`/`Verlet` 相关原子循环的 OpenMP 并行，建立积分器正确性测试和性能测试 | 积分器并行代码，测试数据，初步加速比 |
+| 阶段四：势函数 OpenMP 优化与专项分析 | 第 14 至 15 周         | 选择 DPMD 或 NEP，完成热点循环分析和局部 OpenMP 优化，整理能量/力一致性测试 | 势函数并行代码，热点分析，性能对比   |
+| 阶段五：深入优化与 CUDA 扩展         | 第 15 至 16 周         | 深入 DPMD 或 NEP 的 ABACUS 接口流程，优化内存访问或循环结构；尝试局部 CUDA kernel | 专项优化报告，CUDA 尝试结果          |
+| 阶段六：结果汇总与最终报告           | 期末前                 | 汇总代码修改、测试结果、性能图表和项目结论，完成最终报告和展示材料 | 最终报告，代码说明，完整测试结果     |
+
+------
+
+## 九、预期成果
+
+本项目预期提交以下成果：
+
+1. 项目计划书；
+2. 算法流程与代码结构说明文档；
+3. 积分器 OpenMP 并行优化代码；
+4. 积分器正确性测试和性能测试结果；
+5. 机器学习势函数 OpenMP 并行优化代码；
+6. 势函数能量、力一致性测试结果；
+7. DPMD 或 NEP 专项深入优化分析；
+8. *CUDA 局部加速尝试代码和性能对比；
+9. benchmark 脚本和运行说明；
+10. 运行时间、加速比、并行效率表格；
+11. 代码修改与重构说明；
+12. 最终优化效果和总结报告。
+
+------
+
+## 十、风险控制与协作方式
+
+| 风险                  | 控制方式                                                     |
+| --------------------- | ------------------------------------------------------------ |
+| 代码规模较大          | 以 `run_md.cpp`、`md_base.cpp`、积分器和 ESolver 势函数接口为阅读主线 |
+| OpenMP 数据竞争       | 重点检查共享变量、全局累加量和数组写入                       |
+| 浮点结果存在微小差异  | 使用误差阈值比较能量、力、位置和速度                         |
+| 整体加速比受限        | 同时统计局部模块耗时和完整 MD 耗时                           |
+| CUDA 数据传输开销较大 | 选择计算密集且循环结构清晰的局部计算作为加速对象             |
+| 三人代码合并冲突      | 使用 GitHub 分支管理，少量多次提交，合并前编译测试           |
+| 测试结果难复现        | 统一输入文件、线程数配置、日志格式和数据记录方式             |
+
+------
+
+## 十一、参考文献与参考资料
+
+1. 课程项目说明文件：`06_md.md`。其中包含项目背景、ABACUS 分子动力学模块结构、建议优化方向、测试体系和提交要求。
+2. ABACUS 官方代码仓库：
+   https://github.com/deepmodeling/abacus-develop
+   用于阅读分子动力学模块、积分器模块、势函数接口和机器学习势函数相关代码。课程说明中也将该仓库列为参考资源。
+3. ABACUS Pull Request #6603：
+   https://github.com/deepmodeling/abacus-develop/pull/6603
+   该 PR 主题为 “Add NEP as esolver”，可作为了解 NEP 与 ABACUS 接口关系的参考。([GitHub](https://github.com/deepmodeling/abacus-develop/pull/6603))
+4. GPUMD NEP 文档：
+   https://gpumd.org/potentials/nep.html
+   该文档介绍了 NEP 的神经网络模型、描述符、模型维度和优化过程，可用于理解 NEP 中原子局部环境描述符与并行计算结构。([GPUMD](https://gpumd.org/potentials/nep.html))
+5. OpenMP 官方文档：
+   https://www.openmp.org/
+   用于查询 `parallel for`、`private`、`reduction`、线程数控制等 OpenMP 语法和并行编程规范。
+6. CUDA 官方文档：
+   https://docs.nvidia.com/cuda/
+   用于后续 CUDA kernel、线程组织、内存传输和 GPU 性能分析的学习与实现参考。
\ No newline at end of file
diff --git a/Results/0530_md_pre_parallel_refactor.md b/Results/0530_md_pre_parallel_refactor.md
index 08562dcae6e..47cd0d77381 100644
--- a/Results/0530_md_pre_parallel_refactor.md
+++ b/Results/0530_md_pre_parallel_refactor.md
@@ -1,8 +1,8 @@
-# ABACUS MD 并行优化前置重构记录
+# ABACUS MD 代码修改与重构报告
 
 ## 目标边界
 
-本轮重构只调整代码结构，为后续正式 OpenMP/CUDA 并行优化降低改动风险；不引入并行代码，不改变 MD 数值算法、输入输出格式和执行流程。
+本轮重构只调整代码结构，为后续正式 OpenMP/CUDA 并行优化降低改动风险，不引入并行代码。
 
 ## 重构点清单
 
@@ -22,14 +22,6 @@
 - DP/NEP 的 `before_all_runners()`、`runner()`、`cal_energy()`、`cal_force()`、`cal_stress()` 接口不变。
 - MD 测试中的数值断言未改写。
 
-## 未做事项
-
-- 未添加 `#pragma omp`。
-- 未调整 MPI 广播、MD 时间步顺序、温控/压控算法。
-- 未修改 DPMD/NEP 外部库调用语义。
-- 未加入 CUDA kernel。
-- 未把规划文件 `Planners/` 纳入提交。
-
 ## 验证记录
 
 生产代码构建：
diff --git a/Results/0531_openmp_nep_basic_changes.md b/Results/0531_openmp_nep_basic_changes.md
new file mode 100644
index 00000000000..6cc865fb1db
--- /dev/null
+++ b/Results/0531_openmp_nep_basic_changes.md
@@ -0,0 +1,209 @@
+# OpenMP 基本部分与 NEP 优化修改说明
+
+日期：2026-05-31
+
+本次根据 `Planners/0531/md_parallel_optimization_plan.md`，只实现了收益较明确、数据依赖简单的 OpenMP 优化点；暂未处理随机数相关循环、NHC/MSST 扩展项、DPMD 和 LJ。
+
+## 修改范围
+
+### 基本 MD 循环
+
+- `source/source_md/md_base.cpp`
+  - `MD_base::update_pos()`：对 rank 0 上的逐原子位置增量计算加入 `#pragma omp parallel for schedule(static)`。
+  - `MD_base::update_vel()`：对 rank 0 上的逐原子速度半步更新加入 OpenMP 并行。
+  - MPI 广播和 `unitcell::update_pos_taud()` 顺序保持不变。
+
+- `source/source_md/md_func.cpp`
+  - `kinetic_energy()`：使用 OpenMP `reduction(+:ke)` 并行动能求和。
+  - `force_virial()`：并行 `force_temp` 到 `force` 的逐原子回填。
+  - `temp_vector()`：用 9 个标量 reduction 并行温度张量累加，并显式写回 3x3 矩阵，避免依赖 `matrix::create()` 是否清零。
+
+### NEP 接口层
+
+- `source/source_esolver/esolver_nep.h`
+  - 新增 `atom_type_index` 和 `atom_local_index` 缓存，用于把全局原子序号映射到 `UnitCell` 的元素类型和类型内局部序号。
+
+- `source/source_esolver/esolver_nep.cpp`
+  - `before_all_runners()`：初始化全局原子到 `UnitCell` 存储位置的索引缓存。
+  - `runner()`：
+    - 并行 NEP 坐标缓冲区填充；
+    - 使用 OpenMP reduction 并行每原子能量求和；
+    - 并行 NEP 力回填和单位转换；
+    - 使用线程局部 9 分量数组并行 per-atom virial 求和。
+
+## 并行策略
+
+- 所有新增循环使用 `schedule(static)`。
+- 对短循环使用 `if (nat >= 256)` 或同类阈值，降低小体系线程启动开销。
+- 不改变 MD 时间步顺序、MPI 广播位置、NEP 外部库调用边界。
+- 浮点归约可能带来末位差异，后续测试应使用容差比较。
+
+## 未执行项
+
+- 本文件初版按当时要求未运行测试；2026-06-01 已补充独立 microbenchmark，见下方“性能测试记录”。
+- 未并行 `rand_vel()`、Anderson thermostat、Langevin 随机力等随机数相关循环。
+- 未改 DPMD、NHC/MSST 扩展优化、LJ benchmark 优化。
+
+## 性能测试记录
+
+### 测试文件与运行方式
+
+测试文件已放在代码文档专用测试目录：
+
+- `Test/openmp_nep_basic_benchmark.cpp`
+- `Test/run_openmp_nep_basic_benchmark.sh`
+- `Test/results/openmp_nep_basic_benchmark.csv`
+- `Test/results/openmp_nep_basic_benchmark.log`
+
+本次测试使用独立 C++ OpenMP microbenchmark，而不是完整 ABACUS 端到端 MD 输入。原因是 NEP 端到端测试依赖外部 NEP 模型文件和编译宏；本 benchmark 直接复现本次改动中真正发生变化的逐原子循环和 reduction 结构，可以隔离测量 MD 基础循环与 NEP 接口层后处理的 OpenMP 收益。
+
+编译与运行命令：
+
+```bash
+cd /root/abacus-md-refactor
+./Test/run_openmp_nep_basic_benchmark.sh
+```
+
+脚本内部编译命令：
+
+```bash
+g++ -O3 -std=c++17 -fopenmp Test/openmp_nep_basic_benchmark.cpp -o Test/build/openmp_nep_basic_benchmark
+```
+
+测试参数：
+
+| 参数 | 值 |
+| --- | --- |
+| 编译器 | g++ 11.3.0 |
+| 原子数 | 2,000,000 |
+| 每项重复次数 | 5 |
+| 线程数 | 1 / 2 / 4 / 8 |
+| OpenMP 绑定 | `OMP_PROC_BIND=close`, `OMP_PLACES=cores` |
+
+每个 kernel 同时运行串行版本和 OpenMP 版本，表中 `serial_ms` 与 `omp_ms` 均为单次调用平均耗时。`speedup = serial_ms / omp_ms`，`efficiency = speedup / threads`。
+
+### 测试结果
+
+| kernel | threads | serial_ms | omp_ms | speedup | efficiency | max_abs_diff |
+| --- | ---: | ---: | ---: | ---: | ---: | ---: |
+| md_update_pos | 1 | 13.801657 | 13.688899 | 1.008237 | 1.008237 | 0 |
+| md_update_pos | 2 | 13.631519 | 6.941512 | 1.963768 | 0.981884 | 0 |
+| md_update_pos | 4 | 13.600572 | 3.442708 | 3.950545 | 0.987636 | 0 |
+| md_update_pos | 8 | 13.564951 | 1.819345 | 7.455954 | 0.931994 | 0 |
+| md_update_vel | 1 | 15.260198 | 15.150155 | 1.007263 | 1.007263 | 0 |
+| md_update_vel | 2 | 15.062960 | 7.496135 | 2.009430 | 1.004715 | 0 |
+| md_update_vel | 4 | 14.971725 | 3.858649 | 3.880043 | 0.970011 | 0 |
+| md_update_vel | 8 | 14.953506 | 2.099304 | 7.123076 | 0.890385 | 0 |
+| md_kinetic_energy | 1 | 7.883401 | 7.899684 | 0.997939 | 0.997939 | 0 |
+| md_kinetic_energy | 2 | 7.545974 | 3.789572 | 1.991247 | 0.995623 | 2.036415e-10 |
+| md_kinetic_energy | 4 | 7.450567 | 2.004642 | 3.716657 | 0.929164 | 2.242899e-10 |
+| md_kinetic_energy | 8 | 7.387472 | 1.038517 | 7.113485 | 0.889186 | 1.775646e-10 |
+| md_temp_vector | 1 | 9.095514 | 8.946827 | 1.016619 | 1.016619 | 0 |
+| md_temp_vector | 2 | 9.008466 | 4.422689 | 2.036875 | 1.018438 | 1.600000e-10 |
+| md_temp_vector | 4 | 8.810445 | 2.291496 | 3.844844 | 0.961211 | 1.489155e-10 |
+| md_temp_vector | 8 | 8.711893 | 1.178607 | 7.391688 | 0.923961 | 1.680718e-10 |
+| md_force_copy | 1 | 10.244239 | 10.152378 | 1.009048 | 1.009048 | 0 |
+| md_force_copy | 2 | 10.280891 | 5.134739 | 2.002223 | 1.001111 | 0 |
+| md_force_copy | 4 | 10.098843 | 2.623814 | 3.848918 | 0.962229 | 0 |
+| md_force_copy | 8 | 10.094556 | 1.422335 | 7.097174 | 0.887147 | 0 |
+| nep_coord_fill | 1 | 11.474562 | 11.355256 | 1.010507 | 1.010507 | 0 |
+| nep_coord_fill | 2 | 11.422869 | 5.694671 | 2.005887 | 1.002944 | 0 |
+| nep_coord_fill | 4 | 11.268058 | 2.930989 | 3.844456 | 0.961114 | 0 |
+| nep_coord_fill | 8 | 11.123313 | 1.572314 | 7.074487 | 0.884311 | 0 |
+| nep_energy_sum | 1 | 3.048584 | 3.015369 | 1.011015 | 1.011015 | 0 |
+| nep_energy_sum | 2 | 3.080593 | 1.524430 | 2.020816 | 1.010408 | 8.811185e-09 |
+| nep_energy_sum | 4 | 3.075540 | 0.761431 | 4.039160 | 1.009790 | 2.277375e-08 |
+| nep_energy_sum | 8 | 3.103870 | 0.386627 | 8.028073 | 1.003509 | 1.861918e-08 |
+| nep_force_fill | 1 | 9.623283 | 9.488546 | 1.014200 | 1.014200 | 0 |
+| nep_force_fill | 2 | 9.581044 | 4.787944 | 2.001077 | 1.000538 | 0 |
+| nep_force_fill | 4 | 9.346130 | 2.425673 | 3.853005 | 0.963251 | 0 |
+| nep_force_fill | 8 | 9.329194 | 1.263027 | 7.386377 | 0.923297 | 0 |
+| nep_virial_sum | 1 | 29.281022 | 13.105685 | 2.234223 | 2.234223 | 0 |
+| nep_virial_sum | 2 | 29.324353 | 6.786994 | 4.320669 | 2.160334 | 9.997166e-09 |
+| nep_virial_sum | 4 | 29.311881 | 3.565785 | 8.220315 | 2.055079 | 6.324626e-09 |
+| nep_virial_sum | 8 | 29.241326 | 2.000102 | 14.619914 | 1.827489 | 8.529241e-09 |
+
+### 8 线程优化程度汇总
+
+| 优化点 | 对应源码位置 | 8 线程加速比 | 优化程度判断 |
+| --- | --- | ---: | --- |
+| 位置更新 `update_pos()` | `source/source_md/md_base.cpp` | 7.46x | 接近线性扩展 |
+| 速度更新 `update_vel()` | `source/source_md/md_base.cpp` | 7.12x | 接近线性扩展 |
+| 动能归约 `kinetic_energy()` | `source/source_md/md_func.cpp` | 7.11x | 接近线性扩展 |
+| 温度张量 `temp_vector()` | `source/source_md/md_func.cpp` | 7.39x | 接近线性扩展 |
+| 力回填 `force_virial()` | `source/source_md/md_func.cpp` | 7.10x | 接近线性扩展 |
+| NEP 坐标填充 | `source/source_esolver/esolver_nep.cpp` | 7.07x | 接近线性扩展 |
+| NEP 能量求和 | `source/source_esolver/esolver_nep.cpp` | 8.03x | 接近线性，轻微超线性来自计时波动和 reduction 分块缓存收益 |
+| NEP 力回填 | `source/source_esolver/esolver_nep.cpp` | 7.39x | 接近线性扩展 |
+| NEP virial 求和 | `source/source_esolver/esolver_nep.cpp` | 14.62x | 改进后表现很好，超线性主要来自算法写法减少循环开销和整数除法 |
+
+### 分析
+
+本次 OpenMP 优化收益最明确的是逐原子写入类循环，包括 `update_pos()`、`update_vel()`、`force_virial()` 力回填、NEP 坐标填充和 NEP 力回填。这些循环的迭代之间没有写冲突，每个线程处理独立原子编号，使用 `schedule(static)` 后负载均匀、同步开销低。因此在 8 线程下普遍达到约 7x 到 8x 的加速，适合作为优先合入的低风险优化点。
+
+动能和温度张量归约也表现较好。`kinetic_energy()` 使用单个 `reduction(+:ke)`，8 线程加速约 7.11x；`temp_vector()` 使用 9 个标量 reduction，8 线程加速约 7.39x。二者均属于低风险、高收益的 reduction 优化。归约结果与串行存在 `1e-10` 量级差异，来源是并行归约改变了浮点求和顺序。
+
+NEP 接口层的坐标填充、能量求和、力回填均有明显收益。坐标和力回填是典型内存带宽型循环，在 8 线程下分别达到约 7.07x 和 7.39x；能量求和是简单 reduction，8 线程约 8.03x。它们不改变 `nep.compute()` 外部库调用边界，只减少 ABACUS 侧数据准备和后处理的串行开销，适合大原子数 ML-MD。
+
+NEP virial 求和已从最初的线程局部数组加 `critical` 合并，改为 9 个显式标量 reduction。新写法避免了 `critical` 同步，也避免了循环内 `index / nat` 整数除法；同时把 9 个 virial 分量放在同一个按原子编号的循环中处理，减少循环控制开销。复测后 8 线程加速约 14.62x，1 线程 OpenMP 版本也快于原串行写法。这里的“超线性”不能简单理解为线程扩展超过物理上限，而是因为 OpenMP 版本的循环组织本身比原串行双层循环更高效。
+
+正确性方面，所有逐原子写入类 kernel 的 `max_abs_diff` 为 0；归约类 kernel 的误差在 `1e-8` 以内，属于浮点并行归约改变求和顺序导致的末位差异。该结果符合本次方案中“使用容差比较，不追求 bitwise 相同”的预期。
+
+## 正式构建与单测验证
+
+2026-06-02 针对 PR 准备补充了正式 ABACUS 构建和 MD 单测。microbenchmark 只用于隔离评估循环性能，不能替代以下正式验证。
+
+### 生产构建
+
+使用已有 `build-prod` 配置构建完整串行 PW 目标：
+
+```bash
+cmake --build build-prod -j2
+```
+
+结果：构建通过，最终目标 `abacus_pw_ser` 成功生成。构建过程中 `esolver` object library 也完成构建，并生成：
+
+```text
+build-prod/source/source_esolver/CMakeFiles/esolver.dir/esolver_nep.cpp.o
+```
+
+这说明当前 `source/source_esolver/esolver_nep.cpp` 在 ABACUS 正式构建配置中可编译通过。当前机器未安装 NEP 外部库，未找到 `nep.h` 或 `libnep`，因此本次无法验证定义 `__NEP` 后与真实 NEP 库的链接配置。
+
+### MD 单元测试
+
+先构建 MD 相关测试目标：
+
+```bash
+cmake --build build-mpi-test --target MODULE_MD_LJ_pot MODULE_MD_func MODULE_MD_fire MODULE_MD_verlet MODULE_MD_nhc MODULE_MD_msst MODULE_MD_lgv -j2
+```
+
+结果：7 个 MD 测试目标全部构建通过。
+
+随后运行 MD 测试集合：
+
+```bash
+ctest --test-dir build-mpi-test -R 'MODULE_MD' --output-on-failure
+```
+
+结果：
+
+```text
+100% tests passed, 0 tests failed out of 7
+
+MODULE_MD_LJ_pot  Passed
+MODULE_MD_func    Passed
+MODULE_MD_fire    Passed
+MODULE_MD_verlet  Passed
+MODULE_MD_nhc     Passed
+MODULE_MD_msst    Passed
+MODULE_MD_lgv     Passed
+```
+
+## 回溯方式
+
+本次修改已按 git commit 分开记录：
+
+- 代码优化提交：`optimize: add OpenMP to MD base loops and NEP interface`
+- 文档提交：`docs: record OpenMP NEP and MD base changes`
+
+如需回滚，可对对应提交执行 `git revert <commit>`。
diff --git a/Test/openmp_nep_basic_benchmark.cpp b/Test/openmp_nep_basic_benchmark.cpp
new file mode 100644
index 00000000000..16c2ff4e72b
--- /dev/null
+++ b/Test/openmp_nep_basic_benchmark.cpp
@@ -0,0 +1,611 @@
+#include <algorithm>
+#include <array>
+#include <chrono>
+#include <cmath>
+#include <cstdlib>
+#include <iomanip>
+#include <iostream>
+#include <numeric>
+#include <string>
+#include <vector>
+
+#ifdef _OPENMP
+#include <omp.h>
+#endif
+
+namespace
+{
+
+struct Vec3
+{
+    double x;
+    double y;
+    double z;
+};
+
+struct Matrix3
+{
+    double m[3][3];
+};
+
+struct BenchmarkData
+{
+    int natom = 2000000;
+    int ntype = 4;
+    double md_dt = 0.5;
+    double lat0 = 10.0;
+    double fact_f = 0.0433641153087705;
+    Matrix3 gt{};
+
+    std::vector<Vec3> pos;
+    std::vector<Vec3> vel;
+    std::vector<Vec3> base_vel;
+    std::vector<Vec3> force;
+    std::vector<double> mass;
+    std::vector<std::array<int, 3>> ionmbl;
+    std::vector<double> force_matrix;
+
+    std::vector<int> atom_type_index;
+    std::vector<int> atom_local_index;
+    std::vector<int> type_offset;
+    std::vector<double> tau_x;
+    std::vector<double> tau_y;
+    std::vector<double> tau_z;
+    std::vector<double> nep_coord;
+    std::vector<double> nep_e;
+    std::vector<double> nep_f;
+    std::vector<double> nep_v;
+    std::vector<double> nep_force;
+};
+
+struct Measurement
+{
+    std::string kernel;
+    int threads = 1;
+    double serial_ms = 0.0;
+    double omp_ms = 0.0;
+    double max_abs_diff = 0.0;
+    double checksum = 0.0;
+};
+
+using Clock = std::chrono::steady_clock;
+volatile double global_sink = 0.0;
+
+double elapsed_ms(const Clock::time_point& start, const Clock::time_point& end)
+{
+    return std::chrono::duration<double, std::milli>(end - start).count();
+}
+
+int parse_int_arg(int argc, char** argv, const std::string& name, int fallback)
+{
+    for (int i = 1; i + 1 < argc; ++i)
+    {
+        if (argv[i] == name)
+        {
+            return std::atoi(argv[i + 1]);
+        }
+    }
+    return fallback;
+}
+
+double value_for_index(const int i, const double scale)
+{
+    return scale * (1.0 + static_cast<double>((i * 17) % 97) / 97.0);
+}
+
+BenchmarkData make_data(const int natom)
+{
+    BenchmarkData data;
+    data.natom = natom;
+    data.gt = {{{1.02, 0.01, 0.03}, {0.02, 0.98, 0.04}, {0.01, 0.02, 1.01}}};
+
+    data.pos.resize(natom);
+    data.vel.resize(natom);
+    data.base_vel.resize(natom);
+    data.force.resize(natom);
+    data.mass.resize(natom);
+    data.ionmbl.resize(natom);
+    data.force_matrix.resize(static_cast<std::size_t>(natom) * 3);
+    data.atom_type_index.resize(natom);
+    data.atom_local_index.resize(natom);
+    data.nep_coord.resize(static_cast<std::size_t>(natom) * 3);
+    data.nep_e.resize(natom);
+    data.nep_f.resize(static_cast<std::size_t>(natom) * 3);
+    data.nep_v.resize(static_cast<std::size_t>(natom) * 9);
+    data.nep_force.resize(static_cast<std::size_t>(natom) * 3);
+
+    data.type_offset.assign(data.ntype + 1, 0);
+    for (int it = 0; it < data.ntype; ++it)
+    {
+        data.type_offset[it + 1] = data.type_offset[it] + natom / data.ntype + (it < natom % data.ntype ? 1 : 0);
+    }
+    data.tau_x.resize(natom);
+    data.tau_y.resize(natom);
+    data.tau_z.resize(natom);
+
+    for (int i = 0; i < natom; ++i)
+    {
+        data.base_vel[i] = {value_for_index(i, 0.001), value_for_index(i + 11, 0.0012), value_for_index(i + 23, 0.0009)};
+        data.vel[i] = data.base_vel[i];
+        data.force[i] = {value_for_index(i + 3, 0.004), value_for_index(i + 5, 0.003), value_for_index(i + 7, 0.005)};
+        data.mass[i] = 10.0 + static_cast<double>(i % 13);
+        data.ionmbl[i] = {1, (i % 11) == 0 ? 0 : 1, (i % 17) == 0 ? 0 : 1};
+        data.force_matrix[3 * static_cast<std::size_t>(i)] = data.force[i].x;
+        data.force_matrix[3 * static_cast<std::size_t>(i) + 1] = data.force[i].y;
+        data.force_matrix[3 * static_cast<std::size_t>(i) + 2] = data.force[i].z;
+        data.nep_e[i] = value_for_index(i + 13, 0.02);
+        data.nep_f[i] = value_for_index(i + 29, 0.03);
+        data.nep_f[static_cast<std::size_t>(i) + natom] = value_for_index(i + 31, 0.02);
+        data.nep_f[static_cast<std::size_t>(i) + 2 * static_cast<std::size_t>(natom)] = value_for_index(i + 37, 0.04);
+    }
+
+    for (int it = 0; it < data.ntype; ++it)
+    {
+        for (int local = 0; local < data.type_offset[it + 1] - data.type_offset[it]; ++local)
+        {
+            const int global = data.type_offset[it] + local;
+            data.atom_type_index[global] = it;
+            data.atom_local_index[global] = local;
+            data.tau_x[global] = value_for_index(global + 41, 0.2);
+            data.tau_y[global] = value_for_index(global + 43, 0.3);
+            data.tau_z[global] = value_for_index(global + 47, 0.4);
+        }
+    }
+
+    for (int j = 0; j < 9; ++j)
+    {
+        for (int i = 0; i < natom; ++i)
+        {
+            data.nep_v[static_cast<std::size_t>(j) * natom + i] = value_for_index(i + j * 19, 0.0003 * (j + 1));
+        }
+    }
+
+    return data;
+}
+
+template <typename F>
+double time_repeated(const int repeat, F&& f)
+{
+    const auto start = Clock::now();
+    for (int r = 0; r < repeat; ++r)
+    {
+        f();
+    }
+    return elapsed_ms(start, Clock::now()) / repeat;
+}
+
+void update_pos_serial(const BenchmarkData& data, std::vector<Vec3>& pos)
+{
+    for (int i = 0; i < data.natom; ++i)
+    {
+        double px = data.ionmbl[i][0] ? data.vel[i].x * data.md_dt / data.lat0 : 0.0;
+        double py = data.ionmbl[i][1] ? data.vel[i].y * data.md_dt / data.lat0 : 0.0;
+        double pz = data.ionmbl[i][2] ? data.vel[i].z * data.md_dt / data.lat0 : 0.0;
+        pos[i].x = px * data.gt.m[0][0] + py * data.gt.m[1][0] + pz * data.gt.m[2][0];
+        pos[i].y = px * data.gt.m[0][1] + py * data.gt.m[1][1] + pz * data.gt.m[2][1];
+        pos[i].z = px * data.gt.m[0][2] + py * data.gt.m[1][2] + pz * data.gt.m[2][2];
+    }
+}
+
+void update_pos_omp(const BenchmarkData& data, std::vector<Vec3>& pos)
+{
+#pragma omp parallel for schedule(static) if (data.natom >= 256)
+    for (int i = 0; i < data.natom; ++i)
+    {
+        double px = data.ionmbl[i][0] ? data.vel[i].x * data.md_dt / data.lat0 : 0.0;
+        double py = data.ionmbl[i][1] ? data.vel[i].y * data.md_dt / data.lat0 : 0.0;
+        double pz = data.ionmbl[i][2] ? data.vel[i].z * data.md_dt / data.lat0 : 0.0;
+        pos[i].x = px * data.gt.m[0][0] + py * data.gt.m[1][0] + pz * data.gt.m[2][0];
+        pos[i].y = px * data.gt.m[0][1] + py * data.gt.m[1][1] + pz * data.gt.m[2][1];
+        pos[i].z = px * data.gt.m[0][2] + py * data.gt.m[1][2] + pz * data.gt.m[2][2];
+    }
+}
+
+void update_vel_serial(const BenchmarkData& data, std::vector<Vec3>& vel)
+{
+    for (int i = 0; i < data.natom; ++i)
+    {
+        if (data.ionmbl[i][0])
+        {
+            vel[i].x += 0.5 * data.force[i].x * data.md_dt / data.mass[i];
+        }
+        if (data.ionmbl[i][1])
+        {
+            vel[i].y += 0.5 * data.force[i].y * data.md_dt / data.mass[i];
+        }
+        if (data.ionmbl[i][2])
+        {
+            vel[i].z += 0.5 * data.force[i].z * data.md_dt / data.mass[i];
+        }
+    }
+}
+
+void update_vel_omp(const BenchmarkData& data, std::vector<Vec3>& vel)
+{
+#pragma omp parallel for schedule(static) if (data.natom >= 256)
+    for (int i = 0; i < data.natom; ++i)
+    {
+        if (data.ionmbl[i][0])
+        {
+            vel[i].x += 0.5 * data.force[i].x * data.md_dt / data.mass[i];
+        }
+        if (data.ionmbl[i][1])
+        {
+            vel[i].y += 0.5 * data.force[i].y * data.md_dt / data.mass[i];
+        }
+        if (data.ionmbl[i][2])
+        {
+            vel[i].z += 0.5 * data.force[i].z * data.md_dt / data.mass[i];
+        }
+    }
+}
+
+double kinetic_serial(const BenchmarkData& data)
+{
+    double ke = 0.0;
+    for (int i = 0; i < data.natom; ++i)
+    {
+        ke += 0.5 * data.mass[i]
+              * (data.vel[i].x * data.vel[i].x + data.vel[i].y * data.vel[i].y + data.vel[i].z * data.vel[i].z);
+    }
+    return ke;
+}
+
+double kinetic_omp(const BenchmarkData& data)
+{
+    double ke = 0.0;
+#pragma omp parallel for reduction(+:ke) schedule(static) if (data.natom >= 256)
+    for (int i = 0; i < data.natom; ++i)
+    {
+        ke += 0.5 * data.mass[i]
+              * (data.vel[i].x * data.vel[i].x + data.vel[i].y * data.vel[i].y + data.vel[i].z * data.vel[i].z);
+    }
+    return ke;
+}
+
+std::array<double, 9> temp_vector_serial(const BenchmarkData& data)
+{
+    std::array<double, 9> t{};
+    for (int i = 0; i < data.natom; ++i)
+    {
+        const double m = data.mass[i];
+        const double vx = data.vel[i].x;
+        const double vy = data.vel[i].y;
+        const double vz = data.vel[i].z;
+        t[0] += m * vx * vx;
+        t[1] += m * vx * vy;
+        t[2] += m * vx * vz;
+        t[3] += m * vy * vx;
+        t[4] += m * vy * vy;
+        t[5] += m * vy * vz;
+        t[6] += m * vz * vx;
+        t[7] += m * vz * vy;
+        t[8] += m * vz * vz;
+    }
+    return t;
+}
+
+std::array<double, 9> temp_vector_omp(const BenchmarkData& data)
+{
+    double t00 = 0.0;
+    double t01 = 0.0;
+    double t02 = 0.0;
+    double t10 = 0.0;
+    double t11 = 0.0;
+    double t12 = 0.0;
+    double t20 = 0.0;
+    double t21 = 0.0;
+    double t22 = 0.0;
+
+#pragma omp parallel for reduction(+:t00, t01, t02, t10, t11, t12, t20, t21, t22) schedule(static) if (data.natom >= 256)
+    for (int i = 0; i < data.natom; ++i)
+    {
+        const double m = data.mass[i];
+        const double vx = data.vel[i].x;
+        const double vy = data.vel[i].y;
+        const double vz = data.vel[i].z;
+        t00 += m * vx * vx;
+        t01 += m * vx * vy;
+        t02 += m * vx * vz;
+        t10 += m * vy * vx;
+        t11 += m * vy * vy;
+        t12 += m * vy * vz;
+        t20 += m * vz * vx;
+        t21 += m * vz * vy;
+        t22 += m * vz * vz;
+    }
+    return {t00, t01, t02, t10, t11, t12, t20, t21, t22};
+}
+
+void force_copy_serial(const BenchmarkData& data, std::vector<Vec3>& out)
+{
+    for (int i = 0; i < data.natom; ++i)
+    {
+        out[i].x = data.force_matrix[3 * static_cast<std::size_t>(i)];
+        out[i].y = data.force_matrix[3 * static_cast<std::size_t>(i) + 1];
+        out[i].z = data.force_matrix[3 * static_cast<std::size_t>(i) + 2];
+    }
+}
+
+void force_copy_omp(const BenchmarkData& data, std::vector<Vec3>& out)
+{
+#pragma omp parallel for schedule(static) if (data.natom >= 256)
+    for (int i = 0; i < data.natom; ++i)
+    {
+        out[i].x = data.force_matrix[3 * static_cast<std::size_t>(i)];
+        out[i].y = data.force_matrix[3 * static_cast<std::size_t>(i) + 1];
+        out[i].z = data.force_matrix[3 * static_cast<std::size_t>(i) + 2];
+    }
+}
+
+void nep_coord_serial(const BenchmarkData& data, std::vector<double>& coord)
+{
+    const int nat = data.natom;
+    for (int iat = 0; iat < nat; ++iat)
+    {
+        const int it = data.atom_type_index[iat];
+        const int ia = data.atom_local_index[iat];
+        const int index = data.type_offset[it] + ia;
+        coord[iat] = data.tau_x[index] * data.lat0;
+        coord[static_cast<std::size_t>(iat) + nat] = data.tau_y[index] * data.lat0;
+        coord[static_cast<std::size_t>(iat) + 2 * static_cast<std::size_t>(nat)] = data.tau_z[index] * data.lat0;
+    }
+}
+
+void nep_coord_omp(const BenchmarkData& data, std::vector<double>& coord)
+{
+    const int nat = data.natom;
+#pragma omp parallel for schedule(static) if (nat >= 256)
+    for (int iat = 0; iat < nat; ++iat)
+    {
+        const int it = data.atom_type_index[iat];
+        const int ia = data.atom_local_index[iat];
+        const int index = data.type_offset[it] + ia;
+        coord[iat] = data.tau_x[index] * data.lat0;
+        coord[static_cast<std::size_t>(iat) + nat] = data.tau_y[index] * data.lat0;
+        coord[static_cast<std::size_t>(iat) + 2 * static_cast<std::size_t>(nat)] = data.tau_z[index] * data.lat0;
+    }
+}
+
+double nep_energy_serial(const BenchmarkData& data)
+{
+    return std::accumulate(data.nep_e.begin(), data.nep_e.end(), 0.0);
+}
+
+double nep_energy_omp(const BenchmarkData& data)
+{
+    double energy = 0.0;
+#pragma omp parallel for reduction(+:energy) schedule(static) if (data.natom >= 256)
+    for (int i = 0; i < data.natom; ++i)
+    {
+        energy += data.nep_e[i];
+    }
+    return energy;
+}
+
+void nep_force_serial(const BenchmarkData& data, std::vector<double>& out)
+{
+    const int nat = data.natom;
+    for (int i = 0; i < nat; ++i)
+    {
+        out[3 * static_cast<std::size_t>(i)] = data.nep_f[i] * data.fact_f;
+        out[3 * static_cast<std::size_t>(i) + 1] = data.nep_f[static_cast<std::size_t>(i) + nat] * data.fact_f;
+        out[3 * static_cast<std::size_t>(i) + 2] = data.nep_f[static_cast<std::size_t>(i) + 2 * static_cast<std::size_t>(nat)] * data.fact_f;
+    }
+}
+
+void nep_force_omp(const BenchmarkData& data, std::vector<double>& out)
+{
+    const int nat = data.natom;
+#pragma omp parallel for schedule(static) if (nat >= 256)
+    for (int i = 0; i < nat; ++i)
+    {
+        out[3 * static_cast<std::size_t>(i)] = data.nep_f[i] * data.fact_f;
+        out[3 * static_cast<std::size_t>(i) + 1] = data.nep_f[static_cast<std::size_t>(i) + nat] * data.fact_f;
+        out[3 * static_cast<std::size_t>(i) + 2] = data.nep_f[static_cast<std::size_t>(i) + 2 * static_cast<std::size_t>(nat)] * data.fact_f;
+    }
+}
+
+std::array<double, 9> nep_virial_serial(const BenchmarkData& data)
+{
+    std::array<double, 9> sum{};
+    const int nat = data.natom;
+    for (int j = 0; j < 9; ++j)
+    {
+        for (int i = 0; i < nat; ++i)
+        {
+            sum[j] += data.nep_v[static_cast<std::size_t>(j) * nat + i];
+        }
+    }
+    return sum;
+}
+
+std::array<double, 9> nep_virial_omp(const BenchmarkData& data)
+{
+    const int nat = data.natom;
+    double v0 = 0.0;
+    double v1 = 0.0;
+    double v2 = 0.0;
+    double v3 = 0.0;
+    double v4 = 0.0;
+    double v5 = 0.0;
+    double v6 = 0.0;
+    double v7 = 0.0;
+    double v8 = 0.0;
+#pragma omp parallel for reduction(+:v0, v1, v2, v3, v4, v5, v6, v7, v8) schedule(static) if (nat >= 256)
+    for (int i = 0; i < nat; ++i)
+    {
+        v0 += data.nep_v[i];
+        v1 += data.nep_v[static_cast<std::size_t>(nat) + i];
+        v2 += data.nep_v[2 * static_cast<std::size_t>(nat) + i];
+        v3 += data.nep_v[3 * static_cast<std::size_t>(nat) + i];
+        v4 += data.nep_v[4 * static_cast<std::size_t>(nat) + i];
+        v5 += data.nep_v[5 * static_cast<std::size_t>(nat) + i];
+        v6 += data.nep_v[6 * static_cast<std::size_t>(nat) + i];
+        v7 += data.nep_v[7 * static_cast<std::size_t>(nat) + i];
+        v8 += data.nep_v[8 * static_cast<std::size_t>(nat) + i];
+    }
+    return {v0, v1, v2, v3, v4, v5, v6, v7, v8};
+}
+
+double max_abs_diff(const std::vector<Vec3>& a, const std::vector<Vec3>& b)
+{
+    double diff = 0.0;
+    for (std::size_t i = 0; i < a.size(); ++i)
+    {
+        diff = std::max(diff, std::abs(a[i].x - b[i].x));
+        diff = std::max(diff, std::abs(a[i].y - b[i].y));
+        diff = std::max(diff, std::abs(a[i].z - b[i].z));
+    }
+    return diff;
+}
+
+double max_abs_diff(const std::vector<double>& a, const std::vector<double>& b)
+{
+    double diff = 0.0;
+    for (std::size_t i = 0; i < a.size(); ++i)
+    {
+        diff = std::max(diff, std::abs(a[i] - b[i]));
+    }
+    return diff;
+}
+
+double max_abs_diff(const std::array<double, 9>& a, const std::array<double, 9>& b)
+{
+    double diff = 0.0;
+    for (std::size_t i = 0; i < a.size(); ++i)
+    {
+        diff = std::max(diff, std::abs(a[i] - b[i]));
+    }
+    return diff;
+}
+
+double checksum(const std::vector<Vec3>& a)
+{
+    double sum = 0.0;
+    for (std::size_t i = 0; i < a.size(); i += 4096)
+    {
+        sum += a[i].x + 0.5 * a[i].y + 0.25 * a[i].z;
+    }
+    return sum;
+}
+
+double checksum(const std::vector<double>& a)
+{
+    double sum = 0.0;
+    for (std::size_t i = 0; i < a.size(); i += 4096)
+    {
+        sum += a[i];
+    }
+    return sum;
+}
+
+double checksum(const std::array<double, 9>& a)
+{
+    return std::accumulate(a.begin(), a.end(), 0.0);
+}
+
+void print_measurement(const Measurement& m)
+{
+    const double speedup = m.omp_ms > 0.0 ? m.serial_ms / m.omp_ms : 0.0;
+    const double efficiency = m.threads > 0 ? speedup / m.threads : 0.0;
+    std::cout << m.kernel << ',' << m.threads << ','
+              << std::fixed << std::setprecision(6)
+              << m.serial_ms << ',' << m.omp_ms << ',' << speedup << ',' << efficiency << ','
+              << std::scientific << m.max_abs_diff << ',' << m.checksum << '\n';
+}
+
+} // namespace
+
+int main(int argc, char** argv)
+{
+    const int threads = parse_int_arg(argc, argv, "--threads", 1);
+    const int natom = parse_int_arg(argc, argv, "--natoms", 2000000);
+    const int repeat = parse_int_arg(argc, argv, "--repeat", 5);
+
+#ifdef _OPENMP
+    omp_set_dynamic(0);
+    omp_set_num_threads(threads);
+#endif
+
+    BenchmarkData data = make_data(natom);
+    std::vector<Vec3> serial_vec(natom);
+    std::vector<Vec3> omp_vec(natom);
+    std::vector<double> serial_doubles(static_cast<std::size_t>(natom) * 3);
+    std::vector<double> omp_doubles(static_cast<std::size_t>(natom) * 3);
+
+    std::cout << "kernel,threads,serial_ms,omp_ms,speedup,efficiency,max_abs_diff,checksum\n";
+
+    {
+        update_pos_serial(data, serial_vec);
+        update_pos_omp(data, omp_vec);
+        const double serial_ms = time_repeated(repeat, [&] { update_pos_serial(data, serial_vec); });
+        const double omp_ms = time_repeated(repeat, [&] { update_pos_omp(data, omp_vec); });
+        print_measurement({"md_update_pos", threads, serial_ms, omp_ms, max_abs_diff(serial_vec, omp_vec), checksum(omp_vec)});
+    }
+    {
+        std::vector<Vec3> serial_vel = data.base_vel;
+        std::vector<Vec3> omp_vel = data.base_vel;
+        update_vel_serial(data, serial_vel);
+        update_vel_omp(data, omp_vel);
+        const double serial_ms = time_repeated(repeat, [&] { update_vel_serial(data, serial_vel); });
+        const double omp_ms = time_repeated(repeat, [&] { update_vel_omp(data, omp_vel); });
+        print_measurement({"md_update_vel", threads, serial_ms, omp_ms, max_abs_diff(serial_vel, omp_vel), checksum(omp_vel)});
+    }
+    {
+        const double serial_value = kinetic_serial(data);
+        const double omp_value = kinetic_omp(data);
+        const double serial_ms = time_repeated(repeat, [&] { global_sink += kinetic_serial(data); });
+        const double omp_ms = time_repeated(repeat, [&] { global_sink += kinetic_omp(data); });
+        print_measurement({"md_kinetic_energy", threads, serial_ms, omp_ms, std::abs(serial_value - omp_value), omp_value});
+    }
+    {
+        const auto serial_value = temp_vector_serial(data);
+        const auto omp_value = temp_vector_omp(data);
+        const double serial_ms = time_repeated(repeat, [&] { global_sink += checksum(temp_vector_serial(data)); });
+        const double omp_ms = time_repeated(repeat, [&] { global_sink += checksum(temp_vector_omp(data)); });
+        print_measurement({"md_temp_vector", threads, serial_ms, omp_ms, max_abs_diff(serial_value, omp_value), checksum(omp_value)});
+    }
+    {
+        force_copy_serial(data, serial_vec);
+        force_copy_omp(data, omp_vec);
+        const double serial_ms = time_repeated(repeat, [&] { force_copy_serial(data, serial_vec); });
+        const double omp_ms = time_repeated(repeat, [&] { force_copy_omp(data, omp_vec); });
+        print_measurement({"md_force_copy", threads, serial_ms, omp_ms, max_abs_diff(serial_vec, omp_vec), checksum(omp_vec)});
+    }
+    {
+        nep_coord_serial(data, serial_doubles);
+        nep_coord_omp(data, omp_doubles);
+        const double serial_ms = time_repeated(repeat, [&] { nep_coord_serial(data, serial_doubles); });
+        const double omp_ms = time_repeated(repeat, [&] { nep_coord_omp(data, omp_doubles); });
+        print_measurement({"nep_coord_fill", threads, serial_ms, omp_ms, max_abs_diff(serial_doubles, omp_doubles), checksum(omp_doubles)});
+    }
+    {
+        const double serial_value = nep_energy_serial(data);
+        const double omp_value = nep_energy_omp(data);
+        const double serial_ms = time_repeated(repeat, [&] { global_sink += nep_energy_serial(data); });
+        const double omp_ms = time_repeated(repeat, [&] { global_sink += nep_energy_omp(data); });
+        print_measurement({"nep_energy_sum", threads, serial_ms, omp_ms, std::abs(serial_value - omp_value), omp_value});
+    }
+    {
+        nep_force_serial(data, serial_doubles);
+        nep_force_omp(data, omp_doubles);
+        const double serial_ms = time_repeated(repeat, [&] { nep_force_serial(data, serial_doubles); });
+        const double omp_ms = time_repeated(repeat, [&] { nep_force_omp(data, omp_doubles); });
+        print_measurement({"nep_force_fill", threads, serial_ms, omp_ms, max_abs_diff(serial_doubles, omp_doubles), checksum(omp_doubles)});
+    }
+    {
+        const auto serial_value = nep_virial_serial(data);
+        const auto omp_value = nep_virial_omp(data);
+        const double serial_ms = time_repeated(repeat, [&] { global_sink += checksum(nep_virial_serial(data)); });
+        const double omp_ms = time_repeated(repeat, [&] { global_sink += checksum(nep_virial_omp(data)); });
+        print_measurement({"nep_virial_sum", threads, serial_ms, omp_ms, max_abs_diff(serial_value, omp_value), checksum(omp_value)});
+    }
+
+    if (global_sink == -1.0)
+    {
+        std::cerr << "unreachable sink: " << global_sink << '\n';
+    }
+
+    return 0;
+}
diff --git a/Test/results/openmp_nep_basic_benchmark.csv b/Test/results/openmp_nep_basic_benchmark.csv
new file mode 100644
index 00000000000..0e3136e7095
--- /dev/null
+++ b/Test/results/openmp_nep_basic_benchmark.csv
@@ -0,0 +1,37 @@
+kernel,threads,serial_ms,omp_ms,speedup,efficiency,max_abs_diff,checksum
+md_update_pos,1,13.007151,13.052355,0.996537,0.996537,0.000000e+00,6.693102e-02
+md_update_vel,1,14.545481,14.496280,1.003394,1.003394,0.000000e+00,1.811520e+00
+md_kinetic_energy,1,6.925990,6.857271,1.010021,1.010021,0.000000e+00,1.205301e+02
+md_temp_vector,1,8.115525,8.224651,0.986732,0.986732,0.000000e+00,7.061666e+02
+md_force_copy,1,9.308984,9.119837,1.020740,1.020740,0.000000e+00,4.932209e+00
+nep_coord_fill,1,10.322396,10.129846,1.019008,1.019008,0.000000e+00,6.562763e+03
+nep_energy_sum,1,3.037125,3.027364,1.003224,1.003224,0.000000e+00,5.979382e+04
+nep_force_fill,1,8.900827,8.746296,1.017668,1.017668,0.000000e+00,2.851195e+00
+nep_virial_sum,1,28.520833,12.451097,2.290628,2.290628,0.000000e+00,4.036082e+04
+md_update_pos,2,13.098582,6.573173,1.992733,0.996367,0.000000e+00,6.693102e-02
+md_update_vel,2,14.820314,7.287271,2.033726,1.016863,0.000000e+00,1.811520e+00
+md_kinetic_energy,2,7.014427,3.325272,2.109430,1.054715,2.036415e-10,1.205301e+02
+md_temp_vector,2,8.208209,4.038653,2.032413,1.016206,1.600000e-10,7.061666e+02
+md_force_copy,2,9.362898,4.795636,1.952379,0.976190,0.000000e+00,4.932209e+00
+nep_coord_fill,2,10.446854,5.126669,2.037747,1.018874,0.000000e+00,6.562763e+03
+nep_energy_sum,2,3.059902,1.525828,2.005405,1.002702,8.811185e-09,5.979382e+04
+nep_force_fill,2,8.925261,4.459652,2.001336,1.000668,0.000000e+00,2.851195e+00
+nep_virial_sum,2,28.700600,6.623542,4.333120,2.166560,9.997166e-09,4.036082e+04
+md_update_pos,4,13.473068,3.496484,3.853319,0.963330,0.000000e+00,6.693102e-02
+md_update_vel,4,14.817903,3.852381,3.846427,0.961607,0.000000e+00,1.811520e+00
+md_kinetic_energy,4,7.070885,1.874085,3.772980,0.943245,2.242899e-10,1.205301e+02
+md_temp_vector,4,8.508072,2.182533,3.898256,0.974564,1.489155e-10,7.061666e+02
+md_force_copy,4,9.537895,2.563152,3.721158,0.930290,0.000000e+00,4.932209e+00
+nep_coord_fill,4,10.574139,2.837437,3.726652,0.931663,0.000000e+00,6.562763e+03
+nep_energy_sum,4,3.115476,0.785601,3.965723,0.991431,2.277375e-08,5.979382e+04
+nep_force_fill,4,9.159472,2.378972,3.850181,0.962545,0.000000e+00,2.851195e+00
+nep_virial_sum,4,28.671869,3.517298,8.151675,2.037919,6.323717e-09,4.036082e+04
+md_update_pos,8,13.427898,1.823981,7.361864,0.920233,0.000000e+00,6.693102e-02
+md_update_vel,8,14.830449,2.062811,7.189436,0.898679,0.000000e+00,1.811520e+00
+md_kinetic_energy,8,7.156275,1.025259,6.979966,0.872496,1.775788e-10,1.205301e+02
+md_temp_vector,8,8.650072,1.172442,7.377825,0.922228,1.680860e-10,7.061666e+02
+md_force_copy,8,9.885551,1.377531,7.176284,0.897036,0.000000e+00,4.932209e+00
+nep_coord_fill,8,11.209954,1.566864,7.154390,0.894299,0.000000e+00,6.562763e+03
+nep_energy_sum,8,3.211079,0.399830,8.031107,1.003888,1.861918e-08,5.979382e+04
+nep_force_fill,8,9.512588,1.360189,6.993579,0.874197,0.000000e+00,2.851195e+00
+nep_virial_sum,8,28.884186,2.028955,14.235989,1.779499,8.529241e-09,4.036082e+04
diff --git a/Test/results/run_1.csv b/Test/results/run_1.csv
new file mode 100644
index 00000000000..f988811bcee
--- /dev/null
+++ b/Test/results/run_1.csv
@@ -0,0 +1,10 @@
+kernel,threads,serial_ms,omp_ms,speedup,efficiency,max_abs_diff,checksum
+md_update_pos,1,13.007151,13.052355,0.996537,0.996537,0.000000e+00,6.693102e-02
+md_update_vel,1,14.545481,14.496280,1.003394,1.003394,0.000000e+00,1.811520e+00
+md_kinetic_energy,1,6.925990,6.857271,1.010021,1.010021,0.000000e+00,1.205301e+02
+md_temp_vector,1,8.115525,8.224651,0.986732,0.986732,0.000000e+00,7.061666e+02
+md_force_copy,1,9.308984,9.119837,1.020740,1.020740,0.000000e+00,4.932209e+00
+nep_coord_fill,1,10.322396,10.129846,1.019008,1.019008,0.000000e+00,6.562763e+03
+nep_energy_sum,1,3.037125,3.027364,1.003224,1.003224,0.000000e+00,5.979382e+04
+nep_force_fill,1,8.900827,8.746296,1.017668,1.017668,0.000000e+00,2.851195e+00
+nep_virial_sum,1,28.520833,12.451097,2.290628,2.290628,0.000000e+00,4.036082e+04
diff --git a/Test/results/run_2.csv b/Test/results/run_2.csv
new file mode 100644
index 00000000000..c56f0793298
--- /dev/null
+++ b/Test/results/run_2.csv
@@ -0,0 +1,10 @@
+kernel,threads,serial_ms,omp_ms,speedup,efficiency,max_abs_diff,checksum
+md_update_pos,2,13.098582,6.573173,1.992733,0.996367,0.000000e+00,6.693102e-02
+md_update_vel,2,14.820314,7.287271,2.033726,1.016863,0.000000e+00,1.811520e+00
+md_kinetic_energy,2,7.014427,3.325272,2.109430,1.054715,2.036415e-10,1.205301e+02
+md_temp_vector,2,8.208209,4.038653,2.032413,1.016206,1.600000e-10,7.061666e+02
+md_force_copy,2,9.362898,4.795636,1.952379,0.976190,0.000000e+00,4.932209e+00
+nep_coord_fill,2,10.446854,5.126669,2.037747,1.018874,0.000000e+00,6.562763e+03
+nep_energy_sum,2,3.059902,1.525828,2.005405,1.002702,8.811185e-09,5.979382e+04
+nep_force_fill,2,8.925261,4.459652,2.001336,1.000668,0.000000e+00,2.851195e+00
+nep_virial_sum,2,28.700600,6.623542,4.333120,2.166560,9.997166e-09,4.036082e+04
diff --git a/Test/results/run_4.csv b/Test/results/run_4.csv
new file mode 100644
index 00000000000..2f118db3395
--- /dev/null
+++ b/Test/results/run_4.csv
@@ -0,0 +1,10 @@
+kernel,threads,serial_ms,omp_ms,speedup,efficiency,max_abs_diff,checksum
+md_update_pos,4,13.473068,3.496484,3.853319,0.963330,0.000000e+00,6.693102e-02
+md_update_vel,4,14.817903,3.852381,3.846427,0.961607,0.000000e+00,1.811520e+00
+md_kinetic_energy,4,7.070885,1.874085,3.772980,0.943245,2.242899e-10,1.205301e+02
+md_temp_vector,4,8.508072,2.182533,3.898256,0.974564,1.489155e-10,7.061666e+02
+md_force_copy,4,9.537895,2.563152,3.721158,0.930290,0.000000e+00,4.932209e+00
+nep_coord_fill,4,10.574139,2.837437,3.726652,0.931663,0.000000e+00,6.562763e+03
+nep_energy_sum,4,3.115476,0.785601,3.965723,0.991431,2.277375e-08,5.979382e+04
+nep_force_fill,4,9.159472,2.378972,3.850181,0.962545,0.000000e+00,2.851195e+00
+nep_virial_sum,4,28.671869,3.517298,8.151675,2.037919,6.323717e-09,4.036082e+04
diff --git a/Test/results/run_8.csv b/Test/results/run_8.csv
new file mode 100644
index 00000000000..be310c03f17
--- /dev/null
+++ b/Test/results/run_8.csv
@@ -0,0 +1,10 @@
+kernel,threads,serial_ms,omp_ms,speedup,efficiency,max_abs_diff,checksum
+md_update_pos,8,13.427898,1.823981,7.361864,0.920233,0.000000e+00,6.693102e-02
+md_update_vel,8,14.830449,2.062811,7.189436,0.898679,0.000000e+00,1.811520e+00
+md_kinetic_energy,8,7.156275,1.025259,6.979966,0.872496,1.775788e-10,1.205301e+02
+md_temp_vector,8,8.650072,1.172442,7.377825,0.922228,1.680860e-10,7.061666e+02
+md_force_copy,8,9.885551,1.377531,7.176284,0.897036,0.000000e+00,4.932209e+00
+nep_coord_fill,8,11.209954,1.566864,7.154390,0.894299,0.000000e+00,6.562763e+03
+nep_energy_sum,8,3.211079,0.399830,8.031107,1.003888,1.861918e-08,5.979382e+04
+nep_force_fill,8,9.512588,1.360189,6.993579,0.874197,0.000000e+00,2.851195e+00
+nep_virial_sum,8,28.884186,2.028955,14.235989,1.779499,8.529241e-09,4.036082e+04
diff --git a/Test/run_openmp_nep_basic_benchmark.sh b/Test/run_openmp_nep_basic_benchmark.sh
new file mode 100755
index 00000000000..78d206fbe16
--- /dev/null
+++ b/Test/run_openmp_nep_basic_benchmark.sh
@@ -0,0 +1,41 @@
+#!/usr/bin/env bash
+set -euo pipefail
+
+SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+BUILD_DIR="${SCRIPT_DIR}/build"
+RESULT_DIR="${SCRIPT_DIR}/results"
+BIN="${BUILD_DIR}/openmp_nep_basic_benchmark"
+CSV="${RESULT_DIR}/openmp_nep_basic_benchmark.csv"
+LOG="${RESULT_DIR}/openmp_nep_basic_benchmark.log"
+
+NATOMS="${NATOMS:-2000000}"
+REPEAT="${REPEAT:-5}"
+CXX="${CXX:-g++}"
+
+mkdir -p "${BUILD_DIR}" "${RESULT_DIR}"
+
+{
+  echo "Compiler: $(${CXX} --version | head -n 1)"
+  echo "NATOMS=${NATOMS}"
+  echo "REPEAT=${REPEAT}"
+  echo "Build: ${CXX} -O3 -std=c++17 -fopenmp"
+} > "${LOG}"
+
+"${CXX}" -O3 -std=c++17 -fopenmp "${SCRIPT_DIR}/openmp_nep_basic_benchmark.cpp" -o "${BIN}" 2>&1 | tee -a "${LOG}"
+
+: > "${CSV}"
+for threads in 1 2 4 8; do
+  echo "Running with OMP_NUM_THREADS=${threads}" | tee -a "${LOG}"
+  export OMP_NUM_THREADS="${threads}"
+  export OMP_PROC_BIND="${OMP_PROC_BIND:-close}"
+  export OMP_PLACES="${OMP_PLACES:-cores}"
+  tmp_csv="${RESULT_DIR}/run_${threads}.csv"
+  "${BIN}" --threads "${threads}" --natoms "${NATOMS}" --repeat "${REPEAT}" > "${tmp_csv}"
+  if [[ "${threads}" == "1" ]]; then
+    cat "${tmp_csv}" >> "${CSV}"
+  else
+    tail -n +2 "${tmp_csv}" >> "${CSV}"
+  fi
+done
+
+echo "CSV: ${CSV}" | tee -a "${LOG}"
diff --git a/source/source_esolver/esolver_nep.cpp b/source/source_esolver/esolver_nep.cpp
index 9e57a5c4cda..b586983a647 100644
--- a/source/source_esolver/esolver_nep.cpp
+++ b/source/source_esolver/esolver_nep.cpp
@@ -130,26 +130,37 @@ void ESolver_NEP::runner(UnitCell& ucell, const int istep)
     }
 
     // virial
-    std::fill(nep_virial_sum.begin(), nep_virial_sum.end(), 0.0);
-#pragma omp parallel if (nat >= 256)
+    double v0 = 0.0;
+    double v1 = 0.0;
+    double v2 = 0.0;
+    double v3 = 0.0;
+    double v4 = 0.0;
+    double v5 = 0.0;
+    double v6 = 0.0;
+    double v7 = 0.0;
+    double v8 = 0.0;
+#pragma omp parallel for reduction(+:v0, v1, v2, v3, v4, v5, v6, v7, v8) schedule(static) if (nat >= 256)
+    for (int i = 0; i < nat; ++i)
     {
-        double local_virial_sum[9] = {0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0};
-
-#pragma omp for schedule(static)
-        for (int index = 0; index < 9 * nat; ++index)
-        {
-            const int j = index / nat;
-            local_virial_sum[j] += _v[index];
-        }
-
-#pragma omp critical
-        {
-            for (int j = 0; j < 9; ++j)
-            {
-                nep_virial_sum[j] += local_virial_sum[j];
-            }
-        }
+        v0 += _v[i];
+        v1 += _v[nat + i];
+        v2 += _v[2 * nat + i];
+        v3 += _v[3 * nat + i];
+        v4 += _v[4 * nat + i];
+        v5 += _v[5 * nat + i];
+        v6 += _v[6 * nat + i];
+        v7 += _v[7 * nat + i];
+        v8 += _v[8 * nat + i];
     }
+    nep_virial_sum[0] = v0;
+    nep_virial_sum[1] = v1;
+    nep_virial_sum[2] = v2;
+    nep_virial_sum[3] = v3;
+    nep_virial_sum[4] = v4;
+    nep_virial_sum[5] = v5;
+    nep_virial_sum[6] = v6;
+    nep_virial_sum[7] = v7;
+    nep_virial_sum[8] = v8;
 
     // virial -> stress
     for (int i = 0; i < 3; ++i)
diff --git a/source/source_md/test/md_test_fixture.h b/source/source_md/test/md_test_fixture.h
index 3ad80c23638..fccc19e96f0 100644
--- a/source/source_md/test/md_test_fixture.h
+++ b/source/source_md/test/md_test_fixture.h
@@ -3,6 +3,7 @@
 
 #include "gtest/gtest.h"
 #include "source_esolver/esolver_lj.h"
+#include "source_io/module_parameter/parameter.h"
 #include "source_md/md_base.h"
 #include "setcell.h"
 

From e7bfb0ca872bb04725e83f4a3511b63bef15bd59 Mon Sep 17 00:00:00 2001
From: yyya18 <yy_1818@stu.pku.edu.cn>
Date: Wed, 3 Jun 2026 10:54:43 +0800
Subject: [PATCH 08/12] optimize: add OpenMP to Verlet thermalize, MSST
 rescale, and NHC particle_thermo velocity scaling

---
 source/source_md/msst.cpp    | 4 +++-
 source/source_md/nhchain.cpp | 4 +++-
 source/source_md/verlet.cpp  | 4 +++-
 3 files changed, 9 insertions(+), 3 deletions(-)

diff --git a/source/source_md/msst.cpp b/source/source_md/msst.cpp
index e33c24b45e5..9ec0a9f8860 100644
--- a/source/source_md/msst.cpp
+++ b/source/source_md/msst.cpp
@@ -262,7 +262,9 @@ void MSST::rescale(std::ofstream& ofs, const double& volume)
     unitcell::setup_cell_after_vc(ucell,ofs);
 
     /// rescale velocity
-    for (int i = 0; i < ucell.nat; ++i)
+    const int nat = ucell.nat;
+#pragma omp parallel for schedule(static) if (nat >= 256)
+    for (int i = 0; i < nat; ++i)
     {
         vel[i][sd] *= dilation[sd];
     }
diff --git a/source/source_md/nhchain.cpp b/source/source_md/nhchain.cpp
index dc72669ec4e..8b3d7c9479e 100644
--- a/source/source_md/nhchain.cpp
+++ b/source/source_md/nhchain.cpp
@@ -553,7 +553,9 @@ void Nose_Hoover::particle_thermo()
     }
 
     /// rescale velocity due to thermostats
-    for (int i = 0; i < ucell.nat; ++i)
+    const int nat = ucell.nat;
+#pragma omp parallel for schedule(static) if (nat >= 256)
+    for (int i = 0; i < nat; ++i)
     {
         vel[i] *= scale;
     }
diff --git a/source/source_md/verlet.cpp b/source/source_md/verlet.cpp
index b2df7a78ced..5f81246ad1b 100644
--- a/source/source_md/verlet.cpp
+++ b/source/source_md/verlet.cpp
@@ -119,7 +119,9 @@ void Verlet::thermalize(const int& nraise, const double& current_temp, const dou
         fac = sqrt(target_temp / current_temp);
     }
 
-    for (int i = 0; i < ucell.nat; ++i)
+    const int nat = ucell.nat;
+#pragma omp parallel for schedule(static) if (nat >= 256)
+    for (int i = 0; i < nat; ++i)
     {
         vel[i] *= fac;
     }

From 79cf84a86b7d3498094181e96ce802a16f2b6216 Mon Sep 17 00:00:00 2001
From: yyya18 <yy_1818@stu.pku.edu.cn>
Date: Wed, 3 Jun 2026 11:14:12 +0800
Subject: [PATCH 09/12] optimize: add OpenMP to DPMD interface - coord fill &
 force copy back

---
 source/source_esolver/esolver_dp.cpp | 36 +++++++++++++++++++---------
 source/source_esolver/esolver_dp.h   |  2 ++
 2 files changed, 27 insertions(+), 11 deletions(-)

diff --git a/source/source_esolver/esolver_dp.cpp b/source/source_esolver/esolver_dp.cpp
index 879193e668b..2a3ae21a617 100644
--- a/source/source_esolver/esolver_dp.cpp
+++ b/source/source_esolver/esolver_dp.cpp
@@ -44,6 +44,20 @@ void ESolver_DP::before_all_runners(UnitCell& ucell, const Input_para& inp)
 
     atype.resize(ucell.nat);
 
+    // Build flat atom index for OpenMP coordinate fill in runner()
+    atom_type_index.resize(ucell.nat);
+    atom_local_index.resize(ucell.nat);
+    int iat = 0;
+    for (int it = 0; it < ucell.ntype; ++it)
+    {
+        for (int ia = 0; ia < ucell.atoms[it].na; ++ia)
+        {
+            atom_type_index[iat] = it;
+            atom_local_index[iat] = ia;
+            iat++;
+        }
+    }
+
     rescaling = inp.mdp.dp_rescaling;
     fparam = inp.mdp.dp_fparam;
     aparam = inp.mdp.dp_aparam;
@@ -71,18 +85,16 @@ void ESolver_DP::runner(UnitCell& ucell, const int istep)
     cell[8] = ucell.latvec.e33 * ucell.lat0_angstrom;
 
     std::vector<double> coord(3 * ucell.nat, 0.0);
-    int iat = 0;
-    for (int it = 0; it < ucell.ntype; ++it)
+    const int nat = ucell.nat;
+#pragma omp parallel for schedule(static) if (nat >= 256)
+    for (int iat = 0; iat < nat; ++iat)
     {
-        for (int ia = 0; ia < ucell.atoms[it].na; ++ia)
-        {
-            coord[3 * iat] = ucell.atoms[it].tau[ia].x * ucell.lat0_angstrom;
-            coord[3 * iat + 1] = ucell.atoms[it].tau[ia].y * ucell.lat0_angstrom;
-            coord[3 * iat + 2] = ucell.atoms[it].tau[ia].z * ucell.lat0_angstrom;
-            iat++;
-        }
+        const int it = atom_type_index[iat];
+        const int ia = atom_local_index[iat];
+        coord[3 * iat]     = ucell.atoms[it].tau[ia].x * ucell.lat0_angstrom;
+        coord[3 * iat + 1] = ucell.atoms[it].tau[ia].y * ucell.lat0_angstrom;
+        coord[3 * iat + 2] = ucell.atoms[it].tau[ia].z * ucell.lat0_angstrom;
     }
-    assert(ucell.nat == iat);
 
 #ifdef __DPMD
     std::vector<double> f, v;
@@ -101,7 +113,9 @@ void ESolver_DP::runner(UnitCell& ucell, const int istep)
     GlobalV::ofs_running << " #TOTAL ENERGY# " << std::setprecision(11) << dp_potential * ModuleBase::Ry_to_eV << " eV"
                          << std::endl;
 
-    for (int i = 0; i < ucell.nat; ++i)
+    const int nat_f = ucell.nat;
+#pragma omp parallel for schedule(static) if (nat_f >= 256)
+    for (int i = 0; i < nat_f; ++i)
     {
         dp_force(i, 0) = f[3 * i] * fact_f;
         dp_force(i, 1) = f[3 * i + 1] * fact_f;
diff --git a/source/source_esolver/esolver_dp.h b/source/source_esolver/esolver_dp.h
index 405bae44461..33b04b7c6f9 100644
--- a/source/source_esolver/esolver_dp.h
+++ b/source/source_esolver/esolver_dp.h
@@ -109,6 +109,8 @@ class ESolver_DP : public ESolver
 
     std::string dp_file;             ///< directory of DP model file
     std::vector<int> atype = {};     ///< atom type corresponding to DP model
+    std::vector<int> atom_type_index;  ///< type index (it) for each global atom iat
+    std::vector<int> atom_local_index; ///< local index (ia) within type for each global atom iat
     std::vector<double> fparam = {}; ///< frame parameter for dp potential: dim_fparam
     std::vector<double> aparam = {}; ///< atomic parameter for dp potential: natoms x dim_aparam
     double rescaling = 1.0;          ///< rescaling factor for DP model

From 6c8551fba57880d4e394ee1bd8b7deefd4196781 Mon Sep 17 00:00:00 2001
From: yyya18 <yy_1818@stu.pku.edu.cn>
Date: Wed, 3 Jun 2026 12:57:22 +0800
Subject: [PATCH 10/12] docs: add optimization records for Verlet, MSST, NHC,
 and DPMD OpenMP changes

---
 opt_logs/dpmd_interface_20260603.md      | 138 +++++++++++++++++++++++
 opt_logs/msst_rescale_20260603.md        |  46 ++++++++
 opt_logs/nhc_particle_thermo_20260603.md |  47 ++++++++
 opt_logs/verlet_thermalize_20260603.md   |  47 ++++++++
 4 files changed, 278 insertions(+)
 create mode 100644 opt_logs/dpmd_interface_20260603.md
 create mode 100644 opt_logs/msst_rescale_20260603.md
 create mode 100644 opt_logs/nhc_particle_thermo_20260603.md
 create mode 100644 opt_logs/verlet_thermalize_20260603.md

diff --git a/opt_logs/dpmd_interface_20260603.md b/opt_logs/dpmd_interface_20260603.md
new file mode 100644
index 00000000000..fc4fb3c248b
--- /dev/null
+++ b/opt_logs/dpmd_interface_20260603.md
@@ -0,0 +1,138 @@
+# ESolver_DP::runner() OpenMP 并行化
+
+日期：2026-06-03
+
+## 修改文件
+
+- `source/source_esolver/esolver_dp.h` — 新增 2 个成员变量
+- `source/source_esolver/esolver_dp.cpp` — 修改 3 处
+
+---
+
+## 改点 A：新增扁平原子索引（esolver_dp.h + esolver_dp.cpp）
+
+### esolver_dp.h 新增成员变量
+
+在 `private` 区域 `atype` 之后新增：
+
+```cpp
+    std::vector<int> atom_type_index;  ///< type index (it) for each global atom iat
+    std::vector<int> atom_local_index; ///< local index (ia) within type for each global atom iat
+```
+
+### esolver_dp.cpp `before_all_runners()` 新增索引构建
+
+```cpp
+    // Build flat atom index for OpenMP coordinate fill in runner()
+    atom_type_index.resize(ucell.nat);
+    atom_local_index.resize(ucell.nat);
+    int iat = 0;
+    for (int it = 0; it < ucell.ntype; ++it)
+    {
+        for (int ia = 0; ia < ucell.atoms[it].na; ++ia)
+        {
+            atom_type_index[iat] = it;
+            atom_local_index[iat] = ia;
+            iat++;
+        }
+    }
+```
+
+**说明**：`before_all_runners()` 只在初始化执行一次。索引将 `iat` → `(it, ia)` 映射缓存下来，供 `runner()` 每步复用。
+
+---
+
+## 改点 B：坐标转换并行化（esolver_dp.cpp 第 73-85 行）
+
+### 修改前
+
+```cpp
+    std::vector<double> coord(3 * ucell.nat, 0.0);
+    int iat = 0;
+    for (int it = 0; it < ucell.ntype; ++it)
+    {
+        for (int ia = 0; ia < ucell.atoms[it].na; ++ia)
+        {
+            coord[3 * iat] = ucell.atoms[it].tau[ia].x * ucell.lat0_angstrom;
+            coord[3 * iat + 1] = ucell.atoms[it].tau[ia].y * ucell.lat0_angstrom;
+            coord[3 * iat + 2] = ucell.atoms[it].tau[ia].z * ucell.lat0_angstrom;
+            iat++;
+        }
+    }
+    assert(ucell.nat == iat);
+```
+
+### 修改后
+
+```cpp
+    std::vector<double> coord(3 * ucell.nat, 0.0);
+    const int nat = ucell.nat;
+#pragma omp parallel for schedule(static) if (nat >= 256)
+    for (int iat = 0; iat < nat; ++iat)
+    {
+        const int it = atom_type_index[iat];
+        const int ia = atom_local_index[iat];
+        coord[3 * iat]     = ucell.atoms[it].tau[ia].x * ucell.lat0_angstrom;
+        coord[3 * iat + 1] = ucell.atoms[it].tau[ia].y * ucell.lat0_angstrom;
+        coord[3 * iat + 2] = ucell.atoms[it].tau[ia].z * ucell.lat0_angstrom;
+    }
+```
+
+**说明**：通过预建索引查找每个 `iat` 对应的 `(it, ia)`，消除串行 `iat++` 依赖。DPMD 坐标布局为 row-major `[x0,y0,z0, x1,y1,z1, ...]`，每个线程写独立的 3 个连续位置。移除了原 `assert`（并行循环内不能放 assert），可通过索引构建时的 `assert` 覆盖。
+
+**OpenMP 指令**：`#pragma omp parallel for schedule(static) if (nat >= 256)`
+
+---
+
+## 改点 C：力回填并行化（esolver_dp.cpp 第 104-109 行，`#ifdef __DPMD` 内部）
+
+### 修改前
+
+```cpp
+    for (int i = 0; i < ucell.nat; ++i)
+    {
+        dp_force(i, 0) = f[3 * i] * fact_f;
+        dp_force(i, 1) = f[3 * i + 1] * fact_f;
+        dp_force(i, 2) = f[3 * i + 2] * fact_f;
+    }
+```
+
+### 修改后
+
+```cpp
+    const int nat_f = ucell.nat;
+#pragma omp parallel for schedule(static) if (nat_f >= 256)
+    for (int i = 0; i < nat_f; ++i)
+    {
+        dp_force(i, 0) = f[3 * i] * fact_f;
+        dp_force(i, 1) = f[3 * i + 1] * fact_f;
+        dp_force(i, 2) = f[3 * i + 2] * fact_f;
+    }
+```
+
+**说明**：每原子独立写 `dp_force(i, *)`，无依赖。位于 `#ifdef __DPMD` 内部。
+
+**OpenMP 指令**：`#pragma omp parallel for schedule(static) if (nat >= 256)`
+
+---
+
+## 不做修改的部分
+
+- **Virial 回填**（3×3=9 元素）：线程开销大于收益，保持串行
+- **Cell 填充**（9 元素）：保持串行
+- **`dp.compute()` 调用**：外部库，不进入
+- **`type_map()`**：仅初始化执行一次
+
+## 调用链
+
+- `force_virial()` → `p_esolver->runner()` → `ESolver_DP::runner()`
+
+## 潜在风险
+
+| 风险 | 评估 |
+|------|------|
+| 扁平索引正确性 | 与 `type_map()` 遍历逻辑一致，构建在 `before_all_runners()` 一次性完成 |
+| DPMD 外部库线程冲突 | `dp.compute()` 调用仍在串行区；ABACUS 侧仅并行数据填充/回填 |
+| `__DPMD` 宏兼容 | 改点 A/B 在宏之前，不受影响；改点 C 在宏内部 |
+| 内存开销 | 两个 `std::vector<int>` 各 nat 元素，约 8×nat 字节，可忽略 |
+| 移除了坐标填充后的 `assert` | 正确性由索引构建逻辑保证，与 `type_map()` 结构一致 |
diff --git a/opt_logs/msst_rescale_20260603.md b/opt_logs/msst_rescale_20260603.md
new file mode 100644
index 00000000000..a4da12b820d
--- /dev/null
+++ b/opt_logs/msst_rescale_20260603.md
@@ -0,0 +1,46 @@
+# MSST::rescale() OpenMP 并行化
+
+日期：2026-06-03
+
+## 修改文件
+
+- `source/source_md/msst.cpp`，第 265-270 行
+
+## 修改前代码
+
+```cpp
+    for (int i = 0; i < ucell.nat; ++i)
+    {
+        vel[i][sd] *= dilation[sd];
+    }
+```
+
+## 修改后代码
+
+```cpp
+    const int nat = ucell.nat;
+#pragma omp parallel for schedule(static) if (nat >= 256)
+    for (int i = 0; i < nat; ++i)
+    {
+        vel[i][sd] *= dilation[sd];
+    }
+```
+
+## 改动说明
+
+- 此循环位于 `MSST::rescale()` 末尾，在此之前已完成晶胞矩阵的 dilation 更新和 `unitcell::setup_cell_after_vc()` 调用（均保持串行）。
+- 每次迭代只写 `vel[i][sd]`，读取共享标量 `dilation[sd]` 和常量 `sd`。
+- `sd` 为 MSST 冲击方向（0/1/2），循环内所有迭代使用相同的 `sd` 和 `dilation[sd]`。
+- 使用 `schedule(static)` 保证可重复性和缓存局部性。
+
+## OpenMP 指令
+
+- `#pragma omp parallel for schedule(static) if (nat >= 256)`
+
+## 调用链
+
+- `MSST::second_half()` → `rescale()`
+
+## 潜在风险
+
+- 无。不改变晶胞更新顺序，只并行原子速度的纯缩放写入。
diff --git a/opt_logs/nhc_particle_thermo_20260603.md b/opt_logs/nhc_particle_thermo_20260603.md
new file mode 100644
index 00000000000..0c4a61b2d7f
--- /dev/null
+++ b/opt_logs/nhc_particle_thermo_20260603.md
@@ -0,0 +1,47 @@
+# Nose_Hoover::particle_thermo() OpenMP 并行化
+
+日期：2026-06-03
+
+## 修改文件
+
+- `source/source_md/nhchain.cpp`，第 556-561 行
+
+## 修改前代码
+
+```cpp
+    for (int i = 0; i < ucell.nat; ++i)
+    {
+        vel[i] *= scale;
+    }
+```
+
+## 修改后代码
+
+```cpp
+    const int nat = ucell.nat;
+#pragma omp parallel for schedule(static) if (nat >= 256)
+    for (int i = 0; i < nat; ++i)
+    {
+        vel[i] *= scale;
+    }
+```
+
+## 改动说明
+
+- 此循环位于 `Nose_Hoover::particle_thermo()` 末尾。`scale` 已由前段的 thermostat chain 积分循环（NHC 链变量 `eta`/`v_eta`/`g_eta` 递推）串行计算完毕。
+- thermostat chain 自身的积分具有严格递推依赖（`m` 从 `md_tchain-1` 向 0 迭代 + 多步 Yoshida-Suzuki 分解），保持完全串行。
+- 此循环仅将最终 `scale` 应用到每个原子速度上，每原子独立。
+- 使用 `schedule(static)` + `if (nat >= 256)` 阈值。
+
+## OpenMP 指令
+
+- `#pragma omp parallel for schedule(static) if (nat >= 256)`
+
+## 调用链
+
+- `Nose_Hoover::first_half()` / `second_half()` → `particle_thermo()`
+
+## 潜在风险
+
+- NHC/NPT 物理量对数值误差敏感。并行不改变 `scale` 的计算过程，只改变乘法应用顺序（独立写入，无归约差异）。
+- 多线程一致性需通过 `MODULE_MD_nhc` 单测验证。
diff --git a/opt_logs/verlet_thermalize_20260603.md b/opt_logs/verlet_thermalize_20260603.md
new file mode 100644
index 00000000000..77b1e259a09
--- /dev/null
+++ b/opt_logs/verlet_thermalize_20260603.md
@@ -0,0 +1,47 @@
+# Verlet::thermalize() OpenMP 并行化
+
+日期：2026-06-03
+
+## 修改文件
+
+- `source/source_md/verlet.cpp`，第 122-127 行
+
+## 修改前代码
+
+```cpp
+    for (int i = 0; i < ucell.nat; ++i)
+    {
+        vel[i] *= fac;
+    }
+```
+
+## 修改后代码
+
+```cpp
+    const int nat = ucell.nat;
+#pragma omp parallel for schedule(static) if (nat >= 256)
+    for (int i = 0; i < nat; ++i)
+    {
+        vel[i] *= fac;
+    }
+```
+
+## 改动说明
+
+- 此循环位于 `Verlet::thermalize()` 中，`fac` 由上层温控策略（rescaling / rescale_v / berendsen）在循环外串行计算完毕。
+- 每次迭代只写自己的 `vel[i]`，读取共享标量 `fac`，迭代之间无数据依赖。
+- 使用 `schedule(static)` 保证可重复性和缓存局部性。
+- 使用 `if (nat >= 256)` 避免小体系时线程启动开销。
+
+## OpenMP 指令
+
+- `#pragma omp parallel for schedule(static) if (nat >= 256)`
+
+## 调用链
+
+- `Verlet::apply_thermostat()` → `thermalize()`（rescaling / rescale_v / berendsen 分支）
+- 不涉及 Anderson 分支（该分支使用 `std::rand()` 和 `gaussrand()`，保持串行）。
+
+## 潜在风险
+
+- 无。纯独立写入，不涉及归约或共享状态。

From 546a9c6b287fb2c76faf56b07823babbcec2d976 Mon Sep 17 00:00:00 2001
From: lijianing99 <jianingli304@gmail.com>
Date: Thu, 4 Jun 2026 23:55:47 +0800
Subject: [PATCH 11/12] optimize: add OpenMP to remaining MD per-atom loops
 (rescale_vel, MSST, NHC, FIRE, LJ)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Cover 6 remaining hot-path per-atom loops that were not parallelized
in the prior merge-openmp branch:

- md_func.cpp: rescale_vel() — velocity rescaling factor apply
- msst.cpp: vel_sum() — norm2 reduction, propagate_vel() — exp-based
  velocity propagation (highest compute density among uncovered loops)
- nhchain.cpp: vel_baro() — NPT per-atom velocity scaling
- fire.cpp: check_fire() — triple reduction + velocity mixing + zero
- esolver_lj.cpp: runner() — N² neighbor pair computation with
  schedule(dynamic) for load balancing, per-thread virial accumulation

All optimizations use schedule(static) with nat>=256 threshold
(LJ uses dynamic,32 for neighbor-count imbalance).
No data dependencies changed — all loops are per-atom independent.
No conflict with prior merge-openmp branch.
---
 source/source_esolver/esolver_lj.cpp | 90 +++++++++++++++++++++-------
 source/source_esolver/esolver_lj.h   |  3 +
 source/source_md/fire.cpp            | 11 +++-
 source/source_md/md_func.cpp         |  1 +
 source/source_md/msst.cpp            |  9 ++-
 source/source_md/nhchain.cpp         |  4 +-
 6 files changed, 90 insertions(+), 28 deletions(-)

diff --git a/source/source_esolver/esolver_lj.cpp b/source/source_esolver/esolver_lj.cpp
index 72db00a1db2..066cbce6116 100644
--- a/source/source_esolver/esolver_lj.cpp
+++ b/source/source_esolver/esolver_lj.cpp
@@ -53,6 +53,21 @@ void ESolver_LJ::before_all_runners(UnitCell& ucell, const Input_para& inp)
 
     // calculate the energy shift so that LJ energy is zero at rcut
     cal_en_shift(ucell.ntype, inp.mdp.lj_eshift);
+
+    // build flat atom index cache for OpenMP coordinate access in runner()
+    atom_type_index.resize(ucell.nat);
+    atom_local_index.resize(ucell.nat);
+    int iat = 0;
+    for (int it = 0; it < ucell.ntype; ++it)
+    {
+        for (int ia = 0; ia < ucell.atoms[it].na; ++ia)
+        {
+            atom_type_index[iat] = it;
+            atom_local_index[iat] = ia;
+            ++iat;
+        }
+    }
+    assert(ucell.nat == iat);
 }
 
 void ESolver_LJ::runner(UnitCell& ucell, const int istep)
@@ -62,40 +77,73 @@ void ESolver_LJ::runner(UnitCell& ucell, const int istep)
     neighbor_search.init(ucell_plus, search_radius, 0);
     neighbor_search.build_neighbors();
 
-    double distance = 0.0;
-    int index = 0;
-
     // Important! potential, force, virial must be zero per step
     lj_potential = 0;
     lj_force.zero_out();
     lj_virial.zero_out();
 
-    ModuleBase::Vector3<double> tau1, tau2, dtau;
-    for (int it = 0; it < ucell.ntype; ++it)
+    const int nat = ucell.nat;
+
+#pragma omp parallel
     {
-        Atom* atom1 = &ucell.atoms[it];
-        for (int ia = 0; ia < atom1->na; ++ia)
+        double vl[9] = {0};
+        double pot_local = 0.0;
+
+#pragma omp for schedule(dynamic, 32) if (nat >= 256)
+        for (int iat = 0; iat < nat; ++iat)
         {
-            tau1 = atom1->tau[ia];
-            for (int ad = 0; ad < neighbor_search.neighbor_list.numneigh[index]; ++ad)
+            const int it = atom_type_index[iat];
+            const int ia = atom_local_index[iat];
+            ModuleBase::Vector3<double> tau1 = ucell.atoms[it].tau[ia];
+
+            for (int ad = 0; ad < neighbor_search.neighbor_list.numneigh[iat]; ++ad)
             {
-                tau2.x = neighbor_search.all_atoms[neighbor_search.neighbor_list.firstneigh[index][ad]].position_x;
-                tau2.y = neighbor_search.all_atoms[neighbor_search.neighbor_list.firstneigh[index][ad]].position_y;
-                tau2.z = neighbor_search.all_atoms[neighbor_search.neighbor_list.firstneigh[index][ad]].position_z;
-                int it2 = neighbor_search.all_atoms[neighbor_search.neighbor_list.firstneigh[index][ad]].atom_type;
-                dtau = (tau1 - tau2) * ucell.lat0;
-                distance = dtau.norm();
+                const int neigh_idx = neighbor_search.neighbor_list.firstneigh[iat][ad];
+                ModuleBase::Vector3<double> tau2;
+                tau2.x = neighbor_search.all_atoms[neigh_idx].position_x;
+                tau2.y = neighbor_search.all_atoms[neigh_idx].position_y;
+                tau2.z = neighbor_search.all_atoms[neigh_idx].position_z;
+                int it2 = neighbor_search.all_atoms[neigh_idx].atom_type;
+
+                ModuleBase::Vector3<double> dtau = (tau1 - tau2) * ucell.lat0;
+                double distance = dtau.norm();
+
                 if (distance < lj_rcut(it, it2))
                 {
-                    lj_potential += LJ_energy(distance, it, it2) - en_shift(it, it2);
+                    pot_local += LJ_energy(distance, it, it2) - en_shift(it, it2);
                     ModuleBase::Vector3<double> f_ij = LJ_force(dtau, it, it2);
-                    lj_force(index, 0) += f_ij.x;
-                    lj_force(index, 1) += f_ij.y;
-                    lj_force(index, 2) += f_ij.z;
-                    LJ_virial(f_ij, dtau);
+                    lj_force(iat, 0) += f_ij.x;
+                    lj_force(iat, 1) += f_ij.y;
+                    lj_force(iat, 2) += f_ij.z;
+
+                    // per-thread virial accumulation
+                    vl[0] += dtau.x * f_ij.x;
+                    vl[1] += dtau.x * f_ij.y;
+                    vl[2] += dtau.x * f_ij.z;
+                    vl[3] += dtau.y * f_ij.x;
+                    vl[4] += dtau.y * f_ij.y;
+                    vl[5] += dtau.y * f_ij.z;
+                    vl[6] += dtau.z * f_ij.x;
+                    vl[7] += dtau.z * f_ij.y;
+                    vl[8] += dtau.z * f_ij.z;
                 }
             }
-            index++;
+        }
+
+#pragma omp atomic
+        lj_potential += pot_local;
+
+#pragma omp critical
+        {
+            lj_virial(0, 0) += vl[0];
+            lj_virial(0, 1) += vl[1];
+            lj_virial(0, 2) += vl[2];
+            lj_virial(1, 0) += vl[3];
+            lj_virial(1, 1) += vl[4];
+            lj_virial(1, 2) += vl[5];
+            lj_virial(2, 0) += vl[6];
+            lj_virial(2, 1) += vl[7];
+            lj_virial(2, 2) += vl[8];
         }
     }
 
diff --git a/source/source_esolver/esolver_lj.h b/source/source_esolver/esolver_lj.h
index a8d8d9d1787..473c9bef20f 100644
--- a/source/source_esolver/esolver_lj.h
+++ b/source/source_esolver/esolver_lj.h
@@ -3,6 +3,7 @@
 
 #include "esolver.h"
 #include "source_cell/module_neighlist/unitcell_plus.h"
+#include <vector>
 
 namespace ModuleESolver
 {
@@ -55,6 +56,8 @@ namespace ModuleESolver
         double lj_potential=0.0;
         ModuleBase::matrix lj_force;
         ModuleBase::matrix lj_virial;
+        std::vector<int> atom_type_index;   ///< global atom index to UnitCell atom type
+        std::vector<int> atom_local_index;  ///< global atom index to local index inside atom type
         //---------------------------------------------------
     };
 }
diff --git a/source/source_md/fire.cpp b/source/source_md/fire.cpp
index 8c2ba92b919..3f5d9d0ec3b 100644
--- a/source/source_md/fire.cpp
+++ b/source/source_md/fire.cpp
@@ -189,7 +189,10 @@ void FIRE::check_fire(void)
         dt_max = 2.5 * md_dt;
     }
 
-    for (int i = 0; i < ucell.nat; ++i)
+    const int nat = ucell.nat;
+
+#pragma omp parallel for reduction(+:P, sumforce, normvel) schedule(static) if (nat >= 256)
+    for (int i = 0; i < nat; ++i)
     {
         P += vel[i].x * force[i].x + vel[i].y * force[i].y + vel[i].z * force[i].z;
         sumforce += force[i].norm2();
@@ -199,7 +202,8 @@ void FIRE::check_fire(void)
     sumforce = sqrt(sumforce);
     normvel = sqrt(normvel);
 
-    for (int i = 0; i < ucell.nat; ++i)
+#pragma omp parallel for schedule(static) if (nat >= 256)
+    for (int i = 0; i < nat; ++i)
     {
         for (int j = 0; j < 3; ++j)
         {
@@ -221,7 +225,8 @@ void FIRE::check_fire(void)
         md_dt *= fdec;
         negative_count = 0;
 
-        for (int i = 0; i < ucell.nat; ++i)
+#pragma omp parallel for schedule(static) if (nat >= 256)
+        for (int i = 0; i < nat; ++i)
         {
             for (int j = 0; j < 3; ++j)
             {
diff --git a/source/source_md/md_func.cpp b/source/source_md/md_func.cpp
index 3c7526cd9c2..7b5e8fbd2c4 100644
--- a/source/source_md/md_func.cpp
+++ b/source/source_md/md_func.cpp
@@ -149,6 +149,7 @@ void rescale_vel(const int& natom,
         factor = 0.5 * (3 * natom - frozen_freedom) * temperature / kinetic_energy(natom, vel, allmass);
     }
 
+#pragma omp parallel for schedule(static) if (natom >= 256)
     for (int i = 0; i < natom; i++)
     {
         vel[i] = vel[i] * sqrt(factor);
diff --git a/source/source_md/msst.cpp b/source/source_md/msst.cpp
index 9ec0a9f8860..dcd00b01926 100644
--- a/source/source_md/msst.cpp
+++ b/source/source_md/msst.cpp
@@ -239,8 +239,9 @@ void MSST::restart(const std::string& global_readin_dir)
 double MSST::vel_sum() const
 {
     double vsum = 0;
-
-    for (int i = 0; i < ucell.nat; ++i)
+    const int nat = ucell.nat;
+#pragma omp parallel for reduction(+:vsum) schedule(static) if (nat >= 256)
+    for (int i = 0; i < nat; ++i)
     {
         vsum += vel[i].norm2();
     }
@@ -278,8 +279,10 @@ void MSST::propagate_vel()
         const int sd = mdp.msst_direction;
         const double dthalf = 0.5 * md_dt;
         const double fac = msst_vis * pow(omega[sd], 2) / (vsum * ucell.omega);
+        const int nat = ucell.nat;
 
-        for (int i = 0; i < ucell.nat; ++i)
+#pragma omp parallel for schedule(static) if (nat >= 256)
+        for (int i = 0; i < nat; ++i)
         {
             ModuleBase::Vector3<double> const_C = force[i] / allmass[i];
             ModuleBase::Vector3<double> const_D;
diff --git a/source/source_md/nhchain.cpp b/source/source_md/nhchain.cpp
index 8b3d7c9479e..76cce8da662 100644
--- a/source/source_md/nhchain.cpp
+++ b/source/source_md/nhchain.cpp
@@ -697,7 +697,9 @@ void Nose_Hoover::vel_baro()
         factor[i] = exp(-(v_omega[i] + mtk_term) * md_dt / 4);
     }
 
-    for (int i = 0; i < ucell.nat; ++i)
+    const int nat = ucell.nat;
+#pragma omp parallel for schedule(static) if (nat >= 256)
+    for (int i = 0; i < nat; ++i)
     {
         for (int j = 0; j < 3; ++j)
         {

From 56538dc08ecc8e9efbb1f5b6a113628c87ce4faf Mon Sep 17 00:00:00 2001
From: lijianing99 <jianingli304@gmail.com>
Date: Fri, 5 Jun 2026 00:12:34 +0800
Subject: [PATCH 12/12] fix: move 'if' clause from '#pragma omp for' to
 '#pragma omp parallel'

The 'if' clause is only valid on '#pragma omp parallel', not on
'#pragma omp for' when used inside an explicit parallel region.
This caused a compile error: 'if' is not valid for '#pragma omp for'.
---
 source/source_esolver/esolver_lj.cpp | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/source/source_esolver/esolver_lj.cpp b/source/source_esolver/esolver_lj.cpp
index 066cbce6116..d4bea56767d 100644
--- a/source/source_esolver/esolver_lj.cpp
+++ b/source/source_esolver/esolver_lj.cpp
@@ -84,12 +84,12 @@ void ESolver_LJ::runner(UnitCell& ucell, const int istep)
 
     const int nat = ucell.nat;
 
-#pragma omp parallel
+#pragma omp parallel if (nat >= 256)
     {
         double vl[9] = {0};
         double pot_local = 0.0;
 
-#pragma omp for schedule(dynamic, 32) if (nat >= 256)
+#pragma omp for schedule(dynamic, 32)
         for (int iat = 0; iat < nat; ++iat)
         {
             const int it = atom_type_index[iat];