From 5626b9d69225d30b0da8123ba63bea5d5849f259 Mon Sep 17 00:00:00 2001 From: AntoinePrv Date: Tue, 21 Apr 2026 18:05:14 +0200 Subject: [PATCH 1/3] Add Float16 support for xsimd --- .../descriptions/Float16SupportInXsimd.md | 58 +++++++++++++++++++ src/components/fundable/projectsDetails.ts | 13 +++++ .../Float16SupportInXsimd/GetAQuote.tsx | 9 +++ .../fundable/Float16SupportInXsimd/index.tsx | 9 +++ 4 files changed, 89 insertions(+) create mode 100644 src/components/fundable/descriptions/Float16SupportInXsimd.md create mode 100644 src/pages/fundable/Float16SupportInXsimd/GetAQuote.tsx create mode 100644 src/pages/fundable/Float16SupportInXsimd/index.tsx diff --git a/src/components/fundable/descriptions/Float16SupportInXsimd.md b/src/components/fundable/descriptions/Float16SupportInXsimd.md new file mode 100644 index 00000000..7d635614 --- /dev/null +++ b/src/components/fundable/descriptions/Float16SupportInXsimd.md @@ -0,0 +1,58 @@ +# FP16 Support in xsimd + +xsimd is a C++ header-only library that abstracts SIMD (vectorization) intrinsics behind a single, generic API. +The same code — `xsimd::batch` — compiles to optimal machine code on x86 SSE/AVX, ARM NEON SVE, RISC-V, and WebAssembly, with no runtime overhead. +When an intrinsic is missing on a given target, xsimd falls back gracefully rather than failing or leaving the developer to write platform-specific branches. +This is why projects like Mozilla Firefox, Apache Arrow, Meta Velox, KDE Krita, and Pythran have adopted it as their vectorization layer. + +FP16 — the 16-bit half-precision floating point format — has become a first-class data type in modern computing. +It is the default storage format for large language model weights, the standard precision for neural network inference, +and increasingly the format of choice wherever memory bandwidth is the binding constraint. +Yet consuming or producing FP16 data from C++ SIMD code today requires writing painful, platform-specific intrinsics by hand. 
+xsimd currently has no FP16 support, forcing its users to drop out of the generic API the moment they touch half-precision data. + +We propose to add vectorized FP16 support to xsimd — native FP16 operations where hardware supports them, and correct fallbacks elsewhere. + +## Why FP16 Matters + +**Memory bandwidth is a bottleneck.** Modern CPUs and GPUs are not compute-bound — they are memory-bandwidth-bound. +FP16 cuts data size in half versus FP32. +This means twice as many values fit in cache, twice as many elements move per memory transaction, +and large working sets can stay within L2 or L3 caches without accessing RAM. +The bandwidth saving alone, before any compute consideration, is the primary reason the format matters. + +**SIMD registers double the throughput.** With native arithmetic support, FP16 operations double the number +of floating point numbers processed per CPU cycle when precision is not an issue. + +**FP16 is widely used in AI.** Transformer weights, KV caches, activations, and embeddings are all routinely stored in FP16. +Any library that processes or pipelines this data at any point of the training and inference pipeline must be able to consume and produce FP16 buffers efficiently. +Without xsimd FP16 support, these projects become a limiting factor in an otherwise highly optimized data transformation. + +## Hardware Landscape + +FP16 conversion and arithmetic are now widely available across all major SIMD families: +- **x86**: Early on, the `f16c` feature introduced SIMD conversion between FP16 and FP32 for efficient storage, while arithmetic would still be performed in FP32. + With the AVX-512 generation, support for doing operations directly in FP16 is introduced, with significant speedups. +- **ARM**: FP16 support becomes mandatory in the latest ARM generations (ARM v8.2-a) with arithmetic, conversion, *etc*. + This affects NEON operations on modern smartphones and all Apple silicon M-chips. 
+ Coverage is extended server side with both SVE and SVE2 supporting FP16. + +## Proposed Work + +This proposal covers foundational FP16 support: native FP16 operations on platforms that provide hardware acceleration, and correct, efficient fallbacks everywhere else. + +Concretely, this means: +- A new `xsimd::batch` type (or equivalent half-precision batch specialization) that can be loaded from and stored to FP16 buffers. +- Support for converting from and to `batch`, mapping to the optimal hardware instruction where available, and a correct SIMD algorithm elsewhere. +- Native FP16 arithmetic operations — add, multiply, FMA, min, max, and comparison — on backends that provide hardware support, with FP32-based fallbacks on those that do not + +## Impact + +Funding this development will directly open xsimd to the rapidly growing landscape of LLM and machine +learning workflows: local inference engines, model weight processing, and embedding pipelines. + +Beyond new workloads, this will benefit existing projects using xsimd that already handle FP16 data. +For instance Apache Arrow and Parquet process half-precision columns today without hardware-optimized SIMD support. +These projects stand to benefit directly and with small integration effort. + +##### Are you interested in this project? Either entirely or partially, contact us for more information on how to help us fund it. 
diff --git a/src/components/fundable/projectsDetails.ts b/src/components/fundable/projectsDetails.ts index a28e1efd..76a64d48 100644 --- a/src/components/fundable/projectsDetails.ts +++ b/src/components/fundable/projectsDetails.ts @@ -3,6 +3,7 @@ import JupyterGISRasterProcessingMD from "@site/src/components/fundable/descript import JupyterGISToolsForPythonAPIMD from "@site/src/components/fundable/descriptions/JupyterGISToolsForPythonAPI.md" import EmscriptenForgePackageRequestsMD from "@site/src/components/fundable/descriptions/EmscriptenForgePackageRequests.md" import SVE2SupportInXsimdMD from "@site/src/components/fundable/descriptions/SVE2SupportInXsimd.md" +import Float16SupportInXsimdMD from "@site/src/components/fundable/descriptions/Float16SupportInXsimd.md" import MatrixOperationsInXtensorMD from "@site/src/components/fundable/descriptions/MatrixOperationsInXtensor.md" import BinaryViewInArrowCppMD from "@site/src/components/fundable/descriptions/BinaryViewInArrowCpp.md" import Decimal32InArrowCppMD from"@site/src/components/fundable/descriptions/Decimal32InArrowCpp.md" @@ -75,6 +76,18 @@ export const fundableProjectsDetails = { currentFundingPercentage: 0, repoLink: "https://github.com/xtensor-stack/xsimd" }, + { + category: "Scientific Computing", + title: "Float16 support in xsimd", + pageName: "Float16SupportInXsimd", + shortDescription: "xsimd is a C++ scientific library that abstract low-level high performances computing primitives across different hardwares. 
We will add vectorized support for half-precision 16 bits float operations where hardware supports them, and correct fallbacks elsewhere.", + description: Float16SupportInXsimdMD, + price: "20 000 €", + maxNbOfFunders: 2, + currentNbOfFunders: 0, + currentFundingPercentage: 0, + repoLink: "https://github.com/xtensor-stack/xsimd" + }, { category: "Scientific Computing", title: "Implementing Kazushige Goto Algorithms for Matrix Operations in xtensor", diff --git a/src/pages/fundable/Float16SupportInXsimd/GetAQuote.tsx b/src/pages/fundable/Float16SupportInXsimd/GetAQuote.tsx new file mode 100644 index 00000000..963582a8 --- /dev/null +++ b/src/pages/fundable/Float16SupportInXsimd/GetAQuote.tsx @@ -0,0 +1,9 @@ +import useDocusaurusContext from '@docusaurus/useDocusaurusContext'; +import GetAQuotePage from '@site/src/components/fundable/GetAQuotePage'; + +export default function FundablePage() { + const { siteConfig } = useDocusaurusContext(); + return ( + + ); +} diff --git a/src/pages/fundable/Float16SupportInXsimd/index.tsx b/src/pages/fundable/Float16SupportInXsimd/index.tsx new file mode 100644 index 00000000..876857af --- /dev/null +++ b/src/pages/fundable/Float16SupportInXsimd/index.tsx @@ -0,0 +1,9 @@ +import useDocusaurusContext from '@docusaurus/useDocusaurusContext'; +import LargeProjectCardPage from '@site/src/components/fundable/LargeProjectCardPage'; + +export default function FundablePage() { + const { siteConfig } = useDocusaurusContext(); + return ( + + ); +} From ed54efcb072736c3d6e998e6932f61e3e0379067 Mon Sep 17 00:00:00 2001 From: AntoinePrv Date: Tue, 21 Apr 2026 18:32:26 +0200 Subject: [PATCH 2/3] Fix titles --- .../fundable/descriptions/Float16SupportInXsimd.md | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/src/components/fundable/descriptions/Float16SupportInXsimd.md b/src/components/fundable/descriptions/Float16SupportInXsimd.md index 7d635614..4a5f576f 100644 --- 
a/src/components/fundable/descriptions/Float16SupportInXsimd.md +++ b/src/components/fundable/descriptions/Float16SupportInXsimd.md @@ -1,6 +1,6 @@ -# FP16 Support in xsimd +#### Overview -xsimd is a C++ header-only library that abstracts SIMD (vectorization) intrinsics behind a single, generic API. +Xsimd is a C++ header-only library that abstracts SIMD (vectorization) intrinsics behind a single, generic API. The same code — `xsimd::batch` — compiles to optimal machine code on x86 SSE/AVX, ARM NEON SVE, RISC-V, and WebAssembly, with no runtime overhead. When an intrinsic is missing on a given target, xsimd falls back gracefully rather than failing or leaving the developer to write platform-specific branches. This is why projects like Mozilla Firefox, Apache Arrow, Meta Velox, KDE Krita, and Pythran have adopted it as their vectorization layer. @@ -13,7 +13,7 @@ xsimd currently has no FP16 support, forcing its users to drop out of the generi We propose to add vectorized FP16 support to xsimd — native FP16 operations where hardware supports them, and correct fallbacks elsewhere. -## Why FP16 Matters +#### Why FP16 Matters **Memory bandwidth is a bottleneck.** Modern CPUs and GPUs are not compute-bound — they are memory-bandwidth-bound. FP16 cuts data size in half versus FP32. @@ -37,7 +37,7 @@ FP16 conversion and arithmetic are now widely available across all major SIMD fa This affects NEON operations on modern smartphones and all Apple silicon M-chips. Coverage is extended server side with both SVE and SVE2 supporting FP16. -## Proposed Work +#### Proposed Work This proposal covers foundational FP16 support: native FP16 operations on platforms that provide hardware acceleration, and correct, efficient fallbacks everywhere else. @@ -46,7 +46,7 @@ Concretely, this means: - Support for converting from and to `batch`, mapping to the optimal hardware instruction where available, and a correct SIMD algorithm elsewhere. 
- Native FP16 arithmetic operations — add, multiply, FMA, min, max, and comparison — on backends that provide hardware support, with FP32-based fallbacks on those that do not -## Impact +#### Impact Funding this development will directly open xsimd to the rapidly growing landscape of LLM and machine learning workflows: local inference engines, model weight processing, and embedding pipelines. From da293ff5436aece4af3ce1a521133369425eee25 Mon Sep 17 00:00:00 2001 From: AntoinePrv Date: Tue, 21 Apr 2026 18:35:51 +0200 Subject: [PATCH 3/3] Fix grammar --- src/components/fundable/projectsDetails.ts | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/components/fundable/projectsDetails.ts b/src/components/fundable/projectsDetails.ts index 76a64d48..98e04541 100644 --- a/src/components/fundable/projectsDetails.ts +++ b/src/components/fundable/projectsDetails.ts @@ -68,7 +68,7 @@ export const fundableProjectsDetails = { category: "Scientific Computing", title: "SVE2 support in xsimd", pageName: "SVE2SupportInXsimd", - shortDescription: "xsimd is a C++ scientific library that abstract low-level high performances computing primitives across different hardwares. We will add support for the latest SVE2 generation of ARM CPUs.", + shortDescription: "xsimd is a C++ scientific library that abstracts low-level high performances computing primitives across different hardwares. We will add support for the latest SVE2 generation of ARM CPUs.", description: SVE2SupportInXsimdMD, price: "30 000 €", maxNbOfFunders: 2, @@ -80,7 +80,7 @@ export const fundableProjectsDetails = { category: "Scientific Computing", title: "Float16 support in xsimd", pageName: "Float16SupportInXsimd", - shortDescription: "xsimd is a C++ scientific library that abstract low-level high performances computing primitives across different hardwares. 
We will add vectorized support for half-precision 16 bits float operations where hardware supports them, and correct fallbacks elsewhere.", + shortDescription: "xsimd is a C++ scientific library that abstracts low-level high performances computing primitives across different hardwares. We will add vectorized support for half-precision 16 bits float operations where hardware supports them, and correct fallbacks elsewhere.", description: Float16SupportInXsimdMD, price: "20 000 €", maxNbOfFunders: 2,