diff --git a/NOTICE b/NOTICE index 31787fecbf0..c50ea163f76 100644 --- a/NOTICE +++ b/NOTICE @@ -118,3 +118,10 @@ LS-HPACK provides functionality to encode and decode HTTP headers using HPACK compression mechanism specified in RFC 7541. Copyright (c) 2018 - 2023 LiteSpeed Technologies Inc, (MIT License) https://github.com/litespeedtech/ls-hpack.git + +~~ + +include/tscore/ink_ascii_tolower.h AVX-512BW kernel design (fused +mask_add and masked-tail load/store) is adapted from Tony Finch's +copytolower64.c (0BSD OR MIT-0). +https://dotat.at/cgi/git/vectolower.git/ diff --git a/include/tscore/ink_ascii_tolower.h b/include/tscore/ink_ascii_tolower.h new file mode 100644 index 00000000000..d98aafb48dc --- /dev/null +++ b/include/tscore/ink_ascii_tolower.h @@ -0,0 +1,185 @@ +/** @file + + SIMD-accelerated bulk ASCII tolower copy. + + Used on the URL canonicalization fast path for cache-key digests + (src/proxy/hdrs/URL.cc::url_CryptoHash_get_fast) and any other place + that needs to fold ASCII to lowercase over a small-to-moderate + buffer. The scalar byte-at-a-time loop is the bottleneck for hosts + and schemes long enough to vectorize; for shorter inputs the scalar + tail handles them with no SIMD overhead. + + Semantics match a byte-at-a-time loop using ParseRules::ink_tolower(): + + - Bytes in 'A'..'Z' (0x41..0x5A) have bit 5 set, mapping them to + 'a'..'z'. All other bytes (including 0x80..0xFF) pass through + unchanged. There is no UTF-8 case folding. + + - In-place use (dst == src) is supported on every path. Each SIMD + body loads a full block into a register before storing back at + the same offset, and the AVX-512BW masked tail does masked-load + / masked-store at the same offset. Partial overlap where + dst != src but the ranges intersect is not supported. + + Implementation note: selection is purely compile-time; no runtime + dispatch. Bodies are stacked widest-first. + + - AVX-512BW builds: when n >= 64, a 64-byte main loop handles the + bulk and a single masked load/store finishes any 1..63-byte tail, + then we return. When n < 64, we fall through to the AVX2 + SSE2 + cascade below so tiny inputs avoid the masked tail's fixed setup + cost. + + - AVX2 builds: a 32-byte main loop drains to a 16-byte SSE2 step + and then to a scalar tail of 0..15 bytes. + + - SSE2 / NEON builds: a single 16-byte main loop drains to a + scalar tail. + + - Other targets: scalar only. + + @section license License + + Licensed to the Apache Software Foundation (ASF) under one + or more contributor license agreements. See the NOTICE file + distributed with this work for additional information + regarding copyright ownership. The ASF licenses this file + to you under the Apache License, Version 2.0 (the + "License"); you may not use this file except in compliance + with the License. You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. + */ +#pragma once + +#include +#include + +#if defined(__AVX512BW__) || defined(__AVX2__) || defined(__SSE2__) +#include +#elif defined(__ARM_NEON) || defined(__aarch64__) +#include +#endif + +namespace ts::ascii +{ + +inline void +tolower_copy(char *dst, const char *src, std::size_t n) noexcept +{ +#if defined(__AVX512BW__) + // AVX-512BW: 64 bytes per iteration with two key optimizations over the + // narrower paths: + // - _mm512_mask_add_epi8 fuses the "+0x20 where upper" into a single + // op (no separate maskz_set1 + or). + // - A masked load/store handles the 1..63-byte tail in a single SIMD + // pass, so we don't need to cascade to AVX2/SSE2 to drain the + // remainder. + // + // The masked tail does carry ~7 ns of fixed setup cost, which loses to + // the cascade on short inputs. Gating the whole block on n >= 64 means + // tiny inputs fall through to the AVX2/SSE2 path below, where they keep + // the speedup that path already provides. + // + // Adapted from Tony Finch's copytolower64.c (see NOTICE). + if (n >= 64) { + const __m512i A_vec = _mm512_set1_epi8('A'); + const __m512i Z_vec = _mm512_set1_epi8('Z'); + const __m512i delta = _mm512_set1_epi8('a' - 'A'); + do { + __m512i bytes = _mm512_loadu_epi8(src); + __mmask64 is_upper = _mm512_cmpge_epi8_mask(bytes, A_vec) & _mm512_cmple_epi8_mask(bytes, Z_vec); + _mm512_storeu_epi8(dst, _mm512_mask_add_epi8(bytes, is_upper, bytes, delta)); + src += 64; + dst += 64; + n -= 64; + } while (n >= 64); + if (n != 0) { + auto len_mask = static_cast<__mmask64>((~0ULL) >> (64 - n)); + __m512i bytes = _mm512_maskz_loadu_epi8(len_mask, src); + __mmask64 is_upper = _mm512_cmpge_epi8_mask(bytes, A_vec) & _mm512_cmple_epi8_mask(bytes, Z_vec); + _mm512_mask_storeu_epi8(dst, len_mask, _mm512_mask_add_epi8(bytes, is_upper, bytes, delta)); + } + return; + } +#endif + +#if defined(__AVX2__) + // 32 bytes per iteration. Same compare-and-OR pattern as SSE2. + { + const __m256i a_minus_one = _mm256_set1_epi8('A' - 1); + const __m256i z_plus_one = _mm256_set1_epi8('Z' + 1); + const __m256i bit5 = _mm256_set1_epi8(0x20); + while (n >= 32) { + __m256i bytes = _mm256_loadu_si256(reinterpret_cast(src)); + __m256i ge_A = _mm256_cmpgt_epi8(bytes, a_minus_one); + __m256i le_Z = _mm256_cmpgt_epi8(z_plus_one, bytes); + __m256i mask = _mm256_and_si256(_mm256_and_si256(ge_A, le_Z), bit5); + _mm256_storeu_si256(reinterpret_cast<__m256i *>(dst), _mm256_or_si256(bytes, mask)); + src += 32; + dst += 32; + n -= 32; + } + } +#endif + +#if defined(__SSE2__) + // 16 bytes per iteration. Signed compare works for ASCII A-Z because all + // letters live below 0x80; high bytes (0x80..0xFF) compare as negative + // and correctly miss the [A,Z] range so they pass through unchanged. + { + const __m128i a_minus_one = _mm_set1_epi8('A' - 1); + const __m128i z_plus_one = _mm_set1_epi8('Z' + 1); + const __m128i bit5 = _mm_set1_epi8(0x20); + while (n >= 16) { + __m128i bytes = _mm_loadu_si128(reinterpret_cast(src)); + __m128i ge_A = _mm_cmpgt_epi8(bytes, a_minus_one); + __m128i le_Z = _mm_cmpgt_epi8(z_plus_one, bytes); + __m128i mask = _mm_and_si128(_mm_and_si128(ge_A, le_Z), bit5); + _mm_storeu_si128(reinterpret_cast<__m128i *>(dst), _mm_or_si128(bytes, mask)); + src += 16; + dst += 16; + n -= 16; + } + } +#elif defined(__ARM_NEON) || defined(__aarch64__) + // 16 bytes per iteration; unsigned compare available natively. + { + const uint8x16_t a_minus_one = vdupq_n_u8('A' - 1); + const uint8x16_t z_plus_one = vdupq_n_u8('Z' + 1); + const uint8x16_t bit5 = vdupq_n_u8(0x20); + while (n >= 16) { + uint8x16_t bytes = vld1q_u8(reinterpret_cast(src)); + uint8x16_t ge_A = vcgtq_u8(bytes, a_minus_one); + uint8x16_t le_Z = vcltq_u8(bytes, z_plus_one); + uint8x16_t mask = vandq_u8(vandq_u8(ge_A, le_Z), bit5); + vst1q_u8(reinterpret_cast(dst), vorrq_u8(bytes, mask)); + src += 16; + dst += 16; + n -= 16; + } + } +#endif + + while (n--) { + auto c = static_cast(*src++); + *dst++ = (c >= 'A' && c <= 'Z') ? static_cast(c | 0x20) : static_cast(c); + } +} + +// Thin sugar over tolower_copy for the in-place case. Makes call sites +// like ts::ascii::tolower_inplace(buf, n) read naturally instead of +// ts::ascii::tolower_copy(buf, buf, n). +inline void +tolower_inplace(char *buf, std::size_t n) noexcept +{ + tolower_copy(buf, buf, n); +} + +} // namespace ts::ascii diff --git a/src/proxy/hdrs/URL.cc b/src/proxy/hdrs/URL.cc index 68d84b5d481..1c9eb4170b2 100644 --- a/src/proxy/hdrs/URL.cc +++ b/src/proxy/hdrs/URL.cc @@ -25,6 +25,7 @@ #include #include "tscore/ink_platform.h" #include "tscore/ink_memory.h" +#include "tscore/ink_ascii_tolower.h" #include "proxy/hdrs/URL.h" #include "proxy/hdrs/MIME.h" #include "proxy/hdrs/HTTP.h" @@ -1684,16 +1685,6 @@ url_describe(HdrHeapObjImpl *raw, bool /* recurse ATS_UNUSED */) * * ***********************************************************************/ -static inline void -memcpy_tolower(char *d, const char *s, int n) -{ - while (n--) { - *d = ParseRules::ink_tolower(*s); - s++; - d++; - } -} - // fast path for CryptoHash, HTTP, no user/password/params/query, // no buffer overflow, no unescaping needed @@ -1704,7 +1695,7 @@ url_CryptoHash_get_fast(const URLImpl *url, CryptoContext &ctx, CryptoHash *hash char *p; p = buffer; - memcpy_tolower(p, url->m_ptr_scheme, url->m_len_scheme); + ts::ascii::tolower_copy(p, url->m_ptr_scheme, url->m_len_scheme); p += url->m_len_scheme; *p++ = ':'; *p++ = '/'; @@ -1713,7 +1704,7 @@ url_CryptoHash_get_fast(const URLImpl *url, CryptoContext &ctx, CryptoHash *hash *p++ = ':'; // no password *p++ = '@'; - memcpy_tolower(p, url->m_ptr_host, url->m_len_host); + ts::ascii::tolower_copy(p, url->m_ptr_host, url->m_len_host); p += url->m_len_host; *p++ = '/'; memcpy(p, url->m_ptr_path, url->m_len_path); diff --git a/src/proxy/hdrs/unit_tests/test_URL.cc b/src/proxy/hdrs/unit_tests/test_URL.cc index dc5ff4ade74..8dda4e9501a 100644 --- a/src/proxy/hdrs/unit_tests/test_URL.cc +++ b/src/proxy/hdrs/unit_tests/test_URL.cc @@ -659,6 +659,39 @@ std::vector get_hash_test_cases = { !IGNORE_QUERY, HAS_EQUAL_HASH, }, + { + // Verifies the scheme/host SIMD-tolower path in url_CryptoHash_get_fast: + // an uppercase host with a long enough prefix to hit the 16-byte SIMD + // body should hash identically to its lowercased form. + "Uppercase host: equal hashes", + "http://ONE.EXAMPLE.COM/a/path?name=value", + "http://one.example.com/a/path?name=value", + !IGNORE_QUERY, + HAS_EQUAL_HASH, + }, + { + "Mixed-case host: equal hashes", + "http://One.Example.Com/a/path?name=value", + "http://one.example.com/a/path?name=value", + !IGNORE_QUERY, + HAS_EQUAL_HASH, + }, + { + "Uppercase scheme: equal hashes", + "HTTP://one.example.com/a/path?name=value", + "http://one.example.com/a/path?name=value", + !IGNORE_QUERY, + HAS_EQUAL_HASH, + }, + { + // Long uppercase host crosses 16- and 32-byte SIMD body boundaries so + // the wider paths (when compiled in) are exercised by this fixture. + "Long uppercase host: equal hashes", + "http://A-VERY-LONG-HOST-NAME-FOR-SIMD.EXAMPLE.COM/a/path", + "http://a-very-long-host-name-for-simd.example.com/a/path", + !IGNORE_QUERY, + HAS_EQUAL_HASH, + }, }; /** Return the hash related to a URI. diff --git a/src/proxy/http/remap/UrlRewrite.cc b/src/proxy/http/remap/UrlRewrite.cc index fbda217462b..d2af3b343e9 100644 --- a/src/proxy/http/remap/UrlRewrite.cc +++ b/src/proxy/http/remap/UrlRewrite.cc @@ -22,6 +22,8 @@ */ +#include "tscore/ink_ascii_tolower.h" + #include "proxy/http/remap/UrlRewrite.h" #include "proxy/http/remap/RemapYamlConfig.h" #include "iocore/eventsystem/ConfigProcessor.h" @@ -931,10 +933,7 @@ UrlRewrite::_mappingLookup(MappingsStore &mappings, URL *request_url, int reques return false; } - // lowercase - for (int i = 0; i < request_host_len; ++i) { - request_host_lower[i] = tolower(request_host[i]); - } + ts::ascii::tolower_copy(request_host_lower, request_host, request_host_len); request_host_lower[request_host_len] = 0; bool retval = false; diff --git a/src/proxy/http/remap/unit-tests/test_RemapRules.cc b/src/proxy/http/remap/unit-tests/test_RemapRules.cc index 012e8b55bd7..6f2dfc32668 100644 --- a/src/proxy/http/remap/unit-tests/test_RemapRules.cc +++ b/src/proxy/http/remap/unit-tests/test_RemapRules.cc @@ -225,3 +225,58 @@ map_with_recv_port http://front.example.com \ } } } + +SCENARIO("UrlRewrite host lookup is case-insensitive", "[proxy][remap]") +{ + // _mappingLookup lower-cases the request host before consulting the hash + // table; these scenarios exercise that path with inputs that would not + // match in a strict byte-compare. Sized to cross the 16-byte SSE2 body + // for hosts that get a real SIMD pass. + GIVEN("A forward map with a lowercase source host") + { + auto urlrw = std::make_unique(); + std::string config = R"RMCFG( +map http://www.example.com http://origin.example.com + )RMCFG"; + + auto cpath = write_test_remap(config, "case_insensitive"); + int rc = urlrw->BuildTable(cpath.c_str()); + REQUIRE(rc == TS_SUCCESS); + REQUIRE(urlrw->rule_count() == 1); + + EasyURL url("http://www.example.com"); + UrlMappingContainer urlmap; + + THEN("uppercase request host matches the lowercase rule") + { + const char *host = "WWW.EXAMPLE.COM"; + REQUIRE(urlrw->forwardMappingLookup(&url.url, 80, host, strlen(host), urlmap)); + } + THEN("mixed-case request host matches the lowercase rule") + { + const char *host = "Www.Example.Com"; + REQUIRE(urlrw->forwardMappingLookup(&url.url, 80, host, strlen(host), urlmap)); + } + } + + GIVEN("A forward map with a long host that exercises the 16-byte SIMD body") + { + auto urlrw = std::make_unique(); + std::string config = R"RMCFG( +map http://a-very-long-host-name-for-simd.example.com http://origin.example.com + )RMCFG"; + + auto cpath = write_test_remap(config, "case_insensitive_long"); + int rc = urlrw->BuildTable(cpath.c_str()); + REQUIRE(rc == TS_SUCCESS); + + EasyURL url("http://a-very-long-host-name-for-simd.example.com"); + UrlMappingContainer urlmap; + + THEN("an all-uppercase 49-char host (covers >=32 SIMD bytes) matches") + { + const char *host = "A-VERY-LONG-HOST-NAME-FOR-SIMD.EXAMPLE.COM"; + REQUIRE(urlrw->forwardMappingLookup(&url.url, 80, host, strlen(host), urlmap)); + } + } +} diff --git a/src/proxy/http2/HPACK.cc b/src/proxy/http2/HPACK.cc index 7e4fd974f57..34ff607ced2 100644 --- a/src/proxy/http2/HPACK.cc +++ b/src/proxy/http2/HPACK.cc @@ -23,6 +23,7 @@ #include "proxy/http2/HPACK.h" +#include "tscore/ink_ascii_tolower.h" #include "tsutil/LocalBuffer.h" #include "swoc/TextView.h" @@ -789,9 +790,7 @@ hpack_encode_header_block(HpackIndexingTable &indexing_table, uint8_t *out_buf, int name_len = original_name.size(); ts::LocalBuffer local_buffer(name_len); char *lower_name = local_buffer.data(); - for (int i = 0; i < name_len; i++) { - lower_name[i] = ParseRules::ink_tolower(original_name[i]); - } + ts::ascii::tolower_copy(lower_name, original_name.data(), name_len); std::string_view name{lower_name, static_cast(name_len)}; std::string_view value = field.value_get(); diff --git a/src/proxy/http2/unit_tests/test_HpackIndexingTable.cc b/src/proxy/http2/unit_tests/test_HpackIndexingTable.cc index ad373211fb8..7692931b1af 100644 --- a/src/proxy/http2/unit_tests/test_HpackIndexingTable.cc +++ b/src/proxy/http2/unit_tests/test_HpackIndexingTable.cc @@ -24,6 +24,7 @@ limitations under the License. */ +#include #include #include @@ -531,3 +532,34 @@ TEST_CASE("HPACK high level APIs", "[hpack]") } } } + +// Validates that hpack_encode_header_block() lower-cases mixed-case field +// names per RFC 7540 ยง 8.1.2 before emitting them. The lower-case step is the +// path that goes through ts::ascii::tolower_copy; if a regression broke the +// lowercasing, the byte-for-byte comparison below would fail. +TEST_CASE("HPACK encode lower-cases mixed-case field names", "[hpack]") +{ + uint8_t buf_mixed[BUFSIZE_FOR_REGRESSION_TEST]; + uint8_t buf_lower[BUFSIZE_FOR_REGRESSION_TEST]; + HpackIndexingTable table_mixed(MAX_TABLE_SIZE); + HpackIndexingTable table_lower(MAX_TABLE_SIZE); + + // Use a name long enough to exercise the 16-byte SSE2 body when present. + auto encode_one = [](HpackIndexingTable &table, uint8_t *buf, const char *name, const char *value) -> int64_t { + std::unique_ptr headers(new HTTPHdr, destroy_http_hdr); + headers->create(HTTPType::REQUEST); + MIMEField *field = mime_field_create(headers->m_heap, headers->m_http->m_fields_impl); + field->name_set(headers->m_heap, headers->m_http->m_fields_impl, std::string_view{name}); + field->value_set(headers->m_heap, headers->m_http->m_fields_impl, std::string_view{value}); + mime_hdr_field_attach(headers->m_http->m_fields_impl, field, 1, nullptr); + std::memset(buf, 0, BUFSIZE_FOR_REGRESSION_TEST); + return hpack_encode_header_block(table, buf, BUFSIZE_FOR_REGRESSION_TEST, headers.get()); + }; + + int64_t mixed_len = encode_one(table_mixed, buf_mixed, "Long-Custom-Header-Name", "abc"); + int64_t lower_len = encode_one(table_lower, buf_lower, "long-custom-header-name", "abc"); + + REQUIRE(mixed_len > 0); + REQUIRE(mixed_len == lower_len); + REQUIRE(std::memcmp(buf_mixed, buf_lower, lower_len) == 0); +} diff --git a/src/proxy/http3/QPACK.cc b/src/proxy/http3/QPACK.cc index dfdd2d278b3..92ac2b024f6 100644 --- a/src/proxy/http3/QPACK.cc +++ b/src/proxy/http3/QPACK.cc @@ -25,6 +25,7 @@ #include "proxy/hdrs/XPACK.h" #include "proxy/http3/QPACK.h" #include "tscore/ink_defs.h" +#include "tscore/ink_ascii_tolower.h" #include "tscore/ink_memory.h" #define QPACKDebug(fmt, ...) Dbg(dbg_ctl_qpack, "[%s] " fmt, this->_qc->cids().data(), ##__VA_ARGS__) @@ -369,9 +370,7 @@ QPACK::_encode_header(const MIMEField &field, uint16_t base_index, IOBufferBlock { auto name{field.name_get()}; char *lowered_name = this->_arena.str_store(name.data(), name.length()); - for (size_t i = 0; i < name.length(); i++) { - lowered_name[i] = ParseRules::ink_tolower(lowered_name[i]); - } + ts::ascii::tolower_inplace(lowered_name, name.length()); auto value{field.value_get()}; // TODO Set never_index flag on/off according to encoding headers diff --git a/src/tscore/CMakeLists.txt b/src/tscore/CMakeLists.txt index 7790adc87dd..ceb770d7ecc 100644 --- a/src/tscore/CMakeLists.txt +++ b/src/tscore/CMakeLists.txt @@ -158,6 +158,7 @@ if(BUILD_TESTING) unit_tests/test_Throttler.cc unit_tests/test_Tokenizer.cc unit_tests/test_arena.cc + unit_tests/test_ink_ascii_tolower.cc unit_tests/test_ink_inet.cc unit_tests/test_ink_memory.cc unit_tests/test_ink_string.cc diff --git a/src/tscore/unit_tests/test_ink_ascii_tolower.cc b/src/tscore/unit_tests/test_ink_ascii_tolower.cc new file mode 100644 index 00000000000..5f2cb8df3c9 --- /dev/null +++ b/src/tscore/unit_tests/test_ink_ascii_tolower.cc @@ -0,0 +1,132 @@ +/** @file + + Unit tests for ts::ascii::tolower_copy and ts::ascii::tolower_inplace. + + Runs as part of the standard test_tscore binary so the helper's SIMD + and scalar paths are exercised by ctest in every build, not just when + ENABLE_BENCHMARKS is set. + + @section license License + + Licensed to the Apache Software Foundation (ASF) under one + or more contributor license agreements. See the NOTICE file + distributed with this work for additional information + regarding copyright ownership. The ASF licenses this file + to you under the Apache License, Version 2.0 (the + "License"); you may not use this file except in compliance + with the License. You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. + */ + +#include + +#include "tscore/ink_ascii_tolower.h" +#include "tscore/ParseRules.h" + +#include +#include +#include +#include + +namespace +{ + +// Same mixed-case ASCII distribution we use in the benchmark, so the unit +// tests exercise inputs that look like real URL/header bytes. +std::vector +make_mixed_case_ascii(std::size_t n, std::uint64_t seed) +{ + std::mt19937_64 rng(seed); + std::vector v(n); + for (std::size_t i = 0; i < n; ++i) { + auto r = static_cast(rng() & 0x3FU); + if (r < 26U) { + v[i] = static_cast('A' + r); + } else if (r < 52U) { + v[i] = static_cast('a' + (r - 26U)); + } else { + static constexpr char kNonAlpha[] = "0123456789-_./:"; + v[i] = kNonAlpha[r % (sizeof(kNonAlpha) - 1U)]; + } + } + return v; +} + +// Byte-at-a-time reference, equivalent to the prior static-inline +// memcpy_tolower in URL.cc. Anything ts::ascii::tolower_copy produces must +// match this for every input we test. +void +tolower_reference(char *d, const char *s, std::size_t n) noexcept +{ + while (n--) { + *d = ParseRules::ink_tolower(*s); + ++s; + ++d; + } +} + +} // namespace + +TEST_CASE("ts::ascii::tolower_copy matches scalar reference", "[ts_ascii_tolower]") +{ + // Bracket every SIMD body width (16/32/64) with both equal-to and + // offset-from-multiple lengths so the cascade transitions and the + // AVX-512BW masked tail are all exercised. + for (std::size_t sz : std::array{0, 1, 5, 15, 16, 17, 23, 31, 32, 33, 63, 64, 65, 257}) { + auto input = make_mixed_case_ascii(sz, 0xC0FFEE + sz); + std::vector expected(sz); + std::vector actual(sz); + + tolower_reference(expected.data(), input.data(), sz); + ts::ascii::tolower_copy(actual.data(), input.data(), sz); + + CAPTURE(sz); + REQUIRE(actual == expected); + } +} + +TEST_CASE("ts::ascii::tolower_copy preserves non-ASCII bytes", "[ts_ascii_tolower]") +{ + // Every byte value 0..255 should round-trip unchanged unless it is in + // 'A'..'Z', in which case it should map to 'a'..'z'. Guards against any + // future "speed-up" that widens the case-fold range past ASCII. + std::array input; + for (std::size_t i = 0; i < 256; ++i) { + input[i] = static_cast(i); + } + std::array output; + ts::ascii::tolower_copy(output.data(), reinterpret_cast(input.data()), input.size()); + + for (std::size_t i = 0; i < 256; ++i) { + auto in = static_cast(i); + auto out = static_cast(output[i]); + auto exp = (in >= 'A' && in <= 'Z') ? static_cast(in | 0x20) : in; + CAPTURE(i); + REQUIRE(out == exp); + } +} + +TEST_CASE("ts::ascii::tolower_inplace matches tolower_copy", "[ts_ascii_tolower]") +{ + // The inplace form must produce the same result as a non-overlapping copy. + // Exercise the same boundary sizes so the SIMD bodies and the AVX-512BW + // masked load/store are all exercised in-place. + for (std::size_t sz : std::array{0, 1, 5, 15, 16, 17, 23, 31, 32, 33, 63, 64, 65, 257}) { + auto input = make_mixed_case_ascii(sz, 0xBADF00D + sz); + std::vector expected(sz); + std::vector in_place(input); + + tolower_reference(expected.data(), input.data(), sz); + ts::ascii::tolower_inplace(in_place.data(), sz); + + CAPTURE(sz); + REQUIRE(in_place == expected); + } +} diff --git a/tools/benchmark/CMakeLists.txt b/tools/benchmark/CMakeLists.txt index 49f25fad1c1..c08e9a94d12 100644 --- a/tools/benchmark/CMakeLists.txt +++ b/tools/benchmark/CMakeLists.txt @@ -36,6 +36,9 @@ target_link_libraries(benchmark_SharedMutex PRIVATE Catch2::Catch2 ts::tscore li add_executable(benchmark_Random benchmark_Random.cc) target_link_libraries(benchmark_Random PRIVATE Catch2::Catch2WithMain ts::tscore) +add_executable(benchmark_ascii_tolower benchmark_ascii_tolower.cc) +target_link_libraries(benchmark_ascii_tolower PRIVATE Catch2::Catch2WithMain ts::tscore) + add_executable(benchmark_HostDB benchmark_HostDB.cc) target_link_libraries( benchmark_HostDB diff --git a/tools/benchmark/benchmark_ascii_tolower.cc b/tools/benchmark/benchmark_ascii_tolower.cc new file mode 100644 index 00000000000..4c799713c99 --- /dev/null +++ b/tools/benchmark/benchmark_ascii_tolower.cc @@ -0,0 +1,136 @@ +/** @file + + Micro benchmark for ts::ascii::tolower_copy against a byte-at-a-time + scalar loop equivalent to the prior URL.cc::memcpy_tolower definition. + + @section license License + + Licensed to the Apache Software Foundation (ASF) under one + or more contributor license agreements. See the NOTICE file + distributed with this work for additional information + regarding copyright ownership. The ASF licenses this file + to you under the Apache License, Version 2.0 (the + "License"); you may not use this file except in compliance + with the License. You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. + */ + +#define CATCH_CONFIG_ENABLE_BENCHMARKING + +#include +#include +#include + +#include "tscore/ink_ascii_tolower.h" +#include "tscore/ParseRules.h" + +#include +#include +#include +#include +#include +#include +#include + +namespace +{ + +// Sizes chosen to mirror the URL.cc hot path: +// 4-8B - common HTTP scheme strings ("http", "https") +// 16-32B - typical host names +// 64-256B - long host names / cache-key segments +// 1024B - stress the inner loop +constexpr std::array kSizes{4, 8, 16, 24, 32, 64, 256, 1024}; + +// Same character distribution we expect from URL/host input: ASCII letters +// (mixed case), digits, and the small set of non-alpha bytes that legitimately +// appear in URLs. +std::vector +make_mixed_case_ascii(std::size_t n, std::uint64_t seed = 0xABCDEFULL) +{ + std::mt19937_64 rng(seed); + std::vector v(n); + for (std::size_t i = 0; i < n; ++i) { + auto r = static_cast(rng() & 0x3FU); + if (r < 26U) { + v[i] = static_cast('A' + r); + } else if (r < 52U) { + v[i] = static_cast('a' + (r - 26U)); + } else { + static constexpr char kNonAlpha[] = "0123456789-_./:"; + v[i] = kNonAlpha[r % (sizeof(kNonAlpha) - 1U)]; + } + } + return v; +} + +// Mirror of the prior static inline memcpy_tolower() from URL.cc, kept here +// as the baseline the SIMD path is expected to beat. +inline void +tolower_scalar(char *d, const char *s, std::size_t n) noexcept +{ + while (n--) { + *d = ParseRules::ink_tolower(*s); + ++s; + ++d; + } +} + +} // namespace + +TEST_CASE("active SIMD configuration", "[tolower][config]") +{ + // Print the compile-time ISA path so the benchmark output makes the + // selected configuration obvious. + std::cout << "ts::ascii::tolower_copy compiled with: "; +#if defined(__AVX512BW__) + std::cout << "AVX-512BW (64B body + masked tail, gated at n>=64) + AVX2 + SSE2 cascade"; +#elif defined(__AVX2__) + std::cout << "AVX2 (32B body) + SSE2 (16B drain)"; +#elif defined(__SSE2__) + std::cout << "SSE2 (16B body)"; +#elif defined(__ARM_NEON) || defined(__aarch64__) + std::cout << "NEON (16B body)"; +#else + std::cout << "scalar only"; +#endif + std::cout << '\n'; + SUCCEED(); +} + +TEST_CASE("tolower throughput", "[bench][tolower]") +{ + for (std::size_t sz : kSizes) { + auto input = make_mixed_case_ascii(sz); + std::vector output_scalar(sz); + std::vector output_simd(sz); + + // Catch::Benchmark::keep_memory clobbers the buffer in the compiler's + // model, forcing it to materialize every byte we wrote. Without this an + // optimizing compiler can shrink or DCE the inline body's stores past + // the first element we observed. + + std::string scalar_name = "scalar " + std::to_string(sz) + "B"; + BENCHMARK(scalar_name.c_str()) + { + tolower_scalar(output_scalar.data(), input.data(), sz); + Catch::Benchmark::keep_memory(output_scalar.data()); + return output_scalar[0]; + }; + + std::string simd_name = "ts::atc " + std::to_string(sz) + "B"; + BENCHMARK(simd_name.c_str()) + { + ts::ascii::tolower_copy(output_simd.data(), input.data(), sz); + Catch::Benchmark::keep_memory(output_simd.data()); + return output_simd[0]; + }; + } +}