Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 7 additions & 0 deletions NOTICE
Original file line number Diff line number Diff line change
Expand Up @@ -118,3 +118,10 @@ LS-HPACK provides functionality to encode and decode HTTP headers using
HPACK compression mechanism specified in RFC 7541.
Copyright (c) 2018 - 2023 LiteSpeed Technologies Inc, (MIT License)
https://github.com/litespeedtech/ls-hpack.git

~~

include/tscore/ink_ascii_tolower.h AVX-512BW kernel design (fused
mask_add and masked-tail load/store) is adapted from Tony Finch's
copytolower64.c (0BSD OR MIT-0).
https://dotat.at/cgi/git/vectolower.git/
185 changes: 185 additions & 0 deletions include/tscore/ink_ascii_tolower.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,185 @@
/** @file

SIMD-accelerated bulk ASCII tolower copy.

Used on the URL canonicalization fast path for cache-key digests
(src/proxy/hdrs/URL.cc::url_CryptoHash_get_fast) and any other place
that needs to fold ASCII to lowercase over a small-to-moderate
buffer. The scalar byte-at-a-time loop is the bottleneck for hosts
and schemes long enough to vectorize; for shorter inputs the scalar
tail handles them with no SIMD overhead.

Semantics match a byte-at-a-time loop using ParseRules::ink_tolower():

- Bytes in 'A'..'Z' (0x41..0x5A) have bit 5 set, mapping them to
'a'..'z'. All other bytes (including 0x80..0xFF) pass through
unchanged. There is no UTF-8 case folding.

- In-place use (dst == src) is supported on every path. Each SIMD
body loads a full block into a register before storing back at
the same offset, and the AVX-512BW masked tail does masked-load
/ masked-store at the same offset. Partial overlap where
dst != src but the ranges intersect is not supported.

Implementation note: selection is purely compile-time; no runtime
dispatch. Bodies are stacked widest-first.

- AVX-512BW builds: when n >= 64, a 64-byte main loop handles the
bulk and a single masked load/store finishes any 1..63-byte tail,
then we return. When n < 64, we fall through to the AVX2 + SSE2
cascade below so tiny inputs avoid the masked tail's fixed setup
cost.

- AVX2 builds: a 32-byte main loop drains to a 16-byte SSE2 step
and then to a scalar tail of 0..15 bytes.

- SSE2 / NEON builds: a single 16-byte main loop drains to a
scalar tail.

- Other targets: scalar only.

@section license License

Licensed to the Apache Software Foundation (ASF) under one
or more contributor license agreements. See the NOTICE file
distributed with this work for additional information
regarding copyright ownership. The ASF licenses this file
to you under the Apache License, Version 2.0 (the
"License"); you may not use this file except in compliance
with the License. You may obtain a copy of the License at

http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
#pragma once

#include <cstddef>
#include <cstdint>

#if defined(__AVX512BW__) || defined(__AVX2__) || defined(__SSE2__)
#include <immintrin.h>
#elif defined(__ARM_NEON) || defined(__aarch64__)
#include <arm_neon.h>
#endif

namespace ts::ascii
{

inline void
tolower_copy(char *dst, const char *src, std::size_t n) noexcept
{
#if defined(__AVX512BW__)
// AVX-512BW: 64 bytes per iteration with two key optimizations over the
// narrower paths:
// - _mm512_mask_add_epi8 fuses the "+0x20 where upper" into a single
// op (no separate maskz_set1 + or).
// - A masked load/store handles the 1..63-byte tail in a single SIMD
// pass, so we don't need to cascade to AVX2/SSE2 to drain the
// remainder.
//
// The masked tail does carry ~7 ns of fixed setup cost, which loses to
// the cascade on short inputs. Gating the whole block on n >= 64 means
// tiny inputs fall through to the AVX2/SSE2 path below, where they keep
// the speedup that path already provides.
//
// Adapted from Tony Finch's copytolower64.c (see NOTICE).
if (n >= 64) {
const __m512i A_vec = _mm512_set1_epi8('A');
const __m512i Z_vec = _mm512_set1_epi8('Z');
const __m512i delta = _mm512_set1_epi8('a' - 'A');
do {
__m512i bytes = _mm512_loadu_epi8(src);
__mmask64 is_upper = _mm512_cmpge_epi8_mask(bytes, A_vec) & _mm512_cmple_epi8_mask(bytes, Z_vec);
_mm512_storeu_epi8(dst, _mm512_mask_add_epi8(bytes, is_upper, bytes, delta));
src += 64;
dst += 64;
n -= 64;
} while (n >= 64);
if (n != 0) {
auto len_mask = static_cast<__mmask64>((~0ULL) >> (64 - n));
__m512i bytes = _mm512_maskz_loadu_epi8(len_mask, src);
__mmask64 is_upper = _mm512_cmpge_epi8_mask(bytes, A_vec) & _mm512_cmple_epi8_mask(bytes, Z_vec);
_mm512_mask_storeu_epi8(dst, len_mask, _mm512_mask_add_epi8(bytes, is_upper, bytes, delta));
}
return;
}
#endif

#if defined(__AVX2__)
// 32 bytes per iteration. Same compare-and-OR pattern as SSE2.
{
const __m256i a_minus_one = _mm256_set1_epi8('A' - 1);
const __m256i z_plus_one = _mm256_set1_epi8('Z' + 1);
const __m256i bit5 = _mm256_set1_epi8(0x20);
while (n >= 32) {
__m256i bytes = _mm256_loadu_si256(reinterpret_cast<const __m256i *>(src));
__m256i ge_A = _mm256_cmpgt_epi8(bytes, a_minus_one);
__m256i le_Z = _mm256_cmpgt_epi8(z_plus_one, bytes);
__m256i mask = _mm256_and_si256(_mm256_and_si256(ge_A, le_Z), bit5);
_mm256_storeu_si256(reinterpret_cast<__m256i *>(dst), _mm256_or_si256(bytes, mask));
src += 32;
dst += 32;
n -= 32;
}
}
#endif

#if defined(__SSE2__)
// 16 bytes per iteration. Signed compare works for ASCII A-Z because all
// letters live below 0x80; high bytes (0x80..0xFF) compare as negative
// and correctly miss the [A,Z] range so they pass through unchanged.
{
const __m128i a_minus_one = _mm_set1_epi8('A' - 1);
const __m128i z_plus_one = _mm_set1_epi8('Z' + 1);
const __m128i bit5 = _mm_set1_epi8(0x20);
while (n >= 16) {
__m128i bytes = _mm_loadu_si128(reinterpret_cast<const __m128i *>(src));
__m128i ge_A = _mm_cmpgt_epi8(bytes, a_minus_one);
__m128i le_Z = _mm_cmpgt_epi8(z_plus_one, bytes);
__m128i mask = _mm_and_si128(_mm_and_si128(ge_A, le_Z), bit5);
_mm_storeu_si128(reinterpret_cast<__m128i *>(dst), _mm_or_si128(bytes, mask));
src += 16;
dst += 16;
n -= 16;
}
}
#elif defined(__ARM_NEON) || defined(__aarch64__)
// 16 bytes per iteration; unsigned compare available natively.
{
const uint8x16_t a_minus_one = vdupq_n_u8('A' - 1);
const uint8x16_t z_plus_one = vdupq_n_u8('Z' + 1);
const uint8x16_t bit5 = vdupq_n_u8(0x20);
while (n >= 16) {
uint8x16_t bytes = vld1q_u8(reinterpret_cast<const uint8_t *>(src));
uint8x16_t ge_A = vcgtq_u8(bytes, a_minus_one);
uint8x16_t le_Z = vcltq_u8(bytes, z_plus_one);
uint8x16_t mask = vandq_u8(vandq_u8(ge_A, le_Z), bit5);
vst1q_u8(reinterpret_cast<uint8_t *>(dst), vorrq_u8(bytes, mask));
src += 16;
dst += 16;
n -= 16;
}
}
#endif

while (n--) {
auto c = static_cast<unsigned char>(*src++);
*dst++ = (c >= 'A' && c <= 'Z') ? static_cast<char>(c | 0x20) : static_cast<char>(c);
}
}

// Thin sugar over tolower_copy for the in-place case. Makes call sites
// like ts::ascii::tolower_inplace(buf, n) read naturally instead of
// ts::ascii::tolower_copy(buf, buf, n).
inline void
tolower_inplace(char *buf, std::size_t n) noexcept
{
tolower_copy(buf, buf, n);
}

} // namespace ts::ascii
15 changes: 3 additions & 12 deletions src/proxy/hdrs/URL.cc
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,7 @@
#include <new>
#include "tscore/ink_platform.h"
#include "tscore/ink_memory.h"
#include "tscore/ink_ascii_tolower.h"
#include "proxy/hdrs/URL.h"
#include "proxy/hdrs/MIME.h"
#include "proxy/hdrs/HTTP.h"
Expand Down Expand Up @@ -1684,16 +1685,6 @@ url_describe(HdrHeapObjImpl *raw, bool /* recurse ATS_UNUSED */)
* *
***********************************************************************/

static inline void
memcpy_tolower(char *d, const char *s, int n)
{
while (n--) {
*d = ParseRules::ink_tolower(*s);
s++;
d++;
}
}

// fast path for CryptoHash, HTTP, no user/password/params/query,
// no buffer overflow, no unescaping needed

Expand All @@ -1704,7 +1695,7 @@ url_CryptoHash_get_fast(const URLImpl *url, CryptoContext &ctx, CryptoHash *hash
char *p;

p = buffer;
memcpy_tolower(p, url->m_ptr_scheme, url->m_len_scheme);
ts::ascii::tolower_copy(p, url->m_ptr_scheme, url->m_len_scheme);
p += url->m_len_scheme;
*p++ = ':';
*p++ = '/';
Expand All @@ -1713,7 +1704,7 @@ url_CryptoHash_get_fast(const URLImpl *url, CryptoContext &ctx, CryptoHash *hash
*p++ = ':';
// no password
*p++ = '@';
memcpy_tolower(p, url->m_ptr_host, url->m_len_host);
ts::ascii::tolower_copy(p, url->m_ptr_host, url->m_len_host);
p += url->m_len_host;
*p++ = '/';
memcpy(p, url->m_ptr_path, url->m_len_path);
Expand Down
33 changes: 33 additions & 0 deletions src/proxy/hdrs/unit_tests/test_URL.cc
Original file line number Diff line number Diff line change
Expand Up @@ -659,6 +659,39 @@ std::vector<get_hash_test_case> get_hash_test_cases = {
!IGNORE_QUERY,
HAS_EQUAL_HASH,
},
{
// Verifies the scheme/host SIMD-tolower path in url_CryptoHash_get_fast:
// an uppercase host with a long enough prefix to hit the 16-byte SIMD
// body should hash identically to its lowercased form.
"Uppercase host: equal hashes",
"http://ONE.EXAMPLE.COM/a/path?name=value",
"http://one.example.com/a/path?name=value",
!IGNORE_QUERY,
HAS_EQUAL_HASH,
},
{
"Mixed-case host: equal hashes",
"http://One.Example.Com/a/path?name=value",
"http://one.example.com/a/path?name=value",
!IGNORE_QUERY,
HAS_EQUAL_HASH,
},
{
"Uppercase scheme: equal hashes",
"HTTP://one.example.com/a/path?name=value",
"http://one.example.com/a/path?name=value",
!IGNORE_QUERY,
HAS_EQUAL_HASH,
},
{
// Long uppercase host crosses 16- and 32-byte SIMD body boundaries so
// the wider paths (when compiled in) are exercised by this fixture.
"Long uppercase host: equal hashes",
"http://A-VERY-LONG-HOST-NAME-FOR-SIMD.EXAMPLE.COM/a/path",
"http://a-very-long-host-name-for-simd.example.com/a/path",
!IGNORE_QUERY,
HAS_EQUAL_HASH,
},
};

/** Return the hash related to a URI.
Expand Down
7 changes: 3 additions & 4 deletions src/proxy/http/remap/UrlRewrite.cc
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,8 @@

*/

#include "tscore/ink_ascii_tolower.h"

#include "proxy/http/remap/UrlRewrite.h"
#include "proxy/http/remap/RemapYamlConfig.h"
#include "iocore/eventsystem/ConfigProcessor.h"
Expand Down Expand Up @@ -931,10 +933,7 @@ UrlRewrite::_mappingLookup(MappingsStore &mappings, URL *request_url, int reques
return false;
}

// lowercase
for (int i = 0; i < request_host_len; ++i) {
request_host_lower[i] = tolower(request_host[i]);
}
ts::ascii::tolower_copy(request_host_lower, request_host, request_host_len);
request_host_lower[request_host_len] = 0;

bool retval = false;
Expand Down
55 changes: 55 additions & 0 deletions src/proxy/http/remap/unit-tests/test_RemapRules.cc
Original file line number Diff line number Diff line change
Expand Up @@ -225,3 +225,58 @@ map_with_recv_port http://front.example.com \
}
}
}

SCENARIO("UrlRewrite host lookup is case-insensitive", "[proxy][remap]")
{
// _mappingLookup lower-cases the request host before consulting the hash
// table; these scenarios exercise that path with inputs that would not
// match in a strict byte-compare. Sized to cross the 16-byte SSE2 body
// for hosts that get a real SIMD pass.
GIVEN("A forward map with a lowercase source host")
{
auto urlrw = std::make_unique<UrlRewrite>();
std::string config = R"RMCFG(
map http://www.example.com http://origin.example.com
)RMCFG";

auto cpath = write_test_remap(config, "case_insensitive");
int rc = urlrw->BuildTable(cpath.c_str());
REQUIRE(rc == TS_SUCCESS);
REQUIRE(urlrw->rule_count() == 1);

EasyURL url("http://www.example.com");
UrlMappingContainer urlmap;

THEN("uppercase request host matches the lowercase rule")
{
const char *host = "WWW.EXAMPLE.COM";
REQUIRE(urlrw->forwardMappingLookup(&url.url, 80, host, strlen(host), urlmap));
}
THEN("mixed-case request host matches the lowercase rule")
{
const char *host = "Www.Example.Com";
REQUIRE(urlrw->forwardMappingLookup(&url.url, 80, host, strlen(host), urlmap));
}
}

GIVEN("A forward map with a long host that exercises the 16-byte SIMD body")
{
auto urlrw = std::make_unique<UrlRewrite>();
std::string config = R"RMCFG(
map http://a-very-long-host-name-for-simd.example.com http://origin.example.com
)RMCFG";

auto cpath = write_test_remap(config, "case_insensitive_long");
int rc = urlrw->BuildTable(cpath.c_str());
REQUIRE(rc == TS_SUCCESS);

EasyURL url("http://a-very-long-host-name-for-simd.example.com");
UrlMappingContainer urlmap;

THEN("an all-uppercase 49-char host (covers >=32 SIMD bytes) matches")
{
const char *host = "A-VERY-LONG-HOST-NAME-FOR-SIMD.EXAMPLE.COM";
REQUIRE(urlrw->forwardMappingLookup(&url.url, 80, host, strlen(host), urlmap));
}
}
}
5 changes: 2 additions & 3 deletions src/proxy/http2/HPACK.cc
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@

#include "proxy/http2/HPACK.h"

#include "tscore/ink_ascii_tolower.h"
#include "tsutil/LocalBuffer.h"
#include "swoc/TextView.h"

Expand Down Expand Up @@ -789,9 +790,7 @@ hpack_encode_header_block(HpackIndexingTable &indexing_table, uint8_t *out_buf,
int name_len = original_name.size();
ts::LocalBuffer<char> local_buffer(name_len);
char *lower_name = local_buffer.data();
for (int i = 0; i < name_len; i++) {
lower_name[i] = ParseRules::ink_tolower(original_name[i]);
}
ts::ascii::tolower_copy(lower_name, original_name.data(), name_len);

std::string_view name{lower_name, static_cast<size_t>(name_len)};
std::string_view value = field.value_get();
Expand Down
Loading