src/profiler.cpp
| Line | Branch | Exec | Source |
|---|---|---|---|
| 1 | /** | ||
| 2 | * @file profiler.cpp | ||
| 3 | * @brief Implementation of the lock-free ring buffer profiler with Chrome Tracing export. | ||
| 4 | */ | ||
| 5 | |||
| 6 | #include "DetourModKit/profiler.hpp" | ||
| 7 | |||
| 8 | #include <windows.h> | ||
| 9 | #include <algorithm> | ||
| 10 | #include <cstdio> | ||
| 11 | #include <format> | ||
| 12 | #include <memory> | ||
| 13 | #include <string> | ||
| 14 | #include <string_view> | ||
| 15 | |||
| 16 | namespace DetourModKit | ||
| 17 | { | ||
| 18 | namespace | ||
| 19 | { | ||
| 20 | /** | ||
| 21 | * @brief Escapes a string for safe embedding in a JSON value. | ||
| 22 | * @details Handles the characters that are special in JSON strings: | ||
| 23 | * backslash, double quote, and control characters (U+0000..U+001F). | ||
| 24 | * Forward slash is NOT escaped (legal unescaped in JSON per RFC 8259). | ||
| 25 | * @param input The raw string to escape. | ||
| 26 | * @return A JSON-safe escaped string (without surrounding quotes). | ||
| 27 | */ | ||
| 28 | 65547 | std::string escape_json_string(std::string_view input) | |
| 29 | { | ||
| 30 | 65547 | std::string out; | |
| 31 |
1/2✓ Branch 4 → 5 taken 65547 times.
✗ Branch 4 → 33 not taken.
|
65547 | out.reserve(input.size()); |
| 32 |
2/2✓ Branch 29 → 7 taken 1114224 times.
✓ Branch 29 → 30 taken 65547 times.
|
1179771 | for (const char c : input) |
| 33 | { | ||
| 34 |
8/8✓ Branch 7 → 8 taken 1 time.
✓ Branch 7 → 10 taken 1 time.
✓ Branch 7 → 12 taken 1 time.
✓ Branch 7 → 14 taken 1 time.
✓ Branch 7 → 16 taken 1 time.
✓ Branch 7 → 18 taken 1 time.
✓ Branch 7 → 20 taken 1 time.
✓ Branch 7 → 22 taken 1114217 times.
|
1114224 | switch (c) |
| 35 | { | ||
| 36 | 1 | case '"': | |
| 37 |
1/2✓ Branch 8 → 9 taken 1 time.
✗ Branch 8 → 33 not taken.
|
1 | out += "\\\""; |
| 38 | 1 | break; | |
| 39 | 1 | case '\\': | |
| 40 |
1/2✓ Branch 10 → 11 taken 1 time.
✗ Branch 10 → 33 not taken.
|
1 | out += "\\\\"; |
| 41 | 1 | break; | |
| 42 | 1 | case '\b': | |
| 43 |
1/2✓ Branch 12 → 13 taken 1 time.
✗ Branch 12 → 33 not taken.
|
1 | out += "\\b"; |
| 44 | 1 | break; | |
| 45 | 1 | case '\f': | |
| 46 |
1/2✓ Branch 14 → 15 taken 1 time.
✗ Branch 14 → 33 not taken.
|
1 | out += "\\f"; |
| 47 | 1 | break; | |
| 48 | 1 | case '\n': | |
| 49 |
1/2✓ Branch 16 → 17 taken 1 time.
✗ Branch 16 → 33 not taken.
|
1 | out += "\\n"; |
| 50 | 1 | break; | |
| 51 | 1 | case '\r': | |
| 52 |
1/2✓ Branch 18 → 19 taken 1 time.
✗ Branch 18 → 33 not taken.
|
1 | out += "\\r"; |
| 53 | 1 | break; | |
| 54 | 1 | case '\t': | |
| 55 |
1/2✓ Branch 20 → 21 taken 1 time.
✗ Branch 20 → 33 not taken.
|
1 | out += "\\t"; |
| 56 | 1 | break; | |
| 57 | 1114217 | default: | |
| 58 |
2/2✓ Branch 22 → 23 taken 1 time.
✓ Branch 22 → 26 taken 1114216 times.
|
1114217 | if (static_cast<unsigned char>(c) < 0x20) |
| 59 | { | ||
| 60 | // Control characters U+0000..U+001F require \uXXXX encoding | ||
| 61 | char buf[8]; | ||
| 62 | 1 | std::snprintf(buf, sizeof(buf), "\\u%04x", | |
| 63 |
1/2✓ Branch 23 → 24 taken 1 time.
✗ Branch 23 → 32 not taken.
|
1 | static_cast<unsigned int>(static_cast<unsigned char>(c))); |
| 64 |
1/2✓ Branch 24 → 25 taken 1 time.
✗ Branch 24 → 32 not taken.
|
1 | out += buf; |
| 65 | } | ||
| 66 | else | ||
| 67 | { | ||
| 68 |
1/2✓ Branch 26 → 27 taken 1114216 times.
✗ Branch 26 → 33 not taken.
|
1114216 | out += c; |
| 69 | } | ||
| 70 | 1114217 | break; | |
| 71 | } | ||
| 72 | } | ||
| 73 | 65547 | return out; | |
| 74 | ✗ | } | |
| 75 | } // namespace | ||
| 76 | |||
| 77 | // --- Profiler --- | ||
| 78 | |||
| 79 | 1 | Profiler::Profiler() | |
| 80 | 1 | : buffer_(std::make_unique<ProfileSample[]>(DEFAULT_CAPACITY)), | |
| 81 | 1 | capacity_(DEFAULT_CAPACITY), | |
| 82 | 1 | mask_(DEFAULT_CAPACITY - 1) | |
| 83 | { | ||
| 84 | LARGE_INTEGER freq; | ||
| 85 |
1/2✓ Branch 4 → 5 taken 1 time.
✗ Branch 4 → 6 not taken.
|
1 | QueryPerformanceFrequency(&freq); |
| 86 | 1 | qpc_frequency_ = freq.QuadPart; | |
| 87 | 1 | } | |
| 88 | |||
| 89 | 1999 | Profiler &Profiler::get_instance() noexcept | |
| 90 | { | ||
| 91 |
3/4✓ Branch 2 → 3 taken 1 time.
✓ Branch 2 → 8 taken 1998 times.
✓ Branch 4 → 5 taken 1 time.
✗ Branch 4 → 8 not taken.
|
1999 | static Profiler instance; |
| 92 | 2040 | return instance; | |
| 93 | } | ||
| 94 | |||
| 95 | 578729 | void Profiler::record(const char *name, int64_t start_ticks, int64_t end_ticks, | |
| 96 | uint32_t thread_id) noexcept | ||
| 97 | { | ||
| 98 | 578729 | const int64_t delta_ticks = end_ticks - start_ticks; | |
| 99 | |||
| 100 | // Convert ticks to microseconds: (delta * 1'000'000) / frequency. | ||
| 101 | // Use 64-bit intermediate to avoid overflow for deltas up to ~922,000 seconds | ||
| 102 | // (~10.7 days) at a 10 MHz QPC frequency (common on modern hardware). | ||
| 103 | const auto duration_us = static_cast<uint32_t>( | ||
| 104 | 1167424 | std::min<int64_t>((delta_ticks * 1'000'000) / qpc_frequency_, | |
| 105 | 578729 | static_cast<int64_t>(UINT32_MAX))); | |
| 106 | |||
| 107 | 588695 | const size_t idx = write_pos_.fetch_add(1, std::memory_order_relaxed) & mask_; | |
| 108 | |||
| 109 | 588695 | auto &sample = buffer_[idx]; | |
| 110 | |||
| 111 | // Open the write window with a monotonic increment. The result is | ||
| 112 | // odd whenever no other writer has this slot open, because every closed sequence is even (sequence | ||
| 113 | // starts at 0 in the constructor and reset(), and each record() | ||
| 114 | // contributes exactly +2). Using fetch_add avoids the load-then- | ||
| 115 | // store RMW pattern: a producer preempted between a relaxed load | ||
| 116 | // and its first store could otherwise roll the slot's sequence | ||
| 117 | // backwards if another producer completed a full write on the | ||
| 118 | // same slot in the interim. fetch_add forbids that rollback. | ||
| 119 | // | ||
| 120 | // Design note: if a writer is stalled between its fetch_add and | ||
| 121 | // its final sequence store, and 65536 intervening record() calls | ||
| 122 | // advance write_pos_ past a full buffer wrap, a new writer will | ||
| 123 | // land on the same slot and clobber the stalled writer's data. | ||
| 124 | // This requires the stalled writer to be preempted for the | ||
| 125 | // duration of an entire ring buffer cycle, which is unreachable | ||
| 126 | // at game-modding thread counts and frame rates. We accept this | ||
| 127 | // theoretical imprecision to keep the hot path to three | ||
| 128 | // fetch_add operations plus plain field stores, with no CAS retry loop. | ||
| 129 | // | ||
| 130 | // Monotonicity is unconditionally guaranteed by fetch_add: per | ||
| 131 | // [atomics.types.operations] the counter cannot roll backwards | ||
| 132 | // regardless of how many producers race on the same slot. Do NOT | ||
| 133 | // replace this with a load-then-store RMW: that would re-introduce | ||
| 134 | // the stale-publish race on wrap collision that this protocol | ||
| 135 | // exists to prevent. | ||
| 136 | static_assert(std::atomic<uint32_t>::is_always_lock_free, | ||
| 137 | "sequence counter must be lock-free for the seqlock protocol"); | ||
| 138 | 526719 | (void)sample.sequence.fetch_add(1, std::memory_order_acq_rel); | |
| 139 | |||
| 140 | 526719 | sample.name = name; | |
| 141 | 526719 | sample.start_ticks = start_ticks; | |
| 142 | 526719 | sample.duration_us = duration_us; | |
| 143 | 526719 | sample.thread_id = thread_id; | |
| 144 | |||
| 145 | // Close the write window. Another +1 keeps the slot's sequence | ||
| 146 | // monotonic and lands it on an even value, signalling a fully | ||
| 147 | // committed sample. Readers that observe an odd value skip this | ||
| 148 | // slot to avoid reading torn fields. | ||
| 149 | 526719 | (void)sample.sequence.fetch_add(1, std::memory_order_release); | |
| 150 | 526719 | } | |
| 151 | |||
| 152 | // Caller must ensure no concurrent record() calls are in flight. | ||
| 153 | // There is no runtime guard because adding an atomic "recording active" | ||
| 154 | // counter would penalize every record() call on the hot path for a | ||
| 155 | // contract that is only relevant during session boundaries. | ||
| 156 | 49 | void Profiler::reset() noexcept | |
| 157 | { | ||
| 158 | 49 | write_pos_.store(0, std::memory_order_relaxed); | |
| 159 |
2/2✓ Branch 21 → 11 taken 3211264 times.
✓ Branch 21 → 22 taken 49 times.
|
3211313 | for (size_t i = 0; i < capacity_; ++i) |
| 160 | { | ||
| 161 | 3211264 | auto &s = buffer_[i]; | |
| 162 | 3211264 | s.sequence.store(0, std::memory_order_relaxed); | |
| 163 | 3211264 | s.name = nullptr; | |
| 164 | 3211264 | s.start_ticks = 0; | |
| 165 | 3211264 | s.duration_us = 0; | |
| 166 | 3211264 | s.thread_id = 0; | |
| 167 | } | ||
| 168 | 49 | } | |
| 169 | |||
| 170 | 13 | std::string Profiler::export_chrome_json() const | |
| 171 | { | ||
| 172 | 13 | const size_t total = write_pos_.load(std::memory_order_relaxed); | |
| 173 | 13 | const size_t count = std::min(total, capacity_); | |
| 174 | |||
| 175 |
2/2✓ Branch 10 → 11 taken 2 times.
✓ Branch 10 → 16 taken 11 times.
|
13 | if (count == 0) |
| 176 | { | ||
| 177 |
1/2✓ Branch 13 → 14 taken 2 times.
✗ Branch 13 → 50 not taken.
|
4 | return "[]"; |
| 178 | } | ||
| 179 | |||
| 180 | // Determine start index: if the buffer has wrapped, start from the | ||
| 181 | // oldest surviving sample; otherwise start from 0. | ||
| 182 |
2/2✓ Branch 16 → 17 taken 1 time.
✓ Branch 16 → 18 taken 10 times.
|
11 | const size_t start_idx = (total > capacity_) ? (total & mask_) : 0; |
| 183 | |||
| 184 | // Pre-allocate: ~120 bytes per JSON event is a reasonable estimate. | ||
| 185 | 11 | std::string json; | |
| 186 |
1/2✓ Branch 20 → 21 taken 11 times.
✗ Branch 20 → 61 not taken.
|
11 | json.reserve(count * 120 + 4); |
| 187 |
1/2✓ Branch 21 → 22 taken 11 times.
✗ Branch 21 → 61 not taken.
|
11 | json += "[\n"; |
| 188 | |||
| 189 | // QPC frequency for converting start_ticks to microseconds | ||
| 190 | 11 | const double ticks_to_us = 1'000'000.0 / static_cast<double>(qpc_frequency_); | |
| 191 | |||
| 192 | 11 | bool first = true; | |
| 193 |
2/2✓ Branch 44 → 23 taken 65547 times.
✓ Branch 44 → 45 taken 11 times.
|
65558 | for (size_t i = 0; i < count; ++i) |
| 194 | { | ||
| 195 | 65547 | const auto &s = buffer_[(start_idx + i) & mask_]; | |
| 196 | |||
| 197 | // Single pre-read sequence check: skip if odd (in-flight write). | ||
| 198 | // A full seqlock would re-check after reading fields to detect | ||
| 199 | // writes that started mid-read, but we intentionally omit the | ||
| 200 | // post-read re-check to avoid a second atomic load per sample | ||
| 201 | // on the export path. The resulting race window is narrow | ||
| 202 | // (a write must start between the sequence load and the end of the field | ||
| 203 | // reads) and accepted -- a sample mixing fields from two records may appear | ||
| 204 | // in the export at worst. Same trade-off as InputPoller's | ||
| 205 | // relaxed active_states_ reads (stale by one cycle is acceptable). | ||
| 206 | 65547 | const uint32_t seq = s.sequence.load(std::memory_order_acquire); | |
| 207 |
2/4✓ Branch 31 → 32 taken 65547 times.
✗ Branch 31 → 33 not taken.
✗ Branch 32 → 33 not taken.
✓ Branch 32 → 34 taken 65547 times.
|
65547 | if ((seq & 1) != 0 || s.name == nullptr) |
| 208 | { | ||
| 209 | ✗ | continue; | |
| 210 | } | ||
| 211 | |||
| 212 |
2/2✓ Branch 34 → 35 taken 65536 times.
✓ Branch 34 → 36 taken 11 times.
|
65547 | if (!first) |
| 213 | { | ||
| 214 |
1/2✓ Branch 35 → 36 taken 65536 times.
✗ Branch 35 → 60 not taken.
|
65536 | json += ",\n"; |
| 215 | } | ||
| 216 | 65547 | first = false; | |
| 217 | |||
| 218 | // Chrome Trace Event Format: "X" = complete event (has duration). | ||
| 219 | // Escape the name to produce valid JSON even if the caller | ||
| 220 | // passes a string containing quotes or backslashes. | ||
| 221 | 65547 | const double ts = static_cast<double>(s.start_ticks) * ticks_to_us; | |
| 222 |
1/2✓ Branch 37 → 38 taken 65547 times.
✗ Branch 37 → 53 not taken.
|
65547 | const std::string escaped_name = escape_json_string(s.name); |
| 223 | 65547 | json += std::format( | |
| 224 | R"({{"name":"{}","ph":"X","ts":{:.1f},"dur":{},"pid":1,"tid":{}}})", | ||
| 225 |
2/4✓ Branch 38 → 39 taken 65547 times.
✗ Branch 38 → 56 not taken.
✓ Branch 39 → 40 taken 65547 times.
✗ Branch 39 → 54 not taken.
|
65547 | escaped_name, ts, s.duration_us, s.thread_id); |
| 226 | 65547 | } | |
| 227 | |||
| 228 |
1/2✓ Branch 45 → 46 taken 11 times.
✗ Branch 45 → 61 not taken.
|
11 | json += "\n]"; |
| 229 | 11 | return json; | |
| 230 | 11 | } | |
| 231 | |||
| 232 | 3 | bool Profiler::export_to_file(std::string_view path) const | |
| 233 | { | ||
| 234 |
1/2✓ Branch 2 → 3 taken 3 times.
✗ Branch 2 → 43 not taken.
|
3 | const std::string json = export_chrome_json(); |
| 235 |
1/2✓ Branch 5 → 6 taken 3 times.
✗ Branch 5 → 34 not taken.
|
3 | const std::string path_str(path); |
| 236 | |||
| 237 | ✗ | const auto closer = [](std::FILE *f) | |
| 238 | ✗ | { std::fclose(f); }; | |
| 239 | 3 | std::FILE *file_ptr = nullptr; | |
| 240 | |||
| 241 |
1/2✓ Branch 8 → 9 taken 3 times.
✗ Branch 8 → 39 not taken.
|
3 | const errno_t err = fopen_s(&file_ptr, path_str.c_str(), "wb"); |
| 242 |
3/4✓ Branch 9 → 10 taken 2 times.
✓ Branch 9 → 11 taken 1 time.
✗ Branch 10 → 11 not taken.
✓ Branch 10 → 12 taken 2 times.
|
3 | if (err != 0 || file_ptr == nullptr) |
| 243 | { | ||
| 244 | 1 | return false; | |
| 245 | } | ||
| 246 | |||
| 247 | 2 | std::unique_ptr<std::FILE, decltype(closer)> fp(file_ptr, closer); | |
| 248 |
1/2✓ Branch 16 → 17 taken 2 times.
✗ Branch 16 → 37 not taken.
|
2 | const size_t written = std::fwrite(json.data(), 1, json.size(), fp.get()); |
| 249 |
1/2✗ Branch 18 → 19 not taken.
✓ Branch 18 → 20 taken 2 times.
|
2 | if (written != json.size()) |
| 250 | { | ||
| 251 | ✗ | return false; | |
| 252 | } | ||
| 253 |
2/4✓ Branch 21 → 22 taken 2 times.
✗ Branch 21 → 37 not taken.
✗ Branch 22 → 23 not taken.
✓ Branch 22 → 24 taken 2 times.
|
2 | if (std::fflush(fp.get()) != 0) |
| 254 | { | ||
| 255 | ✗ | return false; | |
| 256 | } | ||
| 257 | // Release the pointer so unique_ptr does not double-close. | ||
| 258 |
2/4✓ Branch 25 → 26 taken 2 times.
✗ Branch 25 → 37 not taken.
✗ Branch 26 → 27 not taken.
✓ Branch 26 → 28 taken 2 times.
|
2 | if (std::fclose(fp.release()) != 0) |
| 259 | { | ||
| 260 | ✗ | return false; | |
| 261 | } | ||
| 262 | 2 | return true; | |
| 263 | 3 | } | |
| 264 | |||
| 265 | 13 | size_t Profiler::total_samples_recorded() const noexcept | |
| 266 | { | ||
| 267 | 26 | return write_pos_.load(std::memory_order_relaxed); | |
| 268 | } | ||
| 269 | |||
| 270 | 6 | size_t Profiler::available_samples() const noexcept | |
| 271 | { | ||
| 272 | 12 | return std::min(write_pos_.load(std::memory_order_relaxed), capacity_); | |
| 273 | } | ||
| 274 | |||
| 275 | 4 | size_t Profiler::capacity() const noexcept | |
| 276 | { | ||
| 277 | 4 | return capacity_; | |
| 278 | } | ||
| 279 | |||
| 280 | 1 | int64_t Profiler::qpc_frequency() const noexcept | |
| 281 | { | ||
| 282 | 1 | return qpc_frequency_; | |
| 283 | } | ||
| 284 | |||
| 285 | // --- ScopedProfile --- | ||
| 286 | |||
| 287 | 1954 | ScopedProfile::ScopedProfile(const char *name, literal_tag) noexcept | |
| 288 | 1954 | : name_(name), thread_id_(GetCurrentThreadId()) | |
| 289 | { | ||
| 290 | LARGE_INTEGER ticks; | ||
| 291 | 1956 | QueryPerformanceCounter(&ticks); | |
| 292 | 1953 | start_ticks_ = ticks.QuadPart; | |
| 293 | 1953 | } | |
| 294 | |||
| 295 | 1964 | ScopedProfile::~ScopedProfile() noexcept | |
| 296 | { | ||
| 297 | LARGE_INTEGER ticks; | ||
| 298 | 1964 | QueryPerformanceCounter(&ticks); | |
| 299 | 1958 | Profiler::get_instance().record(name_, start_ticks_, ticks.QuadPart, thread_id_); | |
| 300 | 1972 | } | |
| 301 | |||
| 302 | } // namespace DetourModKit | ||
| 303 |