This post continues the series on performance monitoring with Intel MSRs on Linux, using the batch-oriented kernel module to read and write MSR values. The previous posts can be found here:
- A Linux Module For Reading/Writing MSRs
- Intel MSR Performance Monitoring Basics
- Fun with MSRs: Counting Performance Events On Intel
- Scripting MSR Performance Tests With kdb+
- Scripting MSR Performance Tests With kdb+: Part 2: this post
- Intel Performance Monitoring: Loose Ends
This time I’m going to build the shared library used by kdb+ to launch and control the test run. It’s fairly simple, since the fiddly work of calculating the values to be written to the IA32_PERFEVTSELx, IA32_FIXED_CTR_CTRL and IA32_PERF_GLOBAL_CTRL MSRs has already been done. What it will do is own the process of stopping, clearing and starting the counters, as well as running a baseline to measure the fixed costs of the interaction with the MSR kernel module.
The following q code instructs kdb+ to load the function runtest from the shared library libpmc.so. The rules for locating the shared library are fairly simple and documented on the code.kx.com site, but it’s probably enough to know that it eventually consults the environment variable LD_LIBRARY_PATH. The command instructs kdb+ to treat the runtest function as taking five arguments. Unlike a compiler, which would check the arity of the function for you, kdb+ won’t warn you if you get the number of arguments wrong!
.pmc.runtestdl:`libpmc 2:(`runtest;5);
The runtest function from libpmc.c is the entry point to the shared library code from kdb+. The parameters and the return value are all of type K, which is the wrapper kdb+ uses for each of its objects. The K (strictly k0) struct contains fields for reference counting, a type descriptor, optionally a count (for vector types) and then the payload. The function sets up some stack storage before writing the MsrInOut values computed by pmc.q to memory for later execution by the MSR kernel driver. After loading the driver it delegates the test set-up to the run_test_internal function.
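If you haven’t used the kdb+ C API before, the accessors the code below leans on are worth a quick look: x->n is a vector’s length, x->i the payload of an int atom, kI and kJ expose int and long vector payloads, ktn allocates a new vector and knk builds a mixed list. Here’s a minimal sketch (mine, not part of libpmc.c) showing them in isolation:

#define KXVER 3
#include "k.h"      /* kdb+ C API header, available from code.kx.com */

/* Minimal sketch: sum a simple int vector and return the total as a
   long atom, using the same accessors the runtest code below relies on. */
K sumints(K v)
{
    long long total = 0;
    long long i;
    if (v->t != KI)                  /* expect a simple int vector */
        return krr((S)"type");
    for (i = 0 ; i < v->n ; i++)     /* v->n is the vector length */
        total += kI(v)[i];           /* kI exposes the int payload */
    return kj(total);                /* kj builds a long atom */
}

From q you would load it exactly as runtest is loaded above, just with an argument count of 1.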
libpmc.c.fragment
K runtest(K opv, K ecxv, K eaxv, K edxv, K testCount)
{
    struct MsrInOut s_pmc_reset[9];
    struct MsrInOut s_pmc_read[9];
    unsigned long long s_ffc_fixed[FFC_COUNT];
    unsigned long long s_pmc_fixed[PMC_COUNT];
    struct MsrInOut *ptr;
    int i;
    long long count;
    K result;

    // set the global (static) pointers
    ffc_fixed = s_ffc_fixed;
    pmc_fixed = s_pmc_fixed;
    pmc_reset = s_pmc_reset;
    pmc_read = s_pmc_read;
    ptr = pmc_cfg = (struct MsrInOut*)malloc((opv->n + 1) * sizeof(struct MsrInOut));
    if (pmc_cfg == NULL) {
        perror("malloc");
        return (K)0;
    }

    record_reset();
    record_read();

    // record the PMC instructions to memory
    count = opv->n;
    for (i = 0 ; i < count ; i++) {
        wr_msrio(ptr++, kI(opv)[i], kI(ecxv)[i], kI(eaxv)[i], kI(edxv)[i]);
    }
    msr_wr_stop(ptr++);

    loadDriver();
    if (fd == -1) {
        return (K)0;
    }

    result = run_test_internal(testCount->i);

    // disable and zero the PMC MSRs
    ioctl(fd, IOCTL_MSR_CMDS, (long long)s_pmc_reset);

    // return the dynamically allocated memory
    free(pmc_cfg);
    // close the MSR driver
    closeDriver(fd);
    return result;
}
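A few things in runtest are defined elsewhere: fd is the file descriptor for the MSR driver’s device node, opened by loadDriver and closed by closeDriver, and MsrInOut (together with IOCTL_MSR_CMDS) comes from the kernel module described in the first post of the series. As a rough reminder, the supporting pieces look something like the sketch below; the device path is a placeholder and the field layout is from memory, so defer to the kernel-module post for the real definitions:

#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

struct MsrInOut {                    /* one batched MSR command (sketch) */
    unsigned int op;                 /* read / write / stop, set by wr_msrio */
    unsigned int ecx;                /* the MSR address */
    union {
        struct {
            unsigned int eax;        /* low 32 bits */
            unsigned int edx;        /* high 32 bits */
        };
        unsigned long long value;    /* the 64-bit counter value read back */
    };
};

static int fd = -1;                  /* handle onto the MSR batch driver */

static void loadDriver(void)
{
    fd = open("/dev/msrdrv", O_RDWR);    /* placeholder device node */
    if (fd == -1)
        perror("open");
}

static void closeDriver(int fd)
{
    close(fd);
}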
The following segment of libpmc.c shows the run_test_internal function as well as the controller functions for starting and stopping the PMC counters. The run_test_internal function zeroes the accumulators into which the baseline fixed-cost values are written, then creates the result vectors before delegating execution to the test harness itself.
libpmc.c.fragment
#define FFC_COUNT 3
#define PMC_COUNT 4

extern void execute_baseline(int times, void (*start_counters)(void), void (*stop_counters)(void));
extern void execute_test(void (*start_counters)(void), void (*stop_counters)(void));

void start_counters()
{
    ioctl(fd, IOCTL_MSR_CMDS, (long long)pmc_cfg);
    return;
}

void stop_counters()
{
    ioctl(fd, IOCTL_MSR_CMDS, (long long)pmc_read);
    return;
}

void start_baseline()
{
    ioctl(fd, IOCTL_MSR_CMDS, (long long)pmc_cfg);
    return;
}

void stop_baseline()
{
    ioctl(fd, IOCTL_MSR_CMDS, (long long)pmc_read);
    return;
}

static K run_test_internal(int testCount)
{
    int i;
    K result, kffc[3], kpmc[4];

    for (i = 0 ; i < PMC_COUNT ; i++)
        pmc_fixed[i] = 0;
    for (i = 0 ; i < FFC_COUNT ; i++)
        ffc_fixed[i] = 0;

    ioctl(fd, IOCTL_MSR_CMDS, (long long)pmc_reset);
    execute_baseline(testCount, &start_baseline, &stop_baseline);
    pmc_fixed[0] = pmc_read[1].value / testCount;
    pmc_fixed[1] = pmc_read[2].value / testCount;
    pmc_fixed[2] = pmc_read[3].value / testCount;
    pmc_fixed[3] = pmc_read[4].value / testCount;
    ffc_fixed[0] = pmc_read[5].value / testCount;
    ffc_fixed[1] = pmc_read[6].value / testCount;
    ffc_fixed[2] = pmc_read[7].value / testCount;

    for (i = 0 ; i < PMC_COUNT ; i++)
        kpmc[i] = ktn(KJ, testCount);

    for (i = 0 ; i < FFC_COUNT ; i++)
        kffc[i] = ktn(KJ, testCount);

    for (i = 1 ; i < 1 + PMC_COUNT + FFC_COUNT ; i++)
        pmc_read[i].value = 0;

    for (i = 0 ; i < testCount ; i++) {
        ioctl(fd, IOCTL_MSR_CMDS, (long long)pmc_reset);
        execute_test(&start_counters, &stop_counters);
        kJ(kpmc[0])[i] = pmc_read[1].value - pmc_fixed[0];
        kJ(kpmc[1])[i] = pmc_read[2].value - pmc_fixed[1];
        kJ(kpmc[2])[i] = pmc_read[3].value - pmc_fixed[2];
        kJ(kpmc[3])[i] = pmc_read[4].value - pmc_fixed[3];
        kJ(kffc[0])[i] = pmc_read[5].value - ffc_fixed[0];
        kJ(kffc[1])[i] = pmc_read[6].value - ffc_fixed[1];
        kJ(kffc[2])[i] = pmc_read[7].value - ffc_fixed[2];
    }
    result = knk(7, kffc[0], kffc[1], kffc[2], kpmc[0], kpmc[1], kpmc[2], kpmc[3]);
    return result;
}
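One thing worth spelling out is the indexing convention baked into run_test_internal: slots 1 to 4 of pmc_read receive the four programmable counters and slots 5 to 7 the three fixed-function counters, which is why the zeroing loop starts at 1. A commented sketch of that layout (the labels are mine, and the meaning of slot 0 is my assumption about what record_read places there):

/* Index layout of pmc_read[] as implied by run_test_internal above. */
enum {
    SLOT_STOP = 0,                                      /* assumed: the command that freezes the counters */
    SLOT_PMC0 = 1, SLOT_PMC1, SLOT_PMC2, SLOT_PMC3,     /* IA32_PMC0..3 */
    SLOT_FFC0 = 5, SLOT_FFC1, SLOT_FFC2                 /* fixed-function counters */
};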
Getting a representative fixed-cost baseline
The run_test_internal function delegates the generation of baseline values for the fixed costs of starting and stopping the performance counters to an external function. That function should simulate as closely as possible the “shoe-leather” costs of invoking the start_baseline and stop_baseline functions. If the code under test is short, with no external dependencies on its output, you could get away with providing an implementation of execute_baseline which does no more than the following:
void execute_baseline(int times, void (*start_baseline)(void), void (*stop_baseline)(void))
{
    int i;
    for (i = 0 ; i < times ; i++) {
        start_baseline();
        stop_baseline();
    }
}
On the other hand, if you have a longer piece of code and you care about preserving the state of the registers, you need an execute_baseline implementation which reflects the stores from register to stack of any data in non-durable registers, in the same way the compiler does for the code under test once the calls to start_counters and stop_counters are introduced. By “non-durable”, I mean data stored in the SSE or AVX registers, as well as general-purpose registers not in the set { rbx, rbp, rsp, r12, r13, r14, r15 }.
The whole point of passing function pointers into the test to control the performance monitoring is that it makes it possible to invoke them around the smallest sections of your code and, give or take some jitter, measure its performance. For example, I’ve listed below an entirely hideous GNU assembler macro which pushes all of the non-durable registers and some of the xmm registers onto the stack, and another to do the reverse, and so my execute_baseline implementation invokes those macros between the calls to start_baseline and stop_baseline. Not rocket science, but something you might only notice if you were looking at your compiler’s assembly output. Put another way, introducing the start_counters and stop_counters calls into your code has side effects which you should take into account.
Example of a slightly elaborate baseline implementation
.macro m_save_regs
movaps %xmm0, -0x10(%rsp)
movaps %xmm1, -0x20(%rsp)
movaps %xmm2, -0x30(%rsp)
movaps %xmm3, -0x40(%rsp)
movq %rax, -0x48(%rsp)
movq %rbx, -0x50(%rsp)
movq %rcx, -0x58(%rsp)
movq %rdx, -0x60(%rsp)
movq %rdi, -0x68(%rsp)
movq %rsi, -0x70(%rsp)
movq %r8, -0x78(%rsp)
movq %r9, -0x80(%rsp)
movq %r12, -0x88(%rsp)
movq %r13, -0x90(%rsp)
movq %r14, -0x98(%rsp)
movq %r15, -0xa0(%rsp)
sub $0xa0, %rsp
.endm
.macro m_restore_regs
add $0xa0, %rsp
movq -0xa0(%rsp), %r15
movq -0x98(%rsp), %r14
movq -0x90(%rsp), %r13
movq -0x88(%rsp), %r12
movq -0x80(%rsp), %r9
movq -0x78(%rsp), %r8
movq -0x70(%rsp), %rsi
movq -0x68(%rsp), %rdi
movq -0x60(%rsp), %rdx
movq -0x58(%rsp), %rcx
movq -0x50(%rsp), %rbx
movq -0x48(%rsp), %rax
movaps -0x40(%rsp), %xmm3
movaps -0x30(%rsp), %xmm2
movaps -0x20(%rsp), %xmm1
movaps -0x10(%rsp), %xmm0
.endm
.section .text
.globl execute_baseline
.type execute_baseline, STT_FUNC
# void execute_baseline(
# int times,
# void (*start_counters)(void),
# void (*stop_counters)(void)
# );
execute_baseline:
push %rbp
cmp $0, %rdi
je .LloopEnd
.LloopStart:
m_save_regs
call *%rsi
m_restore_regs
m_save_regs
call *%rdx
m_restore_regs
sub $1, %rdi
jg .LloopStart
.LloopEnd:
popq %rbp
ret
Example test-harness code
The following is a simple example of how you might implement a test to profile the performance of the gettimeofday function. It really is that simple.
example_test_harness.c
#include <sys/time.h>
#include <stdlib.h>
void execute_test(void (*start_counters)(void), void (*stop_counters)(void))
{
    struct timeval tv;
    int i;
    for (i = 1 ; i < 10 ; i++) {
        start_counters();
        gettimeofday(&tv, NULL);
        stop_counters();
    }
}
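If the thing you want to measure isn’t a library call, the shape is exactly the same: put whatever you care about between the two calls. Here’s another, purely illustrative, harness which times a 4KB memcpy; the function name and signature are the ones libpmc.c expects, but the body is just an example:

#include <string.h>

/* Purely illustrative alternative harness: profile a 4KB memcpy.
   Only the code between the two calls is counted. */
void execute_test(void (*start_counters)(void), void (*stop_counters)(void))
{
    static char src[4096], dst[4096];
    int i;
    for (i = 0 ; i < 10 ; i++) {
        start_counters();
        memcpy(dst, src, sizeof(dst));
        stop_counters();
    }
}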
Next time I’ll put it all together and run some performance tests…