/* p5prof.h -- Pentium MSR profiling macros; see the notes below. */
/*********************************************************
*
* File: p5prof.h
* By: Kevin Baca
*
* MODIFIED BY Fab SO THAT RDMSR(...) WRITES EDX:EAX TO A LONG LONG
* (WHICH MEANS WRITE THE LOW DWORD FIRST)
*
* Now in yer code do:
* long long count,total;
*
* ...
* RDMSR(0x10,&count); //inner loop count
* total += count;
* ...
*
* printf("0x%x %x", *((int *)&total+1), (int)total );
* // HIGH LOW
*
*********************************************************/
/*********************************************************
* This file provides macros to profile your code.
* Here's how they work...
*
* As you may or may not know, the Pentium class of
* processors provides extremely fine grained profiling
* capabilities through the use of what are called
* Machine Specific Registers (MSRs). These registers
* can provide information about almost any aspect of
* CPU performance down to a single cycle.
*
* The MSRs of interest for profiling are specified by
* indices 0x10, 0x11, 0x12, and 0x13. Here is a brief
* description of each of these registers:
*
* MSR 0x10
* This register is simply a cycle counter.
*
* MSR 0x11
* This register controls what type of profiling data
* will be gathered.
*
* MSRs 0x12 and 0x13
* These registers gather the profiling data specified in
* MSR 0x11.
*
* Each MSR is 64 bits wide. For the Pentium processor,
* only the lower 32 bits of MSR 0x11 are valid. Bits 0-15
* specify what data will be gathered in MSR 0x12. Bits 16-31
* specify what data will be gathered in MSR 0x13. Both sets
* of bits have the same format:
*
* Bits 0-5 specify which hardware event will be tracked.
* Bit 6, if set, indicates events will be tracked in
* rings 0-2.
* Bit 7, if set, indicates events will be tracked in
* ring 3.
* Bit 8, if set, indicates cycles should be counted for
* the specified event. If clear, it indicates the
* number of events should be counted.
*
* Two instructions are provided for manipulating the MSRs.
* RDMSR (Read Machine Specific Register) and WRMSR
* (Write Machine Specific Register). These opcodes were
* originally undocumented and therefore most assemblers don't
* recognize them. Their byte codes are provided in the
* macros below.
*
* RDMSR takes the MSR index in ecx and returns the profile data
* in edx:eax.
*
* WRMSR takes the MSR index in ecx and the profiling criteria
* in edx:eax.
*
* Two profiling registers limits profiling capability to
* gathering only two types of information. The register
* usage can, however, be combined in interesting ways.
* For example, you can set one register to gather the
* number of a specific type of event while the other gathers
* the number of cycles for the same event. Or you can
* gather the number of two separate events while using
* MSR 0x10 to gather the number of cycles.
*
* The enumerated list provides somewhat readable labels for
* the types of events that can be tracked.
*
* For more information, get ahold of appendix H from the
* Intel Pentium programmer's manual (I don't remember the
* order number) or go to
* http://green.kaist.ac.kr/jwhahn/art3.htm.
* That's an article by Terje Mathisen where I got most of
* my information.
*
* You may use this code however you wish. I hope it's
* useful and I hope I got everything right.
*
* -Kevin
*
*
*********************************************************/
#ifdef __GNUC__
/*
 * RDTSC(_dst) -- execute the instruction encoded as bytes 0x0F,0x31 and
 * store its EDX:EAX result through _dst.
 *
 * _dst must point to two consecutive 32-bit words.  EDX (the high dword)
 * is written to _dst[0] and EAX (the low dword) to _dst[1].  Note that
 * this is the opposite of the order RDMSR() uses, so the two words are
 * NOT laid out as a native little-endian 64-bit integer.
 *
 * Implementation notes:
 *  - each template line is its own string literal ending in "\n\t";
 *    a single literal may not span raw source lines;
 *  - the pointer is addressed through operand %0 (constraint "D"), so the
 *    register width follows the pointer width of the target;
 *  - only EAX and EDX are listed as clobbers (an input register must not
 *    also be named as a clobber); "memory" tells the compiler that the
 *    asm writes through _dst.
 */
#define RDTSC(_dst) \
__asm__ __volatile__( \
    ".byte 0x0F,0x31\n\t" \
    "movl %%edx,(%0)\n\t" \
    "movl %%eax,4(%0)" \
    : : "D" (_dst) : "eax", "edx", "memory")
/*
 * RDMSR(_msri, _msrd) -- read the machine specific register whose index is
 * _msri (passed in ECX) using the instruction encoded as bytes 0x0F,0x32,
 * and store the EDX:EAX result through _msrd.
 *
 * _msrd must point to 8 writable bytes.  EAX (the low dword) is stored at
 * offset 0 and EDX (the high dword) at offset 4, so _msrd may point at a
 * 64-bit integer on a little-endian target.  (An earlier revision stored
 * EDX first; the order was swapped for that reason.)
 *
 * Implementation notes:
 *  - each template line is its own string literal ending in "\n\t";
 *    a single literal may not span raw source lines;
 *  - the pointer is addressed through operand %1 (constraint "D");
 *  - ECX and EDI are inputs and are not modified by the template, so they
 *    are not listed as clobbers; "memory" tells the compiler that the asm
 *    writes through _msrd.
 */
#define RDMSR(_msri, _msrd) \
__asm__ __volatile__( \
    ".byte 0x0F,0x32\n\t" \
    "movl %%eax,(%1)\n\t" \
    "movl %%edx,4(%1)" \
    : : "c" (_msri), "D" (_msrd) : "eax", "edx", "memory")
/*
 * WRMSR(_msri, _msrd) -- write the 32-bit value _msrd (passed in EAX) to
 * the machine specific register whose index is _msri (passed in ECX),
 * using the instruction encoded as bytes 0x0F,0x30.
 *
 * EDX is zeroed before the write, so the upper dword of EDX:EAX is always
 * written as 0.
 *
 * Implementation notes:
 *  - each template line is its own string literal ending in "\n\t";
 *    a single literal may not span raw source lines;
 *  - EAX and ECX are inputs and are not modified by the template, so only
 *    EDX is listed as a clobber.
 */
#define WRMSR(_msri, _msrd) \
__asm__ __volatile__( \
    "xorl %%edx,%%edx\n\t" \
    ".byte 0x0F,0x30" \
    : : "c" (_msri), "a" (_msrd) : "edx")
/*
 * RDMSR_0x12_0x13(_msr12, _msr13) -- read MSR 0x12 and then MSR 0x13
 * (instruction bytes 0x0F,0x32 with ECX loaded inside the template) and
 * store each EDX:EAX result through the matching pointer.
 *
 * Each pointer must address two consecutive 32-bit words.  EDX (the high
 * dword) is written to word 0 and EAX (the low dword) to word 1.  This is
 * the opposite of the order RDMSR() uses, so the words are NOT laid out as
 * a native little-endian 64-bit integer.
 *
 * Implementation notes:
 *  - each template line is its own string literal ending in "\n\t";
 *    a single literal may not span raw source lines;
 *  - the pointers are addressed through operands %0 ("D") and %1 ("S");
 *  - EAX, ECX and EDX are overwritten by the template and are listed as
 *    clobbers; the input registers are not.  "memory" tells the compiler
 *    that the asm writes through both pointers.
 */
#define RDMSR_0x12_0x13(_msr12, _msr13) \
__asm__ __volatile__( \
    "movl $0x12,%%ecx\n\t" \
    ".byte 0x0F,0x32\n\t" \
    "movl %%edx,(%0)\n\t" \
    "movl %%eax,4(%0)\n\t" \
    "movl $0x13,%%ecx\n\t" \
    ".byte 0x0F,0x32\n\t" \
    "movl %%edx,(%1)\n\t" \
    "movl %%eax,4(%1)" \
    : : "D" (_msr12), "S" (_msr13) : "eax", "ecx", "edx", "memory")
/*
 * ZERO_MSR_0x12_0x13() -- write zero to MSR 0x12 and then to MSR 0x13.
 *
 * EDX and EAX are cleared once, then the instruction encoded as bytes
 * 0x0F,0x30 is issued with ECX = 0x12 and again with ECX = 0x13.
 *
 * Each template line is its own string literal ending in "\n\t"; a single
 * literal may not span raw source lines.  EAX, ECX and EDX are overwritten
 * by the template and are listed as clobbers.
 */
#define ZERO_MSR_0x12_0x13() \
__asm__ __volatile__( \
    "xorl %%edx,%%edx\n\t" \
    "xorl %%eax,%%eax\n\t" \
    "movl $0x12,%%ecx\n\t" \
    ".byte 0x0F,0x30\n\t" \
    "movl $0x13,%%ecx\n\t" \
    ".byte 0x0F,0x30" \
    : : : "eax", "ecx", "edx")
#elif defined(__WATCOMC__)
/*
 * RDTSC (Watcom inline-assembly version).
 *
 * Emits the instruction bytes 0x0F,0x31, then stores EDX to dst[0] and
 * EAX to dst[1].  dst is passed in EDI and must have room for two 32-bit
 * words.  EAX, EDX and EDI are declared as modified.
 */
extern void RDTSC(unsigned int *dst);
#pragma aux RDTSC =\
"db 0x0F,0x31"\
"mov [edi],edx"\
"mov [4+edi],eax"\
parm [edi]\
modify [eax edx edi];
/*
 * RDMSR (Watcom inline-assembly version).
 *
 * Emits the instruction bytes 0x0F,0x32 with the MSR index msri in ECX,
 * then stores EDX to msrd[0] and EAX to msrd[1].  msrd is passed in EDI
 * and must have room for two 32-bit words.  EAX, ECX, EDX and EDI are
 * declared as modified.
 *
 * NOTE(review): this stores EDX first, whereas the __GNUC__ RDMSR macro
 * in this file stores EAX first.  Confirm which word order Watcom callers
 * expect before relying on msrd as a 64-bit integer.
 */
extern void RDMSR(unsigned int msri, unsigned int *msrd);
#pragma aux RDMSR =\
"db 0x0F,0x32"\
"mov [edi],edx"\
"mov [4+edi],eax"\
parm [ecx] [edi]\
modify [eax ecx edx edi];
/*
 * WRMSR (Watcom inline-assembly version).
 *
 * Clears EDX, then emits the instruction bytes 0x0F,0x30 with the MSR
 * index msri in ECX and the value msrd in EAX.  Because EDX is cleared,
 * the upper dword of EDX:EAX is always 0.  EAX, ECX and EDX are declared
 * as modified.
 */
extern void WRMSR(unsigned int msri, unsigned int msrd);
#pragma aux WRMSR =\
"xor edx,edx"\
"db 0x0F,0x30"\
parm [ecx] [eax]\
modify [eax ecx edx];
/*
 * RDMSR_0x12_0x13 (Watcom inline-assembly version).
 *
 * Loads ECX with 0x12, emits the instruction bytes 0x0F,0x32 and stores
 * EDX to msr12[0] and EAX to msr12[1]; then repeats with ECX = 0x13,
 * storing to msr13[0] and msr13[1].  msr12 is passed in EDI and msr13 in
 * ESI; each must have room for two 32-bit words.  EAX, ECX, EDX, EDI and
 * ESI are declared as modified.
 */
extern void RDMSR_0x12_0x13(unsigned int *msr12, unsigned int *msr13);
#pragma aux RDMSR_0x12_0x13 =\
"mov ecx,0x12"\
"db 0x0F,0x32"\
"mov [edi],edx"\
"mov [4+edi],eax"\
"mov ecx,0x13"\
"db 0x0F,0x32"\
"mov [esi],edx"\
"mov [4+esi],eax"\
parm [edi] [esi]\
modify [eax ecx edx edi esi];
/*
 * ZERO_MSR_0x12_0x13 (Watcom inline-assembly version).
 *
 * Clears EDX and EAX, then emits the instruction bytes 0x0F,0x30 with
 * ECX = 0x12 and again with ECX = 0x13, i.e. writes zero to both MSRs.
 * EAX, ECX and EDX are declared as modified.
 */
extern void ZERO_MSR_0x12_0x13(void);
#pragma aux ZERO_MSR_0x12_0x13 =\
"xor edx,edx"\
"xor eax,eax"\
"mov ecx,0x12"\
"db 0x0F,0x30"\
"mov ecx,0x13"\
"db 0x0F,0x30"\
modify [eax ecx edx];
#endif
/*
 * Labels for the hardware events that can be tracked.  The value of each
 * label is the event number that goes in bits 0-5 of a counter's half of
 * MSR 0x11 (see the file header).  Values are spelled out explicitly so
 * the numbering is visible at a glance; they are the same consecutive
 * values the enumerators had implicitly.
 */
enum
{
    DataRead                   = 0x00,
    DataWrite                  = 0x01,
    DataTLBMiss                = 0x02,
    DataReadMiss               = 0x03,
    DataWriteMiss              = 0x04,
    WriteHitEM                 = 0x05,
    DataCacheLinesWritten      = 0x06,
    DataCacheSnoops            = 0x07,
    DataCacheSnoopHit          = 0x08,
    MemAccessBothPipes         = 0x09,
    BankConflict               = 0x0A,
    MisalignedDataRef          = 0x0B,
    CodeRead                   = 0x0C,
    CodeTLBMiss                = 0x0D,
    CodeCacheMiss              = 0x0E,
    SegRegLoad                 = 0x0F,
    RESERVED0                  = 0x10,
    RESERVED1                  = 0x11,
    Branch                     = 0x12,
    BTBHit                     = 0x13,
    TakenBranchOrBTBHit        = 0x14,
    PipelineFlush              = 0x15,
    InstructionsExeced         = 0x16,
    InstructionsExecedVPipe    = 0x17,
    BusUtilizationClocks       = 0x18,
    PipelineStalledWriteBackup = 0x19,
    /* "Date" is presumably a typo for "Data"; the name is kept as-is so
       existing users of the identifier still compile. */
    PipelineStalledDateMemRead = 0x1A,
    PipeLineStalledWriteEM     = 0x1B,
    LockedBusCycle             = 0x1C,
    IOReadOrWriteCycle         = 0x1D,
    NonCacheableMemRef         = 0x1E,
    AGI                        = 0x1F,
    RESERVED2                  = 0x20,
    RESERVED3                  = 0x21,
    FPOperation                = 0x22,
    Breakpoint0Match           = 0x23,
    Breakpoint1Match           = 0x24,
    Breakpoint2Match           = 0x25,
    Breakpoint3Match           = 0x26,
    HWInterrupt                = 0x27,
    DataReadOrWrite            = 0x28,
    DataReadOrWriteMiss        = 0x29
};
/*
 * Flag bits to OR with an event number when building a counter selector
 * (bit positions as described in the file header).
 */
#define PROF_EVENTS (0)       /* bit 8 clear: count the number of events */
#define PROF_CYCLES (1 << 8)  /* bit 8 set: count cycles for the event */
#define RING_012 (1 << 6)     /* bit 6: track events in rings 0-2 */
#define RING_3 (1 << 7)       /* bit 7: track events in ring 3 */
#define RING_0123 (RING_3 | RING_012)  /* track events in every ring */
/*
 * void ProfSetProfiles(unsigned int msr12, unsigned int msr13);
 *
 * Chooses what the two profiling counters record by writing MSR 0x11.
 * _msr12 is placed in bits 0-15 and _msr13 in bits 16-31 of the value
 * written.  Each argument is an event number from the enum above, OR'ed
 * with PROF_CYCLES or PROF_EVENTS and one of the RING_* flags.
 *
 * The body is wrapped in do { } while (0) so the macro is a single
 * statement and is safe in an unbraced if/else; callers terminate it with
 * a semicolon as they would a function call.  The temporary has a
 * deliberately unusual name so that it cannot capture a caller variable
 * that appears in the arguments.
 */
#define ProfSetProfiles(_msr12, _msr13)\
do {\
    unsigned int p5prof_select_ =\
        (unsigned int)(_msr12) | ((unsigned int)(_msr13) << 16);\
    WRMSR(0x11, p5prof_select_);\
} while (0)
/*
 * void ProfBeginProfiles(void);
 *
 * Starts a measurement by zeroing both profiling counters (MSR 0x12 and
 * MSR 0x13) via ZERO_MSR_0x12_0x13().
 *
 * The expansion carries no trailing semicolon: the caller's own ';'
 * terminates the statement, which keeps the macro usable in an unbraced
 * if/else.
 */
#define ProfBeginProfiles()\
ZERO_MSR_0x12_0x13()
/*
 * void ProfGetProfiles(unsigned int msr12[2], unsigned int msr13[2]);
 *
 * Reads both profiling counters (MSR 0x12 and MSR 0x13) into the two
 * caller-supplied two-word arrays via RDMSR_0x12_0x13().  The word order
 * within each array is whatever RDMSR_0x12_0x13() produces.
 *
 * The expansion carries no trailing semicolon: the caller's own ';'
 * terminates the statement, which keeps the macro usable in an unbraced
 * if/else.
 */
#define ProfGetProfiles(_msr12, _msr13)\
RDMSR_0x12_0x13(_msr12, _msr13)
/*
 * void ProfZeroTimer(void);
 *
 * Resets the cycle counter by writing 0 to MSR 0x10 via WRMSR().
 *
 * The expansion carries no trailing semicolon: the caller's own ';'
 * terminates the statement, which keeps the macro usable in an unbraced
 * if/else.
 */
#define ProfZeroTimer()\
WRMSR(0x10, 0)
/*
 * void ProfReadTimer(unsigned int timer[2]);
 *
 * Reads the cycle counter (MSR 0x10) into the two-word buffer timer via
 * RDMSR().  The order of the low and high words in timer is whatever the
 * active RDMSR() definition stores.
 *
 * The expansion carries no trailing semicolon: the caller's own ';'
 * terminates the statement, which keeps the macro usable in an unbraced
 * if/else.
 */
#define ProfReadTimer(timer)\
RDMSR(0x10, timer)
/*EOF*/