;
; File        : PROFILE.INC
; Description : Profiling macros (MSR and TSC)
; Notes       :
;
;


;
; DATA
;
_DATA	segment use32 dword public 'DATA'
	assume	ds:_DATA

	; Every symbol below is a 32-bit variable that the profiling macros
	; read and write. Only the declarations appear here; no storage is
	; reserved in this part of the file.

	; StartTSC stores the starting count here; EndTSC replaces it with
	; the elapsed count.
	global	_TSC:dword

	; StartProfile stores the starting values of MSR timers 0 and 1
	; here; EndProfile replaces them with the final results.
	global	_prof0:dword, _prof1:dword

	; Measured read-out overhead of timers 0 and 1, subtracted from the
	; results by EndProfile.
	global	_profsub0:dword, _profsub1:dword

	; Dummy operands, subtracted only while the overhead itself is being
	; timed so that pass executes the same instructions as EndProfile.
	global	_profdsub0:dword, _profdsub1:dword

ends

; Basic macro replacements
;
; In case your assembler doesn't support these 586 instructions.
;

; Msr stuff..
; index = ecx
; value = edx:eax
;RDMSR   MACRO
;	db 0fh,032h
;endm
;
;WRMSR   MACRO
;	db 0fh,030h
;endm
;
;RDTSC   MACRO
;	db 0fh,031h
;endm



;
; Macros for the TSC cycle counter
; These get 'proper' cycle counts from the TSC. The other timing macros
; are event based; these TSC macros hopefully get true cycle counts from
; code.
;
; There may be a better way to do this, but this works.
; The imul completely stalls the pipeline :) and makes sure the instruction
; pairing is predictable.
;
; Any instruction that stalls the pipeline would do, like another
; pushad/popad would work just fine...
;
; I have found that most instructions can pair with RDTSC..
; eg. cdq does.. but using two cdq's in a row would stall the pipe.
;
; Of course there may always be another explanation for this :)
;
; Try using these macros after a CALL and after some code and see how
; changing the imul affects things.
;
;
; StartTSC - record the starting time-stamp count.
;
; Stores the low 32 bits of the TSC (eax after RDTSC) in _TSC; the high
; half in edx is discarded. Pair it with EndTSC, which turns _TSC into an
; elapsed count.
;
; In:    nothing
; Out:   _TSC = low dword of the TSC at the start of the timed code
; Clobb: flags (imul); all general registers are saved and restored with
;        pushad/popad
;
StartTSC	MACRO

  ; Try removing the following pushad/popad, then using
  ; StartTSC/EndTSC just after a CALL.. And you'll see the
  ; stack cache-miss penalty..
  ;

  pushad		; touch the stack so it is cached (for the popad after RDTSC)
  popad

  pushad		; save caller's registers for real

  mov eax,_TSC		; touch _TSC so the store below hits the cache

  imul _TSC		; stall pipeline (product is unused). RDTSC executes in U-pipe...
  ;cdq			; these also work.
  ;cdq

  db 0fh,031h		; RDTSC - get start count (edx:eax)
  mov _TSC,eax		; save start count (low dword only)

  popad			; restore caller's registers

ENDM

; EndTSC - finish a measurement started with StartTSC.
;
; Reads the TSC again and replaces _TSC with
;     (end count) - 27 - (start count)
; using the low 32 bits only. The 27 is the fixed cycle overhead of the
; StartTSC/EndTSC sequences themselves, as chosen by the author.
;
; In:    _TSC = start count stored by StartTSC
; Out:   _TSC = elapsed count with the macro overhead removed
; Clobb: flags (imul, sub); all general registers are saved and restored
;        with pushad/popad
;
EndTSC		MACRO

  ; Note.. the pushad is affected by the cache.
  ; no way to get around this one :(
  ; Just remember this: if your code pushes cached stack values out of
  ; the cache then you'll have an inaccuracy.

  pushad		; does not pair.. so stalls pipeline..


  imul edx		; product is unused. This ISN'T needed, but it's
			; here just in case the pushad decides to pair
			; with RDTSC.. which I think it can..

  db 0fh,031h		; RDTSC - get end count (edx:eax)
  sub eax,27 		; remove the macros' own overhead cycles
  sub eax,_TSC		; end - start = elapsed count
  mov _TSC,eax		; save elapsed count (low dword only)
  popad			; restore caller's registers
ENDM


; One-letter shorthands for the TSC macros, so a stretch of code can be
; bracketed with just "s" and "e".
; NOTE(review): such short names can easily clash with symbols in the
; including file - confirm before relying on them.
s	equ StartTSC
e	equ EndTSC



;
; General MSR timing macros
;
; Timer bits:
;  0-5  - Event
;  6    - Count system overhead (CPL 0-2)
;  7    - Count user code (CPL 3)
;  8    - 0 = Count events, 1 = count cycles
;  9    - 0 = show counter incs, 1 = show counter overflows

; Control flags for one timer, written as (value shl bit position).
; OR the wanted flags together with an event number in bits 0-5 to
; build the TIMER0 / TIMER1 arguments of SetMSRTimers and StartProfile.

SYS_CODE      = 1 shl 6		; bit 6: count system overhead (CPL 0-2)
USER_CODE     = 1 shl 7		; bit 7: count user code (CPL 3)

COUNT_EVENTS  = 0 shl 8		; bit 8 clear: count events
COUNT_CYCLES  = 1 shl 8		; bit 8 set:   count cycles

PM0_INCS      = 0 shl 9		; bit 9 clear: show counter increments
PM0_OVERFLOWS = 1 shl 9		; bit 9 set:   show counter overflows

; SetMSRTimers - program both MSR timers in one write to MSR 11h.
;
; TIMER0, TIMER1 : 16-bit operands (each is loaded with "mov ax,...")
;                  giving the event number and control flags for timer 0
;                  and timer 1. SYS_CODE is always OR'ed into both.
;
; The value written is edx:eax with edx = 0 and
;     eax = ((TIMER1 or SYS_CODE) shl 16) or (TIMER0 or SYS_CODE)
; so timer 0 is configured by the low word and timer 1 by the high word.
;
; Trashes eax ecx edx (and flags)
;
SetMSRTimers	MACRO	TIMER0,TIMER1
  mov ecx,011h		; MSR 11h
  xor edx,edx		; top 32bits empty
  mov ax,TIMER1		; timer #1 settings (moved to the high word below)
  or  ax,SYS_CODE	; also count system code (CPL 0-2)
  shl eax,16		; timer #1 -> bits 16-31; old upper half of eax is shifted out
  mov ax,TIMER0		; timer #0 settings stay in the low word
  or  ax,SYS_CODE	; also count system code (CPL 0-2)
  db 0fh,030h		; WRMSR
ENDM

; ReadMSRTimer0 - read MSR 12h (timer #0).
;
; Out:   edx:eax = value read by RDMSR (callers in this file use eax only)
; Clobb: ecx (loaded with the MSR index), eax and edx (overwritten with
;        the result)
ReadMSRTimer0	MACRO
  mov ecx,012h		; Timer #0
  db 0fh,032h		; RDMSR
ENDM

; ReadMSRTimer1 - read MSR 13h (timer #1).
;
; Out:   edx:eax = value read by RDMSR (callers in this file use eax only)
; Clobb: ecx (loaded with the MSR index), eax and edx (overwritten with
;        the result)
ReadMSRTimer1	MACRO
  mov ecx,013h		; Timer #1
  db 0fh,032h		; RDMSR
ENDM

;
; Macros to profile some code..
;
; Note: The original registers are restored at the end... this may bugger
; up return values if you're profiling an entire function
;
;
; StartProfile - calibrate the read-out overhead of both MSR timers, then
; start a real measurement. Finish it with EndProfile.
;
; TIMER0, TIMER1 : event/flag words, passed through unchanged to the
;                  timer-setup macro (which is expanded twice)
;
; Out:   _profsub0/_profsub1 = measured overhead of timer 0 / timer 1
;        _prof0/_prof1       = starting values of timer 0 / timer 1
; Clobb: flags; all general registers are saved and restored with
;        pushad/popad around each phase
;
; Only the low 32 bits (eax) of each timer reading are used.
;
StartProfile MACRO TIMER0 , TIMER1

	; first get overhead for each timer..
	;
	; this is by no means perfect, but does allow both timers
	; to produce the same results. Without this the timers
	; would produce different results when timing the same
	; thing (because timer1 also times the read of timer0)
	;
	; if I could get invd (invalidate cache) to work then almost
	; perfect times could be deduced, but either emm386 or dos4/g
	; won't let me use it.. (causes an exception)
	;

	; Phase 1: same sequence as the real start below, but the start
	; values go into _profsub0/1.

	;pushad				; save regs twice
	pushad
	SetMSRTimers TIMER0 , TIMER1	; setup timers
	ReadMSRTimer0
	mov _profsub0,eax		; save start of timer0
	ReadMSRTimer1
	mov _profsub1,eax		; save start of timer1
	popad				; get back original regs

	; timed code here.. none just timing overhead

	; Phase 2: same instruction shape as EndProfile (read, two subs,
	; store) so the empty measurement costs what a real one does.
	; The dummy subs stand in for EndProfile's "sub overhead" step.
	; NOTE(review): presumably _profdsub0/1 hold zero - confirm where
	; they are defined.

	pushad

	ReadMSRTimer0
	sub eax,_profdsub0		; dummy sub
	sub eax,_profsub0		; sub start value
	mov _profsub0,eax		; save overhead of timer0
	ReadMSRTimer1
	sub eax,_profdsub1		; dummy sub
	sub eax,_profsub1		; sub start value
	mov _profsub1,eax		; save overhead of timer1

	popad				; get back original regs


	; now actually start the timings for real..
	;
	; Phase 3: reprogram the timers and record their start values.

	;pushad
	pushad
	SetMSRTimers TIMER0 , TIMER1
	ReadMSRTimer0
	mov _prof0,eax			; save start of timer0
	ReadMSRTimer1
	mov _prof1,eax			; save start of timer1
	popad				; get back original regs
ENDM

; EndProfile - finish a measurement started with StartProfile.
;
; For each timer it computes
;     (current reading) - (overhead in _profsubN) - (start value in _profN)
; on the low 32 bits and stores the result back in _profN.
;
; In:    _prof0/_prof1, _profsub0/_profsub1 as left by StartProfile
; Out:   _prof0 = result for timer 0, _prof1 = result for timer 1
; Clobb: flags; all general registers are saved and restored with
;        pushad/popad
;
EndProfile MACRO
	pushad

	ReadMSRTimer0
	sub eax,_profsub0		; sub overhead time
	sub eax,_prof0			; sub start time
	mov _prof0,eax			; save timer0
	ReadMSRTimer1
	sub eax,_profsub1		; sub overhead time
	sub eax,_prof1			; sub start time
	mov _prof1,eax			; save timer1

	popad				; finally restore the original regs
ENDM
