//pbKit core functions //see AFL license //#define DBG //#define LOG #include #include #include #include #include #include "pbKit.h" #include "outer.h" #include "nv_objects.h" //shared with renouveau files #include "nv20_shader.h" //(search "nouveau" on wiki) #include #include #include #include #define INSTANCE_MEM_MAXSIZE 0x5000 //20Kb #define ADDR_SYSMEM 1 #define ADDR_FBMEM 2 #define ADDR_AGPMEM 3 #define DMA_CLASS_2 2 #define DMA_CLASS_3 3 #define DMA_CLASS_3D 0x3D #define GR_CLASS_30 0x30 #define GR_CLASS_39 0x39 #define GR_CLASS_62 0x62 #define GR_CLASS_97 0x97 #define GR_CLASS_9F 0x9F #define GPU_IRQ 3 #define XTAL_16MHZ 16.6667f #define DW_XTAL_16MHZ 16666666 #define MAX_EXTRA_BUFFERS 8 #define MAXRAM 0x03FFAFFF #define NONE -1 #define TICKSTIMEOUT 100 //if Dma doesn't react in that time, send a warning #define PB_SETOUTER 0xB2A #define PB_SETNOISE 0xBAA #define PB_FINISHED 0xFAB struct s_CtxDma { DWORD ChannelID; DWORD Inst; //Addr in PRAMIN area, unit=16 bytes blocks, baseaddr=VIDEO_BASE+NV_PRAMIN DWORD Class; DWORD isGr; }; struct s_PseudoReg { int reg; int num; union { int msk; int swz; }; int mod; int idx; }; struct s_PseudoRegs { int n; struct s_PseudoReg dest; struct s_PseudoReg src0; struct s_PseudoReg src1; struct s_PseudoReg src2; }; static int pb_running=0; static DWORD pb_vbl_counter=0; #ifdef DBG static int pb_trace_mode=1; #else static int pb_trace_mode=0; #endif //if set, we wait after each block sending (pb_end) //so we are sure GPU received all the data (slower) //and that any GPU error comes from last block sent. 
static int pb_disable_gpu=0; //if set, prevents GPU from delaying CPU when FIFO is //full (allows to see how fast CPU code is fast alone) static KINTERRUPT pb_InterruptObject; static KDPC pb_DPCObject; static HANDLE pb_VBlankEvent; static DWORD pb_OldMCEnable; static DWORD pb_OldMCInterrupt; static DWORD pb_OldFBConfig0; static DWORD pb_OldFBConfig1; static DWORD pb_OldVideoStart; static DWORD *pb_DmaBuffer8; //points at 32 contiguous bytes (Dma Channel ID 8 buffer) static DWORD *pb_DmaBuffer2; //points at 32 contiguous bytes (Dma Channel ID 2 buffer) static DWORD *pb_DmaBuffer7; //points at 32 contiguous bytes (Dma Channel ID 7 buffer) static DWORD pb_Size=512*1024;//push buffer size, must be >64Kb and a power of 2 static DWORD *pb_Head; //points at push buffer head static DWORD *pb_Tail; //points at push buffer tail static DWORD *pb_Put=NULL; //where next command+params are to be written static float pb_CpuFrequency; static DWORD pb_GpuInstMem; static DWORD pb_PushBase; static DWORD pb_PushLimit; static DWORD pb_FifoHTAddr; static DWORD pb_FifoFCAddr; static DWORD pb_FifoU1Addr; static DWORD pb_3DGrCtxInst[2]={0,0};//Adress of the two 3D graphic contexts (addr=inst<<4+NV_PRAMIN) static DWORD pb_GrCtxTableInst; //Adress of the table that points at the two graphic contexts static DWORD pb_GrCtxInst[2]; //Adress of the two graphic contexts (addr=inst<<4+NV_PRAMIN) static int pb_GrCtxID; //Current context ID : 0,1 or NONE static DWORD pb_FifoBigInst; //graphic contexts are stored there, and much more (addr=inst<<4+NV_PRAMIN) static DWORD pb_FreeInst; //next free space in PRAMIN area (addr=inst<<4+NV_PRAMIN) static int pb_GammaRampIdx=0; static int pb_GammaRampbReady[3]={0,0,0}; static BYTE pb_GammaRamp[3][3][256]; static int pb_BackBufferNxt=0; static int pb_BackBufferNxtVBL=0; static int pb_BackBufferbReady[3]={0,0,0}; static int pb_BackBufferIndex[3]; static DWORD pb_FifoChannelsReady=0; static DWORD pb_FifoChannelsMode=NV_PFIFO_MODE_ALL_PIO; static DWORD 
pb_FifoChannelID=0; static DWORD pb_PutRunSize=0; static DWORD pb_GetRunSize; static DWORD pb_FrameBuffersCount; static DWORD pb_FrameBuffersWidth; static DWORD pb_FrameBuffersHeight; static DWORD pb_FrameBuffersAddr; static DWORD pb_FrameBuffersPitch; static DWORD pb_FBAddr[3]; //frame buffers addresses static DWORD pb_FBSize; //size of 1 buffer static DWORD pb_FBGlobalSize; //size of all buffers static DWORD pb_FBVFlag; static DWORD pb_GPUFrameBuffersFormat;//encoded format for GPU static DWORD pb_EXAddr[8]; //extra buffers addresses static DWORD pb_ExtraBuffersCount=0; static DWORD pb_DepthStencilAddr; static DWORD pb_DepthStencilPitch; static int pb_DepthStencilLast; static DWORD pb_DSAddr; //depth stencil address static DWORD pb_DSSize; //size of depth stencil buffer static DWORD pb_GPUDepthStencilFormat;//encoded format for GPU static int pb_front_index; static int pb_back_index; static DWORD pb_Viewport_x; static DWORD pb_Viewport_y; static DWORD pb_Viewport_width; static DWORD pb_Viewport_height; static DWORD pb_Viewport_zmin; static DWORD pb_Viewport_zmax; static float pb_XScale; static float pb_YScale; static float pb_ZScale; static float pb_GlobalScale; static float pb_Bias; static int pb_debug_screen_active; static DWORD pb_DmaChID9Inst; static DWORD pb_DmaChID10Inst; static DWORD pb_DmaChID11Inst; static DWORD *pb_DmaUserAddr; static DWORD pb_PushIndex; static DWORD *pb_PushStart; static DWORD *pb_PushNext; static int pb_BeginEndPair=0; static float pb_FixedPipelineConstants[12]={ 0.0f, 0.5f, 1.0f, 2.0f, -1.0f, 0.0f, 1.0f, 2.0f, 0.0f, 0.0f, -1.0f, 0.0f }; static float pb_IdentityMatrix[16]={ 1.0f, 0.0f, 0.0f, 0.0f, 0.0f, 1.0f, 0.0f, 0.0f, 0.0f, 0.0f, 1.0f, 0.0f, 0.0f, 0.0f, 0.0f, 1.0f }; static DWORD pb_TilePitches[16]={ 0x0200,0x0400,0x0600,0x0800, 0x0A00,0x0C00,0x0E00,0x1000, 0x1400,0x1800,0x1C00,0x2800, 0x3000,0x3800,0x5000,0x7000 }; static float pb_BiasTable[7]={ 0.0f, 0.585f, 1.0f, 1.322f, 1.585f, 1.907f, 2.0f }; //temporary storage for 
pb_pcode2mcode() static DWORD pb_gpu_programnc[136*5+192*7+8];//vertex shader micro-code setup (max:136 instructions + 192 constants) static DWORD pb_gpu_registers[6*8+7];//pixel shader registers values static int pb_tmp_registers[16];//some vertex shader macros need to find free temp registers static int pb_exp_constflag; static int pb_log_constflag; //forward references static void pb_load_gr_ctx(int ctx_id); //private pb_text_screen functions #define ROWS 16 #define COLS 60 static char pb_text_screen[ROWS][COLS]; static int pb_next_row=0; static int pb_next_col=0; static unsigned char systemFont[] = { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,56,56,56,56,56,0,56,56, 108,108,0,0,0,0,0,0,0,108,254,254,108,254,254,108, 48,126,224,124,14,254,252,48,98,230,204,24,48,102,206,140, 120,220,252,120,250,222,252,118,28,28,56,0,0,0,0,0, 14,28,28,28,28,28,28,14,112,56,56,56,56,56,56,112, 0,0,0,230,124,56,124,206,0,0,28,28,127,127,28,28, 0,0,0,0,0,28,28,56,0,0,0,0,124,124,0,0, 0,0,0,0,0,0,56,56,28,28,56,56,112,112,224,224, 124,254,238,238,238,254,254,124,56,120,248,56,56,254,254,254, 252,254,14,60,112,254,254,254,252,254,14,60,14,254,254,252, 238,238,238,254,254,14,14,14,254,254,224,252,14,254,254,252, 124,252,224,252,238,254,254,124,252,254,14,14,28,28,56,56, 124,254,238,124,238,254,254,124,124,254,238,126,14,254,254,252, 0,0,28,28,0,28,28,28,0,0,28,28,0,28,28,56, 6,14,28,56,56,28,14,6,0,0,124,124,0,124,124,124, 112,56,28,14,14,28,56,112,124,254,206,28,56,0,56,56, 
124,198,190,182,190,182,200,126,124,254,238,254,238,238,238,238, 252,254,206,252,206,254,254,252,124,254,238,224,238,254,254,124, 252,254,238,238,238,254,254,252,254,254,224,248,224,254,254,254, 126,254,224,248,224,224,224,224,126,254,224,238,238,254,254,124, 238,238,238,254,238,238,238,238,254,254,56,56,56,254,254,254, 254,254,14,14,238,254,254,124,238,238,252,248,252,238,238,238, 224,224,224,224,224,254,254,126,130,198,238,254,254,238,238,238, 206,238,254,254,254,254,238,230,124,254,238,238,238,254,254,124, 252,254,238,238,252,224,224,224,124,254,238,238,254,254,252,118, 252,254,238,238,252,238,238,238,126,254,224,124,14,254,254,252, 254,254,56,56,56,56,56,56,238,238,238,238,238,254,254,124, 238,238,238,238,238,238,124,56,238,238,238,254,254,238,198,130, 238,238,124,56,124,238,238,238,238,238,124,124,56,56,112,112, 254,254,28,56,112,254,254,254,124,124,112,112,112,124,124,124, 112,112,56,56,28,28,14,14,124,124,28,28,28,124,124,124, 56,124,238,198,0,0,0,0,0,0,0,0,0,254,254,254, 56,56,28,0,0,0,0,0,0,124,254,238,254,238,238,238, 0,252,254,206,252,206,254,252,0,124,254,238,224,238,254,124, 0,252,254,238,238,238,254,252,0,254,254,224,248,224,254,254, 0,126,254,224,248,224,224,224,0,126,254,224,238,238,254,124, 0,238,238,238,254,238,238,238,0,254,254,56,56,56,254,254, 0,254,254,14,14,238,254,124,0,238,238,252,248,252,238,238, 0,224,224,224,224,224,254,126,0,130,198,238,254,254,238,238, 0,206,238,254,254,254,238,230,0,124,254,238,238,238,254,124, 0,252,254,238,238,252,224,224,0,124,254,238,238,254,252,118, 0,252,254,238,238,252,238,238,0,126,254,224,124,14,254,252, 0,254,254,56,56,56,56,56,0,238,238,238,238,238,254,124, 0,238,238,238,238,238,124,56,0,238,238,238,254,238,198,130, 0,238,238,124,56,124,238,238,0,238,238,124,124,56,56,112, 0,254,254,28,56,112,254,254,60,124,112,112,112,124,124,60, 56,56,56,0,56,56,56,56,120,124,28,28,28,124,124,120, 236,254,118,0,0,0,0,0,0,16,56,124,254,254,254,254, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, }; static void pb_scrollup(void) { int i; for(i=0;i=ROWS) { pb_next_row=ROWS-1; pb_scrollup(); } pb_next_col=0; } else if (c=='\r') { pb_next_col=0; } else if (c==8) { pb_next_col--; if (pb_next_col<0) pb_next_col=0; } else if (c>=32) { pb_text_screen[pb_next_row][pb_next_col]=c; pb_next_col++; if (pb_next_col>=COLS) { pb_next_row++; if (pb_next_row>=ROWS) { pb_next_row=ROWS-1; pb_scrollup(); } pb_next_col=0; } } } //private functions static void pb_set_gamma_ramp(BYTE *pGammaRamp) { int i; VIDEOREG8(NV_USER_DAC_WRITE_MODE_ADDRESS)=0; //&NV_USER_DAC_WRITE_MODE_ADDRESS_VALUE for(i=0;i<256;i++) { VIDEOREG8(NV_USER_DAC_PALETTE_DATA)=pGammaRamp[i]; //&NV_USER_DAC_PALETTE_DATA_VALUE VIDEOREG8(NV_USER_DAC_PALETTE_DATA)=pGammaRamp[i+256]; //&NV_USER_DAC_PALETTE_DATA_VALUE VIDEOREG8(NV_USER_DAC_PALETTE_DATA)=pGammaRamp[i+512]; //&NV_USER_DAC_PALETTE_DATA_VALUE } } static void pb_vbl_handler(void) { BYTE old_color_addr; //important index to preserve if we are called from Dpc or Isr int flag; int next; int index; old_color_addr=VIDEOREG8(NV_PRMCIO_CRX__COLOR); pb_vbl_counter++; //Index of next back buffer to show up (0-4) next=pb_BackBufferNxtVBL; //Is the next back buffer to show up is ready? 
if (pb_BackBufferbReady[next]==1) { //screen swapping has been done already, theoretically, in ISR pb_BackBufferbReady[next]=0; index=pb_GammaRampIdx; if (pb_GammaRampbReady[index]) { pb_set_gamma_ramp(&pb_GammaRamp[index][0][0]); pb_GammaRampbReady[index]=0; index=(index+1)%3; pb_GammaRampIdx=index; } VIDEOREG(NV_PGRAPH_INCREMENT)|=NV_PGRAPH_INCREMENT_READ_3D_TRIGGER; //rotate next back buffer & gamma ramp index next=(next+1)%3; pb_BackBufferNxtVBL=next; } do { VIDEOREG(PCRTC_INTR)=PCRTC_INTR_VBLANK_RESET; }while(VIDEOREG(NV_PMC_INTR_0)&NV_PMC_INTR_0_PCRTC_PENDING); NtPulseEvent(pb_VBlankEvent, NULL); // if (UserCallback) UserCallback(); //user callback must be brief and preserve fpu state VIDEOREG8(NV_PRMCIO_CRX__COLOR)=old_color_addr; //restore color index } static void pb_cache_flush(void) { __asm__ __volatile__ ("sfence"); //assembler instruction "sfence" : waits end of previous instructions VIDEOREG(NV_PFB_WC_CACHE)|=NV_PFB_WC_CACHE_FLUSH_TRIGGER; while(VIDEOREG(NV_PFB_WC_CACHE)&NV_PFB_WC_CACHE_FLUSH_IN_PROGRESS) {}; } static void pb_subprog(DWORD subprogID, DWORD paramA, DWORD paramB) { //inner registers 0x1D8C & 0x1D90 match 2 outer registers : //[0x1D8C]=[NV20_TCL_PRIMITIVE_3D_PARAMETER_A]=VIDEOREG(NV_PGRAPH_PARAMETER_A)=[0xFD401A88] //[0x1D90]=[NV20_TCL_PRIMITIVE_3D_PARAMETER_B]=VIDEOREG(NV_PGRAPH_PARAMETER_B)=[0xFD40186C] //so they can be used by a push buffer sequence to set parameters //before triggering a subprogram by the command 0x0100 which will //throw an interrupt and have CPU execute its code right here. 
//Here just test the subprogID value and execute your own subprogram //associated code (avoid using subprogID=0, it seems to be reserved) int next; switch(subprogID) { case PB_SETOUTER: //sets an outer register VIDEOREG(paramA)=paramB; break; case PB_SETNOISE: //Dxt1NoiseEnable: copy paramA in NV_PGRAPH_RDI(sel 0xE0 adr 0x50 & sel 0xDF adr 0x08) VIDEOREG(NV_PGRAPH_RDI_INDEX)=((0xE0<<16)&NV_PGRAPH_RDI_INDEX_SELECT)|((0x50)&NV_PGRAPH_RDI_INDEX_ADDRESS); VIDEOREG(NV_PGRAPH_RDI_DATA)=paramA; VIDEOREG(NV_PGRAPH_RDI_INDEX)=((0xDF<<16)&NV_PGRAPH_RDI_INDEX_SELECT)|((0x08)&NV_PGRAPH_RDI_INDEX_ADDRESS); VIDEOREG(NV_PGRAPH_RDI_DATA)=paramA; break; case PB_FINISHED: //warns that all drawing has been finished for the frame next=pb_BackBufferNxt; pb_BackBufferIndex[next]=paramA; pb_BackBufferbReady[next]=1; next=(next+1)%3; pb_BackBufferNxt=next; break; default: debugPrint( "Unknown subProgID %d has been detected by DPC (A=%x B=%x).\n", subprogID, paramA, paramB ); break; } } static DWORD pb_gr_handler(void) { DWORD status; DWORD trapped_address; int trapped_ctx_id; DWORD nsource; DWORD GrClass; DWORD DataLow; int i; DWORD *p; VIDEOREG(NV_PGRAPH_FIFO)=NV_PGRAPH_FIFO_ACCESS_DISABLE; status=VIDEOREG(NV_PGRAPH_INTR); trapped_address=VIDEOREG(NV_PGRAPH_TRAPPED_ADDR); nsource=VIDEOREG(NV_PGRAPH_NSOURCE); trapped_ctx_id=(trapped_address&NV_PGRAPH_TRAPPED_ADDR_CHID)>>20; trapped_address&=NV_PGRAPH_TRAPPED_ADDR_MTHD; if (status&NV_PGRAPH_INTR_CONTEXT_SWITCH_PENDING) { VIDEOREG(NV_PGRAPH_INTR)=NV_PGRAPH_INTR_CONTEXT_SWITCH_RESET; while(VIDEOREG(NV_PGRAPH_STATUS)); pb_load_gr_ctx(trapped_ctx_id); } if (status&NV_PGRAPH_INTR_MISSING_HW_PENDING) { VIDEOREG(NV_PGRAPH_INTR)=NV_PGRAPH_INTR_MISSING_HW_RESET; } if ( (status&NV_PGRAPH_INTR_NOTIFY_PENDING)|| (status&NV_PGRAPH_INTR_ERROR_PENDING) ) { if (nsource&NV_PGRAPH_NSOURCE_ILLEGAL_MTHD_PENDING) { if (status&NV_PGRAPH_INTR_NOTIFY_PENDING) VIDEOREG(NV_PGRAPH_INTR)= NV_PGRAPH_INTR_NOTIFY_RESET| NV_PGRAPH_INTR_ERROR_RESET| 
NV_PGRAPH_INTR_SINGLE_STEP_RESET| NV_PGRAPH_INTR_MORE_RESET; else VIDEOREG(NV_PGRAPH_INTR)=NV_PGRAPH_INTR_ERROR_RESET; } } status=VIDEOREG(NV_PGRAPH_INTR); if (status) { VIDEOREG(NV_PGRAPH_INTR)=NV_PGRAPH_INTR_CONTEXT_SWITCH_RESET; if ( (status!=NV_PGRAPH_INTR_CONTEXT_SWITCH_PENDING)&& (status!=NV_PGRAPH_INTR_SINGLE_STEP_PENDING) ) { if (status&NV_PGRAPH_INTR_MISSING_HW_PENDING) { while(VIDEOREG(NV_PGRAPH_STATUS)) {}; } if (nsource) { if ( (status&NV_PGRAPH_INTR_NOTIFY_PENDING)|| (status&NV_PGRAPH_INTR_ERROR_PENDING) ) { GrClass=VIDEOREG(NV_PGRAPH_CTX_SWITCH1)&NV_PGRAPH_CTX_SWITCH1_GRCLASS; DataLow=VIDEOREG(NV_PGRAPH_TRAPPED_DATA_LOW); //&NV_PGRAPH_TRAPPED_DATA_LOW_VALUE if ((nsource&NV_PGRAPH_NSOURCE_ILLEGAL_MTHD_PENDING)==0) { if (trapped_address==0x0100) { //The following line may be a bad idea. But without it, interrupt fires permanently... VIDEOREG(NV_PGRAPH_INTR)=NV_PGRAPH_INTR_ERROR_RESET; //calls subprogram pb_subprog(DataLow,VIDEOREG(NV_PGRAPH_PARAMETER_A),VIDEOREG(NV_PGRAPH_PARAMETER_B)); } else { pb_show_debug_screen(); debugPrint("\n"); if (nsource&NV_PGRAPH_NSOURCE_DATA_ERROR_PENDING) debugPrint("GPU Error : invalid data error!\n"); if (nsource&NV_PGRAPH_NSOURCE_PROTECTION_ERROR_PENDING) debugPrint("GPU Error : protection error!\n"); if (nsource&NV_PGRAPH_NSOURCE_RANGE_EXCEPTION_PENDING) debugPrint("GPU Error : range exception error!\n"); if (nsource&NV_PGRAPH_NSOURCE_LIMIT_COLOR_PENDING) debugPrint("GPU Error : color buffer limit error!\n"); if (nsource&NV_PGRAPH_NSOURCE_LIMIT_ZETA_PENDING) debugPrint("GPU Error : zeta buffer limit error!\n"); if (nsource&NV_PGRAPH_NSOURCE_DMA_R_PROTECTION_PENDING) debugPrint("GPU Error : dma read protection error!\n"); if (nsource&NV_PGRAPH_NSOURCE_DMA_W_PROTECTION_PENDING) debugPrint("GPU Error : dma write protection error!\n"); if (nsource&NV_PGRAPH_NSOURCE_FORMAT_EXCEPTION_PENDING) debugPrint("GPU Error : format exception error!\n"); if (nsource&NV_PGRAPH_NSOURCE_PATCH_EXCEPTION_PENDING) debugPrint("GPU Error : 
patch exception error!\n"); if (nsource&NV_PGRAPH_NSOURCE_STATE_INVALID_PENDING) debugPrint("GPU Error : object state invalid error!\n"); if (nsource&NV_PGRAPH_NSOURCE_DOUBLE_NOTIFY_PENDING) debugPrint("GPU Error : double notify error!\n"); if (nsource&NV_PGRAPH_NSOURCE_NOTIFY_IN_USE_PENDING) debugPrint("GPU Error : notify in use error!\n"); if (nsource&NV_PGRAPH_NSOURCE_METHOD_CNT_PENDING) debugPrint("GPU Error : method count error!\n"); if (nsource&NV_PGRAPH_NSOURCE_BFR_NOTIFICATION_PENDING) debugPrint("GPU Error : buffer notification error!\n"); if (nsource&NV_PGRAPH_NSOURCE_DMA_VTX_PROTECTION_PENDING) debugPrint("GPU Error : DMA vertex protection error!\n"); if (nsource&NV_PGRAPH_NSOURCE_IDX_INLINE_REUSE_PENDING) debugPrint("Graphics index inline reuse error!\n"); if (nsource&NV_PGRAPH_NSOURCE_INVALID_OPERATION_PENDING) debugPrint("GPU Error : invalid operation error!\n"); if (nsource&NV_PGRAPH_NSOURCE_FD_INVALID_OPERATION_PENDING) debugPrint("GPU Error : FD invalid operation error!\n"); if (nsource&NV_PGRAPH_NSOURCE_TEX_A_PROTECTION_PENDING) debugPrint("GPU Error : texture A protection error!\n"); if (nsource&NV_PGRAPH_NSOURCE_TEX_B_PROTECTION_PENDING) debugPrint("GPU Error : texture B protection error!\n"); debugPrint( "Error binary flags : %08x\n" "Channel ID : %d (0=3D)\n" "Channel class : %x\n" "Push buffer inner register target : %04x\n" "Push buffer data (lo) or instance : %08x\n" "Push buffer data (hi) or instance : %08x\n" "Multi-purpose register A [0x1D8C] : %08x\n" "Multi-purpose register B [0x1D90] : %08x\n\n", nsource, trapped_ctx_id, GrClass, trapped_address, DataLow, VIDEOREG(NV_PGRAPH_TRAPPED_DATA_HIGH), VIDEOREG(NV_PGRAPH_PARAMETER_A), VIDEOREG(NV_PGRAPH_PARAMETER_B) ); if (pb_trace_mode==0) debugPrint("Report is accurate only if pb_trace_mode=1 (slower)\n"); debugPrint("System halted\n"); //calling XReboot() from here doesn't work well. // Halt the system with these instructions, so the CPU can idle. 
__asm__ ( "cli\n" "hlt"); } } } } if (status&NV_PGRAPH_INTR_BUFFER_NOTIFY_PENDING) { while (VIDEOREG(NV_PGRAPH_STATUS)) {}; } } } VIDEOREG(NV_PGRAPH_FIFO)=NV_PGRAPH_FIFO_ACCESS_ENABLE; return VIDEOREG(NV_PGRAPH_INTR); } static void pb_wait_until_gr_not_busy(void) { DWORD status; while(VIDEOREG(NV_PGRAPH_STATUS)!=NV_PGRAPH_STATUS_NOT_BUSY) { status=VIDEOREG(NV_PMC_INTR_0); if (status&NV_PMC_INTR_0_PGRAPH_PENDING) pb_gr_handler(); if (status&NV_PMC_INTR_0_PCRTC_PENDING) pb_vbl_handler(); } } static void pb_load_gr_ctx(int ctx_id) { DWORD old_fifo_access; DWORD dummy; int i; if (VIDEOREG(NV_PGRAPH_INTR)!=NV_PGRAPH_INTR_NOT_PENDING) pb_gr_handler(); old_fifo_access=VIDEOREG(NV_PGRAPH_FIFO); VIDEOREG(NV_PGRAPH_FIFO)=NV_PGRAPH_FIFO_ACCESS_DISABLE; pb_wait_until_gr_not_busy(); if ((ctx_id!=pb_GrCtxID)&&(ctx_id!=NONE)) { VIDEOREG(NV_PGRAPH_CHANNEL_CTX_POINTER)=pb_GrCtxInst[ctx_id]&NV_PGRAPH_CHANNEL_CTX_POINTER_INST; VIDEOREG(NV_PGRAPH_CHANNEL_CTX_STATUS)=NV_PGRAPH_CHANNEL_CTX_STATUS_UNLOADED; pb_wait_until_gr_not_busy(); VIDEOREG(NV_PGRAPH_CTX_CONTROL)=NV_PGRAPH_CTX_CONTROL_DEVICE_ENABLED; } pb_GrCtxID=ctx_id; if (ctx_id==NONE) { VIDEOREG(NV_PGRAPH_CTX_CONTROL)=NV_PGRAPH_CTX_CONTROL_DEVICE_ENABLED|NV_PGRAPH_CTX_CONTROL_TIME_NOT_EXPIRED; VIDEOREG(NV_PGRAPH_FFINTFC_ST2)=NV_PGRAPH_FFINTFC_ST2_CHID_STATUS_VALID; VIDEOREG(NV_PGRAPH_FIFO)=old_fifo_access|NV_PGRAPH_FIFO_ACCESS_ENABLE; } else { if (pb_3DGrCtxInst[ctx_id]) { VIDEOREG(NV_PGRAPH_DEBUG_0) = NV_PGRAPH_DEBUG_0_IDX_STATE_RESET| NV_PGRAPH_DEBUG_0_VTX_STATE_RESET| NV_PGRAPH_DEBUG_0_CAS_STATE_RESET; dummy=VIDEOREG(NV_PGRAPH_DEBUG_0); VIDEOREG(NV_PGRAPH_DEBUG_0)=NV_PGRAPH_DEBUG_0_NO_RESET; dummy=VIDEOREG(NV_PGRAPH_DEBUG_0); VIDEOREG(NV_PGRAPH_RDI_INDEX)=((0x00)&NV_PGRAPH_RDI_INDEX_ADDRESS)|((0x3D<<16)&NV_PGRAPH_RDI_INDEX_SELECT); for(i=0;i<15;i++) VIDEOREG(NV_PGRAPH_RDI_DATA)=0; } VIDEOREG(NV_PGRAPH_DEBUG_1)|=NV_PGRAPH_DEBUG_1_CACHE_INVALIDATE; VIDEOREG(NV_PGRAPH_CTX_USER)=(ctx_id<<24)&NV_PGRAPH_CTX_USER_CHID; 
VIDEOREG(NV_PGRAPH_CHANNEL_CTX_POINTER)=pb_GrCtxInst[ctx_id]&NV_PGRAPH_CHANNEL_CTX_POINTER_INST; VIDEOREG(NV_PGRAPH_CHANNEL_CTX_STATUS)=NV_PGRAPH_CHANNEL_CTX_STATUS_LOADED; pb_wait_until_gr_not_busy(); VIDEOREG(NV_PGRAPH_CTX_USER)=(VIDEOREG(NV_PGRAPH_CTX_USER)&~NV_PGRAPH_CTX_USER_CHID)|((ctx_id<<24)&NV_PGRAPH_CTX_USER_CHID); VIDEOREG(NV_PGRAPH_CTX_CONTROL) = NV_PGRAPH_CTX_CONTROL_TIME_NOT_EXPIRED| NV_PGRAPH_CTX_CONTROL_CHID_VALID| NV_PGRAPH_CTX_CONTROL_DEVICE_ENABLED; VIDEOREG(NV_PGRAPH_FFINTFC_ST2)&=(NV_PGRAPH_FFINTFC_ST2_CHSWITCH_CLEAR&NV_PGRAPH_FFINTFC_ST2_FIFOHOLD_CLEAR); } } static DWORD pb_fifo_handler(void) { DWORD i; DWORD status; DWORD pull; DWORD get_address; int skip_waiting; skip_waiting=0; status=VIDEOREG(NV_PFIFO_INTR_0); if (status&NV_PFIFO_INTR_0_SEMAPHORE_PENDING) { VIDEOREG(NV_PFIFO_INTR_0)=NV_PFIFO_INTR_0_SEMAPHORE_RESET; } if (status&NV_PFIFO_INTR_0_ACQUIRE_TIMEOUT_PENDING) { VIDEOREG(NV_PFIFO_INTR_0)=NV_PFIFO_INTR_0_ACQUIRE_TIMEOUT_RESET; } status=VIDEOREG(NV_PFIFO_INTR_0); if (status&NV_PFIFO_INTR_0_CACHE_ERROR_PENDING) { pull=VIDEOREG(NV_PFIFO_CACHE1_PULL0); get_address=VIDEOREG(NV_PFIFO_CACHE1_GET); //&NV_PFIFO_CACHE1_GET_ADDRESS (0x3FC) get_address>>=2; VIDEOREG(NV_PFIFO_CACHES)=NV_PFIFO_CACHES_ALL_DISABLE; VIDEOREG(NV_PFIFO_CACHE1_PULL0)=NV_PFIFO_CACHE1_PULL0_ACCESS_DISABLE; VIDEOREG(NV_PFIFO_INTR_0)=NV_PFIFO_INTR_0_CACHE_ERROR_RESET; for(i=0;i<65535;i++) { if ((pull&NV_PFIFO_CACHE1_PULL0_HASH_STATE_BUSY)==0) break; pull=VIDEOREG(NV_PFIFO_CACHE1_PULL0); } if ( (pull&NV_PFIFO_CACHE1_PULL0_DEVICE_SOFTWARE)|| (pull&NV_PFIFO_CACHE1_PULL0_HASH_FAILED) ) { VIDEOREG(NV_PFIFO_CACHE1_GET)=((get_address+1)<<2)&NV_PFIFO_CACHE1_GET_ADDRESS; } VIDEOREG(NV_PFIFO_CACHE1_HASH)=0; //&NV_PFIFO_CACHE1_HASH_INSTANCE VIDEOREG(NV_PFIFO_CACHE1_PULL0)=NV_PFIFO_CACHE1_PULL0_ACCESS_ENABLE; VIDEOREG(NV_PFIFO_CACHES)=NV_PFIFO_CACHES_REASSIGN_ENABLED; } if (status&NV_PFIFO_INTR_0_DMA_PUSHER_PENDING) { pb_show_debug_screen(); debugPrint("Software Put=%08x\n",pb_Put); 
debugPrint("Hardware Put=%08x\n",VIDEOREG(NV_PFIFO_CACHE1_DMA_PUT)); debugPrint("Hardware Get=%08x\n",VIDEOREG(NV_PFIFO_CACHE1_DMA_GET)); debugPrint("Dma push buffer engine encountered invalid data at these addresses.\n"); VIDEOREG(NV_PFIFO_INTR_0)=NV_PFIFO_INTR_0_DMA_PUSHER_RESET; VIDEOREG(NV_PFIFO_CACHE1_DMA_STATE)=NV_PFIFO_CACHE1_DMA_STATE_METHOD_COUNT_0; if (VIDEOREG(NV_PFIFO_CACHE1_DMA_PUT)!=VIDEOREG(NV_PFIFO_CACHE1_DMA_GET)) VIDEOREG(NV_PFIFO_CACHE1_DMA_GET)+=(1<<2); } if (status&NV_PFIFO_INTR_0_DMA_PT_PENDING) { VIDEOREG(NV_PFIFO_INTR_0)=NV_PFIFO_INTR_0_DMA_PT_RESET; } if (VIDEOREG(NV_PFIFO_CACHE1_DMA_PUSH)&NV_PFIFO_CACHE1_DMA_PUSH_STATE_BUSY) { if ((VIDEOREG8(NV_PFIFO_CACHE1_STATUS)&NV_PFIFO_CACHE1_STATUS_LOW_MARK_EMPTY)==0) do { if (VIDEOREG(NV_PFIFO_INTR_0)==NV_PFIFO_INTR_0_NOT_PENDING) { if (VIDEOREG(NV_PGRAPH_INTR)) pb_fifo_handler(); if (VIDEOREG(NV_PMC_INTR_0)&NV_PMC_INTR_0_PCRTC_PENDING) pb_vbl_handler(); if ((VIDEOREG8(NV_PFIFO_CACHE1_STATUS)&NV_PFIFO_CACHE1_STATUS_LOW_MARK_EMPTY)==0) continue; //jump to loop start } if ((VIDEOREG8(NV_PFIFO_CACHE1_STATUS)&NV_PFIFO_CACHE1_STATUS_LOW_MARK_EMPTY)==0) { skip_waiting=1; break; } }while(VIDEOREG8(NV_PFIFO_CACHE1_STATUS)&NV_PFIFO_CACHE1_STATUS_LOW_MARK_EMPTY); if (skip_waiting==0) { //wait while(VIDEOREG8(NV_PFIFO_CACHES)&NV_PFIFO_CACHES_DMA_SUSPEND_BUSY); VIDEOREG(NV_PFIFO_CACHE1_DMA_PUSH)&=NV_PFIFO_CACHE1_DMA_PUSH_STATUS_RUNNING; } } if (VIDEOREG(NV_PFIFO_INTR_0)==NV_PFIFO_INTR_0_NOT_PENDING) { VIDEOREG(NV_PFIFO_CACHE1_PULL0)=NV_PFIFO_CACHE1_PULL0_ACCESS_ENABLE; VIDEOREG(NV_PFIFO_CACHES)=NV_PFIFO_CACHES_REASSIGN_ENABLED; } return VIDEOREG(NV_PFIFO_INTR_0)|(VIDEOREG(NV_PFIFO_DEBUG_0)&NV_PFIFO_DEBUG_0_CACHE_ERROR0_PENDING); } static void pb_set_fifo_channel(int channel) { DWORD old_caches,old_push,old_pull,old_channel; DWORD *p; DWORD pending_flags; old_caches=VIDEOREG(NV_PFIFO_CACHES); old_push=VIDEOREG(NV_PFIFO_CACHE1_PUSH0); old_pull=VIDEOREG(NV_PFIFO_CACHE1_PULL0); 
VIDEOREG(NV_PFIFO_CACHES)=NV_PFIFO_CACHES_ALL_DISABLE; VIDEOREG(NV_PFIFO_CACHE1_PUSH0)=NV_PFIFO_CACHE1_PUSH0_ACCESS_DISABLE; VIDEOREG(NV_PFIFO_CACHE1_PULL0)=NV_PFIFO_CACHE1_PULL0_ACCESS_DISABLE; old_channel=VIDEOREG(NV_PFIFO_CACHE1_PUSH1)&NV_PFIFO_CACHE1_PUSH1_CHID; //backup old channel details into PRAMIN area p=(DWORD *)(VIDEO_BASE+pb_FifoFCAddr+old_channel*64); *(p+0)=VIDEOREG(NV_PFIFO_CACHE1_DMA_PUT); //&NV_PFIFO_CACHE1_DMA_PUT_OFFSET *(p+1)=VIDEOREG(NV_PFIFO_CACHE1_DMA_GET); //&NV_PFIFO_CACHE1_DMA_GET_OFFSET *(p+2)=VIDEOREG(NV_PFIFO_CACHE1_REF); //&NV_PFIFO_CACHE1_REF_CNT *(p+3)=VIDEOREG(NV_PFIFO_CACHE1_DMA_INSTANCE); //&NV_PFIFO_CACHE1_DMA_INSTANCE_ADDRESS *(p+4)=VIDEOREG(NV_PFIFO_CACHE1_DMA_STATE); *(p+5)=VIDEOREG(NV_PFIFO_CACHE1_DMA_FETCH); *(p+6)=VIDEOREG(NV_PFIFO_CACHE1_ENGINE); *(p+7)=VIDEOREG(NV_PFIFO_CACHE1_PULL1); *(p+8)=VIDEOREG(NV_PFIFO_CACHE1_ACQUIRE_2); //&NV_PFIFO_CACHE1_ACQUIRE_2_VALUE *(p+9)=VIDEOREG(NV_PFIFO_CACHE1_ACQUIRE_1); //&NV_PFIFO_CACHE1_ACQUIRE_1_TIMESTAMP *(p+10)=VIDEOREG(NV_PFIFO_CACHE1_ACQUIRE_0); //&NV_PFIFO_CACHE1_ACQUIRE_0_TIMEOUT *(p+11)=VIDEOREG(NV_PFIFO_CACHE1_SEMAPHORE); *(p+12)=VIDEOREG(NV_PFIFO_CACHE1_DMA_SUBROUTINE); if (VIDEOREG(NV_PFIFO_CACHE1_PUSH1)&NV_PFIFO_CACHE1_PUSH1_MODE_DMA) { pending_flags=VIDEOREG(NV_PFIFO_DMA); pending_flags&=~(1<>2)|0x80000000; if (tile_flags&0x04000000) EncodedZStartTag|=0x04000000; //points tile Zcomp in NV_PFB pZcomp=(DWORD *)(VIDEO_BASE+NV_PFB_ZCOMP+tile_index*4); //points tile Zcomp in NV_PGRAPH p=(DWORD *)(VIDEO_BASE+NV_PGRAPH_ZCOMP_XBOX+tile_index*4); //points tile Zcomp in NV_PGRAPH_RDI(0x90) addr90=((tile_index*4+0x90)&NV_PGRAPH_RDI_INDEX_ADDRESS)|((0xEA<<16)&NV_PGRAPH_RDI_INDEX_SELECT); do { pb_wait_until_gr_not_busy(); *(pZcomp+0)=EncodedZStartTag; *(p+0)=EncodedZStartTag; VIDEOREG(NV_PGRAPH_RDI_INDEX)=addr90; VIDEOREG(NV_PGRAPH_RDI_DATA)=EncodedZStartTag; }while (*(pZcomp+0)!=*(p+0)); if (tile_z_offset) { EncodedZOffset=tile_z_offset|tile_index|0x80000000; do { 
pb_wait_until_gr_not_busy(); VIDEOREG(NV_PFB_ZCOMP_OFFSET)=EncodedZOffset; VIDEOREG(NV_PGRAPH_ZCOMP_OFFSET_XBOX)=EncodedZOffset; }while(VIDEOREG(NV_PFB_ZCOMP_OFFSET)!=VIDEOREG(NV_PGRAPH_ZCOMP_OFFSET_XBOX)); } } VIDEOREG(NV_PFIFO_CACHE1_DMA_PUSH)=old_dma_push; } static void pb_prepare_tiles(void) { DWORD *pTile; DWORD *pTlimit; DWORD *pTsize; DWORD *pZcomp; DWORD Tile; DWORD Tlimit; DWORD Tsize; DWORD Zcomp; DWORD Zcomp_offset; DWORD Config0; DWORD Config1; DWORD *p; int i; p=(DWORD *)(VIDEO_BASE+NV_PGRAPH_TILE_XBOX); pTlimit=(DWORD *)(VIDEO_BASE+NV_PFB_TLIMIT); pTsize=(DWORD *)(VIDEO_BASE+NV_PFB_TSIZE); pTile=(DWORD *)(VIDEO_BASE+NV_PFB_TILE); //Copy 8 Tiles details from NV_PFB to NV_PGRAPH and to NV_PGRAPH_RDI(0x10) for(i=0x10;i<0x30;i+=4) { Tile=*(pTile+0); *(p+0)=Tile; VIDEOREG(NV_PGRAPH_RDI_INDEX)=((i+0x00)&NV_PGRAPH_RDI_INDEX_ADDRESS)|((0xEA<<16)&NV_PGRAPH_RDI_INDEX_SELECT); VIDEOREG(NV_PGRAPH_RDI_DATA)=Tile; Tlimit=*(pTlimit+0); *(p+1)=Tlimit; VIDEOREG(NV_PGRAPH_RDI_INDEX)=((i+0x20)&NV_PGRAPH_RDI_INDEX_ADDRESS)|((0xEA<<16)&NV_PGRAPH_RDI_INDEX_SELECT); VIDEOREG(NV_PGRAPH_RDI_DATA)=Tlimit; Tsize=*(pTsize+0); *(p+2)=Tsize; VIDEOREG(NV_PGRAPH_RDI_INDEX)=((i+0x40)&NV_PGRAPH_RDI_INDEX_ADDRESS)|((0xEA<<16)&NV_PGRAPH_RDI_INDEX_SELECT); VIDEOREG(NV_PGRAPH_RDI_DATA)=Tsize; p+=4; //move 16 bytes forward pTile+=4; pTlimit+=4; pTsize+=4; } p=(DWORD *)(VIDEO_BASE+NV_PGRAPH_ZCOMP_XBOX); pZcomp=(DWORD *)(VIDEO_BASE+NV_PFB_ZCOMP); //Copy 8 Tiles Zcomp from NV_PFB to NV_PGRAPH and to NV_PGRAPH_RDI(0x90) for(i=0x90;i<0x110;i+=4) { Zcomp=*(pZcomp+0); *(p+0)=Zcomp; VIDEOREG(NV_PGRAPH_RDI_INDEX)=((i+0x00)&NV_PGRAPH_RDI_INDEX_ADDRESS)|((0xEA<<16)&NV_PGRAPH_RDI_INDEX_SELECT); VIDEOREG(NV_PGRAPH_RDI_DATA)=Tsize; p++; //move 4 bytes forward pZcomp++; } //Copy 3 parameters from NV_PFB to NV_PGRAPH and to NV_PGRAPH_RDI(sel 0xEA : 0xC, 0 & 4) Zcomp_offset=VIDEOREG(NV_PFB_ZCOMP_OFFSET); VIDEOREG(NV_PGRAPH_ZCOMP_OFFSET_XBOX)=Zcomp_offset; 
VIDEOREG(NV_PGRAPH_RDI_INDEX)=((0x0C)&NV_PGRAPH_RDI_INDEX_ADDRESS)|((0xEA<<16)&NV_PGRAPH_RDI_INDEX_SELECT); VIDEOREG(NV_PGRAPH_RDI_DATA)=Zcomp_offset; Config0=VIDEOREG(NV_PFB_CFG0); VIDEOREG(NV_PGRAPH_CFG0_XBOX)=Config0; VIDEOREG(NV_PGRAPH_RDI_INDEX)=((0x00)&NV_PGRAPH_RDI_INDEX_ADDRESS)|((0xEA<<16)&NV_PGRAPH_RDI_INDEX_SELECT); VIDEOREG(NV_PGRAPH_RDI_DATA)=Config0; Config1=VIDEOREG(NV_PFB_CFG1); VIDEOREG(NV_PGRAPH_CFG1_XBOX)=Config1; VIDEOREG(NV_PGRAPH_RDI_INDEX)=((0x04)&NV_PGRAPH_RDI_INDEX_ADDRESS)|((0xEA<<16)&NV_PGRAPH_RDI_INDEX_SELECT); VIDEOREG(NV_PGRAPH_RDI_DATA)=Config1; } static void pb_create_dma_ctx( DWORD ChannelID, DWORD Class, DWORD Base, DWORD Limit, struct s_CtxDma *pDmaObject ) { DWORD Addr; DWORD AddrSpace; DWORD Inst; DWORD dma_flags; Addr=0; AddrSpace=0; if ((Base&0xF0000000)!=0x80000000) { Addr=Base; AddrSpace=ADDR_FBMEM; } else { Addr=Base&0x03FFFFFF; AddrSpace=ADDR_SYSMEM; } Inst=pb_FreeInst; pb_FreeInst+=1; //reserve 1 block (16 bytes) dma_flags=Class; dma_flags|=0x00003000; if (AddrSpace==ADDR_AGPMEM) dma_flags|=0x00030000; if (AddrSpace==ADDR_SYSMEM) dma_flags|=0x00020000; dma_flags|=0x00008000; VIDEOREG(NV_PRAMIN+(Inst<<4)+0x08)=Addr|3; //0x00000003|Addr VIDEOREG(NV_PRAMIN+(Inst<<4)+0x0C)=Addr|3; //0x00000003|Addr VIDEOREG(NV_PRAMIN+(Inst<<4)+0x00)=dma_flags; //0x???sB0cl ???=Addr&0xFFF VIDEOREG(NV_PRAMIN+(Inst<<4)+0x04)=Limit; //0x03FFAFFF (MAXRAM) memset(pDmaObject,0,sizeof(struct s_CtxDma)); pDmaObject->ChannelID=ChannelID; pDmaObject->Inst=Inst; pDmaObject->Class=Class; pDmaObject->isGr=0; } static void pb_bind_channel(struct s_CtxDma *pCtxDmaObject) { DWORD entry; DWORD *p; //entry in hash table entry=(((pCtxDmaObject->ChannelID>>11)^pCtxDmaObject->ChannelID)>>11)^pCtxDmaObject->ChannelID; //entry*8 max valid value is 0x1000 //points at entry in hash table (table element size is 8 bytes = 2 dwords) p=(DWORD *)(VIDEO_BASE+pb_FifoHTAddr+entry*8); *(p+0)= pCtxDmaObject->ChannelID; *(p+1)= (0x80000000)| (pb_FifoChannelID<<24)| 
(pCtxDmaObject->isGr<<16)| (pCtxDmaObject->Inst&0xFFFF); } static void pb_3D_init(void) { DWORD Inst; int channel; int i; DWORD offset; DWORD offset_cmn; DWORD offset_pipe; DWORD offset_4dwords; DWORD offset_20dwords; //Initialization of 3 big structures in PRAMIN area //At offset 0x0000 size=0x231C bytes=0x1A9C+0x0880 //At offset 0x231C size=0x0C00 bytes //At offset 0x2F1C size=0x0784 bytes //Padding 4 dwords (at offset 0x36A0 size=0x0010 bytes?) channel=pb_FifoChannelID; Inst=pb_GrCtxInst[channel]; VIDEOREG(NV_PRAMIN+(Inst<<4)+0x000)|=1; VIDEOREG(NV_PRAMIN+(Inst<<4)+0x33C)=0xFFFF0000; for(i=0x340;i<=0x39C;i+=4) VIDEOREG(NV_PRAMIN+(Inst<<4)+i)=0; VIDEOREG(NV_PRAMIN+(Inst<<4)+0x3A0)=0x0FFF0000; VIDEOREG(NV_PRAMIN+(Inst<<4)+0x3A4)=0x0FFF0000; for(i=0x3A8;i<=0x478;i+=4) VIDEOREG(NV_PRAMIN+(Inst<<4)+i)=0; VIDEOREG(NV_PRAMIN+(Inst<<4)+0x47C)=0x00000101; for(i=0x480;i<=0x48C;i+=4) VIDEOREG(NV_PRAMIN+(Inst<<4)+i)=0; VIDEOREG(NV_PRAMIN+(Inst<<4)+0x490)=0x00000111; for(i=0x494;i<=0x4A4;i+=4) VIDEOREG(NV_PRAMIN+(Inst<<4)+i)=0; VIDEOREG(NV_PRAMIN+(Inst<<4)+0x4A8)=0x44400000; for(i=0x4AC;i<=0x4D0;i+=4) VIDEOREG(NV_PRAMIN+(Inst<<4)+i)=0; for(i=0x4D4;i<=0x4E0;i+=4) VIDEOREG(NV_PRAMIN+(Inst<<4)+i)=0x00030303; for(i=0x4E4;i<=0x4F0;i+=4) VIDEOREG(NV_PRAMIN+(Inst<<4)+i)=0; for(i=0x4F4;i<=0x500;i+=4) VIDEOREG(NV_PRAMIN+(Inst<<4)+i)=0x00080000; for(i=0x504;i<=0x508;i+=4) VIDEOREG(NV_PRAMIN+(Inst<<4)+i)=0; for(i=0x50C;i<=0x518;i+=4) VIDEOREG(NV_PRAMIN+(Inst<<4)+i)=0x01012000; for(i=0x51C;i<=0x528;i+=4) VIDEOREG(NV_PRAMIN+(Inst<<4)+i)=0x000105B8; for(i=0x52C;i<=0x538;i+=4) VIDEOREG(NV_PRAMIN+(Inst<<4)+i)=0x00080008; for(i=0x53C;i<=0x558;i+=4) VIDEOREG(NV_PRAMIN+(Inst<<4)+i)=0; for(i=0x55C;i<=0x578;i+=4) VIDEOREG(NV_PRAMIN+(Inst<<4)+i)=0x07FF0000; //8 dwords for(i=0x57C;i<=0x598;i+=4) VIDEOREG(NV_PRAMIN+(Inst<<4)+i)=0x07FF0000; //8 dwords for(i=0x59C;i<=0x5A0;i+=4) VIDEOREG(NV_PRAMIN+(Inst<<4)+i)=0; VIDEOREG(NV_PRAMIN+(Inst<<4)+0x5A4)=0x4B7FFFFF; for(i=0x5A8;i<=0x5F8;i+=4) 
VIDEOREG(NV_PRAMIN+(Inst<<4)+i)=0; VIDEOREG(NV_PRAMIN+(Inst<<4)+0x5FC)=0x00000001; VIDEOREG(NV_PRAMIN+(Inst<<4)+0x600)=0; VIDEOREG(NV_PRAMIN+(Inst<<4)+0x604)=0x00004000; for(i=0x608;i<=0x60C;i+=4) VIDEOREG(NV_PRAMIN+(Inst<<4)+i)=0; VIDEOREG(NV_PRAMIN+(Inst<<4)+0x610)=0x00000001; VIDEOREG(NV_PRAMIN+(Inst<<4)+0x614)=0; VIDEOREG(NV_PRAMIN+(Inst<<4)+0x618)=0x00040000; VIDEOREG(NV_PRAMIN+(Inst<<4)+0x61C)=0x00010000; for(i=0x620;i<=0x628;i+=4) VIDEOREG(NV_PRAMIN+(Inst<<4)+i)=0; for(i=0x62C;i<=0x6B4;i+=4) VIDEOREG(NV_PRAMIN+(Inst<<4)+i)=0; //35 dwords for(i=0x6B8;i<=0x728;i+=4) VIDEOREG(NV_PRAMIN+(Inst<<4)+i)=0; //26 dwords for(i=0x72C;i<=0x79C;i+=4) VIDEOREG(NV_PRAMIN+(Inst<<4)+i)=0; //26 dwords for(i=0x7A0;i<=0x810;i+=4) VIDEOREG(NV_PRAMIN+(Inst<<4)+i)=0; //26 dwords for(i=0x814;i<=0x818;i+=4) VIDEOREG(NV_PRAMIN+(Inst<<4)+i)=0; //2 dwords for(i=0x81C;i<=0xA18;i+=4) VIDEOREG(NV_PRAMIN+(Inst<<4)+i)=0; //128 dwords for(i=0xA1C;i<=0xC18;i+=4) VIDEOREG(NV_PRAMIN+(Inst<<4)+i)=0; //128 dwords for(i=0xC1C;i<=0xE18;i+=4) VIDEOREG(NV_PRAMIN+(Inst<<4)+i)=0; //128 dwords for(i=0xE1C;i<=0x1018;i+=4) VIDEOREG(NV_PRAMIN+(Inst<<4)+i)=0; //128 dwords for(i=0x101C;i<=0x1318;i+=4) VIDEOREG(NV_PRAMIN+(Inst<<4)+i)=0; //192 dwords for(i=0x131C;i<=0x1A98;i+=4) VIDEOREG(NV_PRAMIN+(Inst<<4)+i)=0; //224 dwords offset=0x1A9C/4; //number of dwords initialized so far = 0x6A7 for(i=0;i<0x88;i++) //136 blocks (unit=16 bytes=4 dwords) { VIDEOREG(NV_PRAMIN+(Inst<<4)+offset*4+i*16+0x00)=0x10700FF9; VIDEOREG(NV_PRAMIN+(Inst<<4)+offset*4+i*16+0x04)=0x0436086C; VIDEOREG(NV_PRAMIN+(Inst<<4)+offset*4+i*16+0x08)=0x000C001B; VIDEOREG(NV_PRAMIN+(Inst<<4)+offset*4+i*16+0x0C)=0; offset+=4; } offset_cmn=offset; //0x231C/4 for(i=0;i<0x300;i++) VIDEOREG(NV_PRAMIN+(Inst<<4)+offset*4+i*4)=0;//768 dwords offset+=0x300; //0xC00 bytes offset_pipe=offset; //0x2F1C/4 for(i=0;i<0x68;i++) VIDEOREG(NV_PRAMIN+(Inst<<4)+offset*4+i*4)=0;//104 dwords offset+=0x68; for(i=0;i<0xD0;i++) 
VIDEOREG(NV_PRAMIN+(Inst<<4)+offset*4+i*4)=0;//208 dwords offset+=0xD0; offset_4dwords=offset; for(i=0;i<0x04;i++) VIDEOREG(NV_PRAMIN+(Inst<<4)+offset*4+i*4)=0;//004 dwords offset+=0x04; offset_20dwords=offset; for(i=0;i<0x14;i++) VIDEOREG(NV_PRAMIN+(Inst<<4)+offset*4+i*4)=0;//020 dwords offset+=0x14; for(i=0;i<0x0F;i++) VIDEOREG(NV_PRAMIN+(Inst<<4)+offset*4+i*4)=0;//015 dwords offset+=0x0F; for(i=0;i<0x0E;i++) VIDEOREG(NV_PRAMIN+(Inst<<4)+offset*4+i*4)=0;//014 dwords offset+=0x0E; for(i=0;i<0x44;i++) VIDEOREG(NV_PRAMIN+(Inst<<4)+offset*4+i*4)=0;//068 dwords offset+=0x44; for(i=0;i<0x20;i++) VIDEOREG(NV_PRAMIN+(Inst<<4)+offset*4+i*4)=0;//032 dwords offset+=0x20; for(i=0;i<0x0F;i++) VIDEOREG(NV_PRAMIN+(Inst<<4)+offset*4+i*4)=0;//015 dwords offset+=0x0F; //total: +0x1E0 //theoretically, offset=0x369C/4=0xDA7 VIDEOREG(NV_PRAMIN+(Inst<<4)+offset*4)=0; offset++; //total: +0x1E1 //theoretically, offset=0x36A0/4=0xDA8 //Padding : 4 dwords? //total: +0x1E5 //theoretically, offset=0x36B0/4=0xDAC #ifdef DBG if (offset+4!=0x36B0/4) debugPrint("pb_3D_init: bad final value for offset\n"); #endif //floating point post-initializations in cmn structure VIDEOREG(NV_PRAMIN+(Inst<<4)+offset_cmn*4+0x380)=0x3F800000; //1.0f VIDEOREG(NV_PRAMIN+(Inst<<4)+offset_cmn*4+0x384)=0x00000000; //0.0f VIDEOREG(NV_PRAMIN+(Inst<<4)+offset_cmn*4+0x388)=0x00000000; //0.0f VIDEOREG(NV_PRAMIN+(Inst<<4)+offset_cmn*4+0x38C)=0x00000000; //0.0f VIDEOREG(NV_PRAMIN+(Inst<<4)+offset_cmn*4+0x3C0)=0x40000000; //2.0f VIDEOREG(NV_PRAMIN+(Inst<<4)+offset_cmn*4+0x3C4)=0x3F800000; //1.0f VIDEOREG(NV_PRAMIN+(Inst<<4)+offset_cmn*4+0x3C8)=0x3F000000; //0.5f VIDEOREG(NV_PRAMIN+(Inst<<4)+offset_cmn*4+0x3CC)=0x00000000; //0.0f VIDEOREG(NV_PRAMIN+(Inst<<4)+offset_cmn*4+0x3D0)=0x40000000; //2.0f VIDEOREG(NV_PRAMIN+(Inst<<4)+offset_cmn*4+0x3D4)=0x3F800000; //1.0f VIDEOREG(NV_PRAMIN+(Inst<<4)+offset_cmn*4+0x3D8)=0x00000000; //0.0f VIDEOREG(NV_PRAMIN+(Inst<<4)+offset_cmn*4+0x3DC)=0xBF800000; //-1.0f 
VIDEOREG(NV_PRAMIN+(Inst<<4)+offset_cmn*4+0x3E0)=0x00000000; //0.0f VIDEOREG(NV_PRAMIN+(Inst<<4)+offset_cmn*4+0x3E4)=0xBF800000; //-1.0f VIDEOREG(NV_PRAMIN+(Inst<<4)+offset_cmn*4+0x3E8)=0x00000000; //0.0f VIDEOREG(NV_PRAMIN+(Inst<<4)+offset_cmn*4+0x3EC)=0x00000000; //0.0f VIDEOREG(NV_PRAMIN+(Inst<<4)+offset_cmn*4+0x390)=0x00000000; //0.0f VIDEOREG(NV_PRAMIN+(Inst<<4)+offset_cmn*4+0x394)=0x3F800000; //1.0f VIDEOREG(NV_PRAMIN+(Inst<<4)+offset_cmn*4+0x398)=0x00000000; //0.0f VIDEOREG(NV_PRAMIN+(Inst<<4)+offset_cmn*4+0x39C)=0x00000000; //0.0f VIDEOREG(NV_PRAMIN+(Inst<<4)+offset_cmn*4+0x3F0)=0x00000000; //0.0f VIDEOREG(NV_PRAMIN+(Inst<<4)+offset_cmn*4+0x3F4)=0x00000000; //0.0f VIDEOREG(NV_PRAMIN+(Inst<<4)+offset_cmn*4+0x3F8)=0x00000000; //0.0f VIDEOREG(NV_PRAMIN+(Inst<<4)+offset_cmn*4+0x3FC)=0x00000000; //0.0f //post-initializations in pipe structure VIDEOREG(NV_PRAMIN+(Inst<<4)+offset_pipe*4+0x160)=0; VIDEOREG(NV_PRAMIN+(Inst<<4)+offset_pipe*4+0x164)=0; VIDEOREG(NV_PRAMIN+(Inst<<4)+offset_pipe*4+0x168)=0; VIDEOREG(NV_PRAMIN+(Inst<<4)+offset_pipe*4+0x16C)=0; VIDEOREG(NV_PRAMIN+(Inst<<4)+offset_pipe*4+0x100)=0; VIDEOREG(NV_PRAMIN+(Inst<<4)+offset_pipe*4+0x104)=0; VIDEOREG(NV_PRAMIN+(Inst<<4)+offset_pipe*4+0x108)=0x000FE000; VIDEOREG(NV_PRAMIN+(Inst<<4)+offset_pipe*4+0x10C)=0; VIDEOREG(NV_PRAMIN+(Inst<<4)+offset_pipe*4+0x110)=0; VIDEOREG(NV_PRAMIN+(Inst<<4)+offset_pipe*4+0x114)=0; VIDEOREG(NV_PRAMIN+(Inst<<4)+offset_pipe*4+0x118)=0; VIDEOREG(NV_PRAMIN+(Inst<<4)+offset_pipe*4+0x11C)=0; VIDEOREG(NV_PRAMIN+(Inst<<4)+offset_pipe*4+0x130)=0; VIDEOREG(NV_PRAMIN+(Inst<<4)+offset_pipe*4+0x134)=0; VIDEOREG(NV_PRAMIN+(Inst<<4)+offset_pipe*4+0x138)=0; VIDEOREG(NV_PRAMIN+(Inst<<4)+offset_pipe*4+0x13C)=0; VIDEOREG(NV_PRAMIN+(Inst<<4)+offset_pipe*4+0x180)=0; VIDEOREG(NV_PRAMIN+(Inst<<4)+offset_pipe*4+0x184)=0x000003F8; VIDEOREG(NV_PRAMIN+(Inst<<4)+offset_pipe*4+0x188)=0; VIDEOREG(NV_PRAMIN+(Inst<<4)+offset_pipe*4+0x18C)=0; VIDEOREG(NV_PRAMIN+(Inst<<4)+offset_4dwords*4)=0x002FE000; 
VIDEOREG(NV_PRAMIN+(Inst<<4)+offset_20dwords*4+0x010)=0x001C527C; VIDEOREG(NV_PRAMIN+(Inst<<4)+offset_20dwords*4+0x014)=0x001C527C; VIDEOREG(NV_PRAMIN+(Inst<<4)+offset_20dwords*4+0x018)=0x001C527C; VIDEOREG(NV_PRAMIN+(Inst<<4)+offset_20dwords*4+0x01C)=0x001C527C; VIDEOREG(NV_PRAMIN+(Inst<<4)+offset_20dwords*4+0x020)=0x001C527C; VIDEOREG(NV_PRAMIN+(Inst<<4)+offset_20dwords*4+0x024)=0x001C527C; VIDEOREG(NV_PRAMIN+(Inst<<4)+offset_20dwords*4+0x028)=0x001C527C; VIDEOREG(NV_PRAMIN+(Inst<<4)+offset_20dwords*4+0x02C)=0x001C527C; #ifdef DBG //at this point pb_GrCtxID and pb_FifoChannelID must be different //debugPrint("pb_3D_init: gr=%d fifo=%d\n",pb_GrCtxID,pb_FifoChannelID); #endif } static void pb_create_gr_ctx( int ChannelID, int Class, struct s_CtxDma *pGrObject ) { DWORD flags; DWORD flags3D; int size; DWORD Inst; flags3D=0; if ( (Class!=GR_CLASS_30)&& (Class!=GR_CLASS_39)&& (Class!=GR_CLASS_62)&& (Class!=GR_CLASS_97)&& (Class!=GR_CLASS_9F) ) { //"CreateGrObject invalid class number" size=Class; } else { size=16; //16 bytes if (Class==GR_CLASS_97) { size=0x330; //816 bytes flags3D=1; } } Inst=pb_FreeInst; pb_FreeInst+=(size>>4); if (flags3D) { pb_3DGrCtxInst[pb_FifoChannelID]=Inst; pb_3D_init(); } flags=Class&0x000000FF; flags3D=0x00000000; if (Class==GR_CLASS_39) flags|=0x01000000; if (Class==GR_CLASS_97) flags3D=0x00000A00; VIDEOREG(NV_PRAMIN+(Inst<<4)+0x00)=flags; VIDEOREG(NV_PRAMIN+(Inst<<4)+0x04)=flags3D; VIDEOREG(NV_PRAMIN+(Inst<<4)+0x08)=0; VIDEOREG(NV_PRAMIN+(Inst<<4)+0x0C)=0; memset(pGrObject,0,sizeof(struct s_CtxDma)); pGrObject->ChannelID=ChannelID; pGrObject->Class=Class; pGrObject->isGr=1; pGrObject->Inst=Inst; } static void pb_start(void) { if (pb_disable_gpu==0) //do we really want to send data to GPU? 
{ //asks push buffer Dma engine to detect incoming Dma data (written at pb_Put) pb_cache_flush(); *(pb_DmaUserAddr+0x40/4)=((DWORD)pb_Put)&0x03FFFFFF; //from now any write will be detected #ifdef DBG if ((*(pb_DmaUserAddr+0x44/4))>0x04000000) { debugPrint("pb_start: wrong GetAddr\n"); return; } #endif } } static void pb_jump_to_head(void) { //Have Dma engine pointer point at push buffer head again. //(so we don't run into the tail of push buffer) //The best method would be to call this once per frame since it costs time. //Of course, avoid writing more data than push buffer size in 1 frame time. //If it happens you will get a message suggesting to call pb_reset more often //or to enlarge push buffer (with pb_size, before calling pb_init). //Default size is 512Kb (128*1024 dwords) DWORD *pGetAddr; DWORD TimeStampTicks; #ifdef DBG if (pb_BeginEndPair) { debugPrint("pb_reset musn't be called inside a begin-end block.\n"); return; } #endif //writes a jump command //forces GPU to jump at push buffer head address at next fetch *(pb_Put+0)=1+(((DWORD)pb_Head)&0x0FFFFFFF); pb_Put=pb_Head; pb_start(); TimeStampTicks=KeTickCount; //wait for arrival of Gpu Get to push buffer head do { if ((*(pb_DmaUserAddr+0x44/4))>0x04000000) { #ifdef DBG debugPrint("pb_reset: bad getaddr\n"); #endif return; } if (KeTickCount-TimeStampTicks>TICKSTIMEOUT) { debugPrint("pb_reset: too long\n"); break; } //converts physical address into virtual address pGetAddr=(DWORD *)((*(pb_DmaUserAddr+0x44/4))|0x80000000); }while (pGetAddr!=pb_Head); } //public functions int pb_busy(void) { DWORD PutAddr; DWORD GetAddr; GetAddr=*(pb_DmaUserAddr+0x44/4); #ifdef DBG if (GetAddr>0x04000000) { debugPrint("pb_busy: wrong GetAddr\n"); return 0; } #endif PutAddr=(DWORD)pb_Put; if ((GetAddr^PutAddr)&0x0FFFFFFF) return 1; //means different addresses if (VIDEOREG(NV_PGRAPH_STATUS)) return 1; return 0; } DWORD pb_back_buffer_width(void) { return pb_FrameBuffersWidth; } DWORD pb_back_buffer_height(void) { return 
pb_FrameBuffersHeight; } DWORD pb_back_buffer_pitch(void) { return pb_FrameBuffersPitch; } DWORD *pb_back_buffer(void) { return (DWORD *)pb_FBAddr[pb_back_index]; } DWORD *pb_extra_buffer(int index_buffer) { if (index_buffer>pb_ExtraBuffersCount) { debugPrint("pb_target_extra_buffer: buffer index out of range\n"); return pb_back_buffer(); } return (DWORD *)pb_EXAddr[index_buffer]; } void pb_target_back_buffer(void) { DWORD *p; DWORD width; DWORD height; DWORD pitch; DWORD pitch_depth_stencil; DWORD dma_flags; DWORD dma_addr; DWORD dma_limit; int flag; int depth_stencil; width=pb_FrameBuffersWidth; height=pb_FrameBuffersHeight; pitch=pb_FrameBuffersPitch; pitch_depth_stencil=pb_DepthStencilPitch; //DMA channel 9 is used by GPU in order to render pixels dma_addr=pb_FBAddr[pb_back_index]&0x03FFFFFF; dma_limit=height*pitch-1; //(last byte) dma_flags=DMA_CLASS_3D|0x0000B000; dma_addr|=3; p=pb_begin(); pb_push1(p,NV20_TCL_PRIMITIVE_3D_WAIT_MAKESPACE,0); p+=2; pb_push2(p,NV20_TCL_PRIMITIVE_3D_PARAMETER_A,NV_PRAMIN+(pb_DmaChID9Inst<<4)+0x08,dma_addr); p+=3; //set params addr,data pb_push1(p,NV20_TCL_PRIMITIVE_3D_FIRE_INTERRUPT,PB_SETOUTER); p+=2; //calls subprogID PB_SETOUTER: does VIDEOREG(addr)=data pb_push2(p,NV20_TCL_PRIMITIVE_3D_PARAMETER_A,NV_PRAMIN+(pb_DmaChID9Inst<<4)+0x0C,dma_addr); p+=3; pb_push1(p,NV20_TCL_PRIMITIVE_3D_FIRE_INTERRUPT,PB_SETOUTER); p+=2; pb_push2(p,NV20_TCL_PRIMITIVE_3D_PARAMETER_A,NV_PRAMIN+(pb_DmaChID9Inst<<4)+0x00,dma_flags); p+=3; pb_push1(p,NV20_TCL_PRIMITIVE_3D_FIRE_INTERRUPT,PB_SETOUTER); p+=2; pb_push2(p,NV20_TCL_PRIMITIVE_3D_PARAMETER_A,NV_PRAMIN+(pb_DmaChID9Inst<<4)+0x04,dma_limit); p+=3; pb_push1(p,NV20_TCL_PRIMITIVE_3D_FIRE_INTERRUPT,PB_SETOUTER); p+=2; pb_push1(p,NV20_TCL_PRIMITIVE_3D_SET_OBJECT3,9); p+=2; pb_end(p); //DMA channel 11 is used by GPU in order to bitblt images dma_addr=pb_FBAddr[pb_back_index]&0x03FFFFFF; dma_limit=height*pitch-1; //(last byte) dma_flags=DMA_CLASS_3D|0x0000B000; dma_addr|=3; p=pb_begin(); 
pb_push1(p,NV20_TCL_PRIMITIVE_3D_WAIT_MAKESPACE,0); p+=2; pb_push2(p,NV20_TCL_PRIMITIVE_3D_PARAMETER_A,NV_PRAMIN+(pb_DmaChID11Inst<<4)+0x08,dma_addr); p+=3; //set params addr,data pb_push1(p,NV20_TCL_PRIMITIVE_3D_FIRE_INTERRUPT,PB_SETOUTER); p+=2; //calls subprogID PB_SETOUTER: does VIDEOREG(addr)=data pb_push2(p,NV20_TCL_PRIMITIVE_3D_PARAMETER_A,NV_PRAMIN+(pb_DmaChID11Inst<<4)+0x0C,dma_addr); p+=3; pb_push1(p,NV20_TCL_PRIMITIVE_3D_FIRE_INTERRUPT,PB_SETOUTER); p+=2; pb_push2(p,NV20_TCL_PRIMITIVE_3D_PARAMETER_A,NV_PRAMIN+(pb_DmaChID11Inst<<4)+0x00,dma_flags); p+=3; pb_push1(p,NV20_TCL_PRIMITIVE_3D_FIRE_INTERRUPT,PB_SETOUTER); p+=2; pb_push2(p,NV20_TCL_PRIMITIVE_3D_PARAMETER_A,NV_PRAMIN+(pb_DmaChID11Inst<<4)+0x04,dma_limit); p+=3; pb_push1(p,NV20_TCL_PRIMITIVE_3D_FIRE_INTERRUPT,PB_SETOUTER); p+=2; pb_push1to(SUBCH_4,p,NV20_TCL_PRIMITIVE_3D_SET_OBJECT2,11); p+=2; pb_end(p); depth_stencil=1; if (depth_stencil!=-1) //don't care if (pb_DepthStencilLast!=depth_stencil) //changed? { //DMA channel 10 is used by GPU in order to render depth stencil if (depth_stencil) { dma_addr=pb_DSAddr&0x03FFFFFF; dma_limit=height*pitch_depth_stencil-1; //(last byte) dma_flags=DMA_CLASS_3D|0x0000B000; dma_addr|=3; flag=1; } else { dma_addr=0; dma_limit=0; dma_flags=DMA_CLASS_3D|0x0000B000; dma_addr|=3; flag=0; pitch_depth_stencil=pitch; } p=pb_begin(); pb_push1(p,NV20_TCL_PRIMITIVE_3D_WAIT_MAKESPACE,0); p+=2; pb_push2(p,NV20_TCL_PRIMITIVE_3D_PARAMETER_A,NV_PRAMIN+(pb_DmaChID10Inst<<4)+0x08,dma_addr); p+=3; //set params addr,data pb_push1(p,NV20_TCL_PRIMITIVE_3D_FIRE_INTERRUPT,PB_SETOUTER); p+=2; //calls subprogID PB_SETOUTER: does VIDEOREG(addr)=data pb_push2(p,NV20_TCL_PRIMITIVE_3D_PARAMETER_A,NV_PRAMIN+(pb_DmaChID10Inst<<4)+0x0C,dma_addr); p+=3; pb_push1(p,NV20_TCL_PRIMITIVE_3D_FIRE_INTERRUPT,PB_SETOUTER); p+=2; pb_push2(p,NV20_TCL_PRIMITIVE_3D_PARAMETER_A,NV_PRAMIN+(pb_DmaChID10Inst<<4)+0x00,dma_flags); p+=3; pb_push1(p,NV20_TCL_PRIMITIVE_3D_FIRE_INTERRUPT,PB_SETOUTER); p+=2; 
pb_push2(p,NV20_TCL_PRIMITIVE_3D_PARAMETER_A,NV_PRAMIN+(pb_DmaChID10Inst<<4)+0x04,dma_limit); p+=3; pb_push1(p,NV20_TCL_PRIMITIVE_3D_FIRE_INTERRUPT,PB_SETOUTER); p+=2; pb_push1(p,NV20_TCL_PRIMITIVE_3D_SET_OBJECT4,10); p+=2; pb_push1(p,NV20_TCL_PRIMITIVE_3D_DEPTH_TEST_ENABLE,flag); p+=2; //ZEnable=TRUE or FALSE (But don't use W, see below) pb_push1(p,NV20_TCL_PRIMITIVE_3D_STENCIL_TEST_ENABLE,1); p+=2; //StencilEnable=TRUE pb_end(p); pb_DepthStencilLast=depth_stencil; } p=pb_begin(); pb_push3(p,NV20_TCL_PRIMITIVE_3D_BUFFER_PITCH,(pitch_depth_stencil<<16)|(pitch&0xFFFF),0,0); p+=4; pb_push2(p,NV20_TCL_PRIMITIVE_3D_VIEWPORT_HORIZ,width<<16,height<<16); p+=3; //Default (0x00100001) //We use W (0x00010000) //We don't enable YUV (0x10000000) //We don't use floating point depth (0x00001000) pb_push1(p,NV20_TCL_PRIMITIVE_3D_W_YUV_FPZ_FLAGS,0x00110001); p+=2; pb_push1(p,NV20_TCL_PRIMITIVE_3D_BUFFER_FORMAT,pb_GPUFrameBuffersFormat|pb_FBVFlag); p+=2; pb_end(p); } void pb_target_extra_buffer(int index_buffer) { DWORD *p; DWORD width; DWORD height; DWORD pitch; DWORD pitch_depth_stencil; DWORD dma_flags; DWORD dma_addr; DWORD dma_limit; int flag; int depth_stencil; if (index_buffer>=pb_ExtraBuffersCount) { debugPrint("pb_target_extra_buffer: buffer index out of range\n"); return; } width=pb_FrameBuffersWidth; height=pb_FrameBuffersHeight; pitch=pb_FrameBuffersPitch; pitch_depth_stencil=pb_DepthStencilPitch; //DMA channel 9 is used by GPU in order to render pixels dma_addr=pb_EXAddr[index_buffer]&0x03FFFFFF; dma_limit=height*pitch-1; //(last byte) dma_flags=DMA_CLASS_3D|0x0000B000; dma_addr|=3; p=pb_begin(); pb_push1(p,NV20_TCL_PRIMITIVE_3D_WAIT_MAKESPACE,0); p+=2; pb_push2(p,NV20_TCL_PRIMITIVE_3D_PARAMETER_A,NV_PRAMIN+(pb_DmaChID9Inst<<4)+0x08,dma_addr); p+=3; //set params addr,data pb_push1(p,NV20_TCL_PRIMITIVE_3D_FIRE_INTERRUPT,PB_SETOUTER); p+=2; //calls subprogID PB_SETOUTER: does VIDEOREG(addr)=data 
pb_push2(p,NV20_TCL_PRIMITIVE_3D_PARAMETER_A,NV_PRAMIN+(pb_DmaChID9Inst<<4)+0x0C,dma_addr); p+=3; pb_push1(p,NV20_TCL_PRIMITIVE_3D_FIRE_INTERRUPT,PB_SETOUTER); p+=2; pb_push2(p,NV20_TCL_PRIMITIVE_3D_PARAMETER_A,NV_PRAMIN+(pb_DmaChID9Inst<<4)+0x00,dma_flags); p+=3; pb_push1(p,NV20_TCL_PRIMITIVE_3D_FIRE_INTERRUPT,PB_SETOUTER); p+=2; pb_push2(p,NV20_TCL_PRIMITIVE_3D_PARAMETER_A,NV_PRAMIN+(pb_DmaChID9Inst<<4)+0x04,dma_limit); p+=3; pb_push1(p,NV20_TCL_PRIMITIVE_3D_FIRE_INTERRUPT,PB_SETOUTER); p+=2; pb_push1(p,NV20_TCL_PRIMITIVE_3D_SET_OBJECT3,9); p+=2; pb_end(p); //DMA channel 11 is used by GPU in order to bitblt images dma_addr=pb_EXAddr[index_buffer]&0x03FFFFFF; dma_limit=height*pitch-1; //(last byte) dma_flags=DMA_CLASS_3D|0x0000B000; dma_addr|=3; p=pb_begin(); pb_push1(p,NV20_TCL_PRIMITIVE_3D_WAIT_MAKESPACE,0); p+=2; pb_push2(p,NV20_TCL_PRIMITIVE_3D_PARAMETER_A,NV_PRAMIN+(pb_DmaChID11Inst<<4)+0x08,dma_addr); p+=3; //set params addr,data pb_push1(p,NV20_TCL_PRIMITIVE_3D_FIRE_INTERRUPT,PB_SETOUTER); p+=2; //calls subprogID PB_SETOUTER: does VIDEOREG(addr)=data pb_push2(p,NV20_TCL_PRIMITIVE_3D_PARAMETER_A,NV_PRAMIN+(pb_DmaChID11Inst<<4)+0x0C,dma_addr); p+=3; pb_push1(p,NV20_TCL_PRIMITIVE_3D_FIRE_INTERRUPT,PB_SETOUTER); p+=2; pb_push2(p,NV20_TCL_PRIMITIVE_3D_PARAMETER_A,NV_PRAMIN+(pb_DmaChID11Inst<<4)+0x00,dma_flags); p+=3; pb_push1(p,NV20_TCL_PRIMITIVE_3D_FIRE_INTERRUPT,PB_SETOUTER); p+=2; pb_push2(p,NV20_TCL_PRIMITIVE_3D_PARAMETER_A,NV_PRAMIN+(pb_DmaChID11Inst<<4)+0x04,dma_limit); p+=3; pb_push1(p,NV20_TCL_PRIMITIVE_3D_FIRE_INTERRUPT,PB_SETOUTER); p+=2; pb_push1to(SUBCH_4,p,NV20_TCL_PRIMITIVE_3D_SET_OBJECT2,11); p+=2; pb_end(p); depth_stencil=1; if (depth_stencil!=-1) //don't care if (pb_DepthStencilLast!=depth_stencil) //changed? 
{ //DMA channel 10 is used by GPU in order to render depth stencil if (depth_stencil) { dma_addr=pb_DSAddr&0x03FFFFFF; dma_limit=height*pitch_depth_stencil-1; //(last byte) dma_flags=DMA_CLASS_3D|0x0000B000; dma_addr|=3; flag=1; } else { dma_addr=0; dma_limit=0; dma_flags=DMA_CLASS_3D|0x0000B000; dma_addr|=3; flag=0; pitch_depth_stencil=pitch; } p=pb_begin(); pb_push1(p,NV20_TCL_PRIMITIVE_3D_WAIT_MAKESPACE,0); p+=2; pb_push2(p,NV20_TCL_PRIMITIVE_3D_PARAMETER_A,NV_PRAMIN+(pb_DmaChID10Inst<<4)+0x08,dma_addr); p+=3; //set params addr,data pb_push1(p,NV20_TCL_PRIMITIVE_3D_FIRE_INTERRUPT,PB_SETOUTER); p+=2; //calls subprogID PB_SETOUTER: does VIDEOREG(addr)=data pb_push2(p,NV20_TCL_PRIMITIVE_3D_PARAMETER_A,NV_PRAMIN+(pb_DmaChID10Inst<<4)+0x0C,dma_addr); p+=3; pb_push1(p,NV20_TCL_PRIMITIVE_3D_FIRE_INTERRUPT,PB_SETOUTER); p+=2; pb_push2(p,NV20_TCL_PRIMITIVE_3D_PARAMETER_A,NV_PRAMIN+(pb_DmaChID10Inst<<4)+0x00,dma_flags); p+=3; pb_push1(p,NV20_TCL_PRIMITIVE_3D_FIRE_INTERRUPT,PB_SETOUTER); p+=2; pb_push2(p,NV20_TCL_PRIMITIVE_3D_PARAMETER_A,NV_PRAMIN+(pb_DmaChID10Inst<<4)+0x04,dma_limit); p+=3; pb_push1(p,NV20_TCL_PRIMITIVE_3D_FIRE_INTERRUPT,PB_SETOUTER); p+=2; pb_push1(p,NV20_TCL_PRIMITIVE_3D_SET_OBJECT4,10); p+=2; pb_push1(p,NV20_TCL_PRIMITIVE_3D_DEPTH_TEST_ENABLE,flag); p+=2; //ZEnable=TRUE or FALSE (But don't use W, see below) pb_push1(p,NV20_TCL_PRIMITIVE_3D_STENCIL_TEST_ENABLE,1); p+=2; //StencilEnable=TRUE pb_end(p); pb_DepthStencilLast=depth_stencil; } p=pb_begin(); pb_push3(p,NV20_TCL_PRIMITIVE_3D_BUFFER_PITCH,(pitch_depth_stencil<<16)|(pitch&0xFFFF),0,0); p+=4; pb_push2(p,NV20_TCL_PRIMITIVE_3D_VIEWPORT_HORIZ,width<<16,height<<16); p+=3; //Default (0x00100001) //We use W (0x00010000) //We don't enable YUV (0x10000000) //We don't use floating point depth (0x00001000) pb_push1(p,NV20_TCL_PRIMITIVE_3D_W_YUV_FPZ_FLAGS,0x00110001); p+=2; pb_push1(p,NV20_TCL_PRIMITIVE_3D_BUFFER_FORMAT,pb_GPUFrameBuffersFormat|pb_FBVFlag); p+=2; pb_end(p); } DWORD pb_get_vbl_counter(void) { 
return pb_vbl_counter; //allows caller to know if a frame has been missed } DWORD pb_wait_for_vbl(void) { NtWaitForSingleObject(pb_VBlankEvent, FALSE, NULL); return pb_vbl_counter; //allows caller to know if a frame has been missed } void pb_print(char *format, ...) { char buffer[512]; int i; va_list argList; va_start(argList, format); vsprintf(buffer, format, argList); va_end(argList); for(i=0;i=0)&&(row=0)&&(col>=1) if (systemFont[c*8+l]&m) { if (x1>=0) x2=20+j*10+k; else x1=20+j*10+k; } else { if (x2>=0) { y=25+i*25+l*2; pb_fill(x1,y,x2-x1+1,2,0xFFFFFF); x1=x2=-1; } else if (x1>=0) { y=25+i*25+l*2; pb_fill(x1,y,1,2,0xFFFFFF); x1=-1; } } } } } void pb_extra_buffers(int n) { if (n>MAX_EXTRA_BUFFERS) debugPrint("Too many extra buffers\n"); else pb_ExtraBuffersCount=n; } void pb_size(DWORD size) { if (pb_running) debugPrint("Can't set size while push buffer Dma engine is running.\n"); else { if (size<64*1024) debugPrint("Push buffer size must be equal or larger than 64Kb.\n"); else if ((size-1)&size) debugPrint("Push buffer size must be a power of 2.\n"); else pb_Size=size; } } void pb_reset(void) { pb_jump_to_head(); } DWORD *pb_begin(void) { #ifdef DBG if (pb_Put>=pb_Tail) debugPrint("ERROR! Push buffer overflow! 
Use pb_reset more often or enlarge push buffer!\n"); if (pb_BeginEndPair==1) debugPrint("pb_start without a pb_end earlier\n"); pb_BeginEndPair=1; pb_PushIndex=0; pb_PushNext=pb_Put; pb_PushStart=pb_Put; #endif return pb_Put; } #ifdef LOG static FILE *fd; static int logging=0; void pb_start_log(void) { if (logging) return; logging=1; fd=fopen("pbkit_record.txt","w"); } void pb_stop_log(void) { if (logging==0) return; logging=0; fclose(fd); } #endif void pb_end(DWORD *pEnd) { DWORD TimeStamp1; DWORD TimeStamp2; int i; #ifdef LOG DWORD *p; int n; if (logging) { p=pb_PushStart; while (p!=pEnd) { n=(*p>>18)&0x7FF; fprintf(fd,"0x%08x, ",*(p++)); for(i=0;iTICKSTIMEOUT) { debugPrint("pb_end: Busy for too long (%d) (%08x)\n", ((DWORD)(pb_Put)-(DWORD)(pb_Head)), VIDEOREG(NV_PFIFO_CACHE1_DMA_GET) ); break; } } } } void pb_push1to(DWORD subchannel, DWORD *p, DWORD command, DWORD param1) { #ifdef DBG if (p!=pb_PushNext) debugPrint("pb_push1to: new write address invalid or not following previous write addresses\n"); if (pb_BeginEndPair==0) debugPrint("pb_push1to: missing pb_begin earlier\n"); pb_PushIndex+=2; pb_PushNext+=2; if (pb_PushIndex>128) debugPrint("pb_push1to: begin-end block musn't exceed 128 dwords\n"); #endif *(p+0)=EncodeMethod(subchannel,command,1); *(p+1)=param1; } void pb_push2to(DWORD subchannel, DWORD *p, DWORD command, DWORD param1, DWORD param2) { #ifdef DBG if (p!=pb_PushNext) debugPrint("pb_push2to : new write address invalid or not following previous write addresses\n"); if (pb_BeginEndPair==0) debugPrint("pb_push2to : missing pb_begin earlier\n"); pb_PushIndex+=3; pb_PushNext+=3; if (pb_PushIndex>128) debugPrint("pb_push2to: begin-end block musn't exceed 128 dwords\n"); #endif *(p+0)=EncodeMethod(subchannel,command,2); *(p+1)=param1; *(p+2)=param2; } void pb_push3to(DWORD subchannel, DWORD *p, DWORD command, DWORD param1, DWORD param2, DWORD param3) { #ifdef DBG if (p!=pb_PushNext) debugPrint("pb_push3to : new write address invalid or not following 
previous write addresses\n"); if (pb_BeginEndPair==0) debugPrint("pb_push3to : missing pb_begin earlier\n"); pb_PushIndex+=4; pb_PushNext+=4; if (pb_PushIndex>128) debugPrint("pb_push3to: begin-end block musn't exceed 128 dwords\n"); #endif *(p+0)=EncodeMethod(subchannel,command,3); *(p+1)=param1; *(p+2)=param2; *(p+3)=param3; } void pb_push4to(DWORD subchannel, DWORD *p, DWORD command, DWORD param1, DWORD param2, DWORD param3, DWORD param4) { #ifdef DBG if (p!=pb_PushNext) debugPrint("pb_push4to : new write address invalid or not following previous write addresses\n"); if (pb_BeginEndPair==0) debugPrint("pb_push4to : missing pb_begin earlier\n"); pb_PushIndex+=5; pb_PushNext+=5; if (pb_PushIndex>128) debugPrint("pb_push4to: begin-end block musn't exceed 128 dwords\n"); #endif *(p+0)=EncodeMethod(subchannel,command,4); *(p+1)=param1; *(p+2)=param2; *(p+3)=param3; *(p+4)=param4; } void pb_push1(DWORD *p, DWORD command, DWORD param1) { #ifdef DBG if (p!=pb_PushNext) debugPrint("pb_push1: new write address invalid or not following previous write addresses\n"); if (pb_BeginEndPair==0) debugPrint("pb_push1: missing pb_begin earlier\n"); pb_PushIndex+=2; pb_PushNext+=2; if (pb_PushIndex>128) debugPrint("pb_push1: begin-end block musn't exceed 128 dwords\n"); #endif *(p+0)=EncodeMethod(SUBCH_3D,command,1); *(p+1)=param1; } void pb_push2(DWORD *p, DWORD command, DWORD param1, DWORD param2) { #ifdef DBG if (p!=pb_PushNext) debugPrint("pb_push2 : new write address invalid or not following previous write addresses\n"); if (pb_BeginEndPair==0) debugPrint("pb_push2 : missing pb_begin earlier\n"); pb_PushIndex+=3; pb_PushNext+=3; if (pb_PushIndex>128) debugPrint("pb_push2: begin-end block musn't exceed 128 dwords\n"); #endif *(p+0)=EncodeMethod(SUBCH_3D,command,2); *(p+1)=param1; *(p+2)=param2; } void pb_push3(DWORD *p, DWORD command, DWORD param1, DWORD param2, DWORD param3) { #ifdef DBG if (p!=pb_PushNext) debugPrint("pb_push3 : new write address invalid or not following 
previous write addresses\n"); if (pb_BeginEndPair==0) debugPrint("pb_push3 : missing pb_begin earlier\n"); pb_PushIndex+=4; pb_PushNext+=4; if (pb_PushIndex>128) debugPrint("pb_push3: begin-end block musn't exceed 128 dwords\n"); #endif *(p+0)=EncodeMethod(SUBCH_3D,command,3); *(p+1)=param1; *(p+2)=param2; *(p+3)=param3; } void pb_push4(DWORD *p, DWORD command, DWORD param1, DWORD param2, DWORD param3, DWORD param4) { #ifdef DBG if (p!=pb_PushNext) debugPrint("pb_push4 : new write address invalid or not following previous write addresses\n"); if (pb_BeginEndPair==0) debugPrint("pb_push4 : missing pb_begin earlier\n"); pb_PushIndex+=5; pb_PushNext+=5; if (pb_PushIndex>128) debugPrint("pb_push4: begin-end block musn't exceed 128 dwords\n"); #endif *(p+0)=EncodeMethod(SUBCH_3D,command,4); *(p+1)=param1; *(p+2)=param2; *(p+3)=param3; *(p+4)=param4; } void pb_push4f(DWORD *p, DWORD command, float param1, float param2, float param3, float param4) { #ifdef DBG if (p!=pb_PushNext) debugPrint("pb_push4f : new write address invalid or not following previous write addresses\n"); if (pb_BeginEndPair==0) debugPrint("pb_push4f : missing pb_begin earlier\n"); pb_PushIndex+=5; pb_PushNext+=5; if (pb_PushIndex>128) debugPrint("pb_push4f: begin-end block musn't exceed 128 dwords\n"); #endif *(p+0)=EncodeMethod(SUBCH_3D,command,4); *((float *)(p+1))=param1; *((float *)(p+2))=param2; *((float *)(p+3))=param3; *((float *)(p+4))=param4; } void pb_push_transposed_matrix(DWORD *p, DWORD command, float *m) { #ifdef DBG if (p!=pb_PushNext) debugPrint("pb_push_transposed_matrix : new write address invalid or not following previous write addresses\n"); if (pb_BeginEndPair==0) debugPrint("pb_push_transposed_matrix : missing pb_begin earlier\n"); pb_PushIndex+=17; pb_PushNext+=17; if (pb_PushIndex>128) debugPrint("pb_push_transposed_matrix : begin-end block musn't exceed 128 dwords\n"); #endif *(p++)=EncodeMethod(SUBCH_3D,command,16); *((float *)p++)=m[_11]; *((float *)p++)=m[_21]; *((float 
*)p++)=m[_31]; *((float *)p++)=m[_41]; *((float *)p++)=m[_12]; *((float *)p++)=m[_22]; *((float *)p++)=m[_32]; *((float *)p++)=m[_42]; *((float *)p++)=m[_13]; *((float *)p++)=m[_23]; *((float *)p++)=m[_33]; *((float *)p++)=m[_43]; *((float *)p++)=m[_14]; *((float *)p++)=m[_24]; *((float *)p++)=m[_34]; *((float *)p++)=m[_44]; } void pb_show_front_screen(void) { VIDEOREG(PCRTC_START)=pb_FBAddr[pb_front_index]&0x03FFFFFF; pb_debug_screen_active=0; } void pb_show_debug_screen(void) { VIDEOREG(PCRTC_START)=((DWORD)XVideoGetFB())&0x0FFFFFFF; pb_debug_screen_active=1; } void pb_show_depth_screen(void) { VIDEOREG(PCRTC_START)=pb_DSAddr&0x0FFFFFFF; pb_debug_screen_active=1; } void pb_set_viewport(int dwx,int dwy,int width,int height,float zmin,float zmax) { DWORD *p; DWORD dwzminscaled; DWORD dwzmaxscaled; float x,y,w,h; if (dwx<0) dwx=0; if (dwy<0) dwy=0; if (dwx+width>pb_FrameBuffersWidth) width=pb_FrameBuffersWidth-dwx; if (dwy+height>pb_FrameBuffersHeight) height=pb_FrameBuffersHeight-dwy; pb_Viewport_x=dwx; pb_Viewport_y=dwy; pb_Viewport_width=width; pb_Viewport_height=height; pb_Viewport_zmin=zmin; pb_Viewport_zmax=zmax; x=0.53125f+(float)dwx; y=0.53125f+(float)dwy; w=0.5f*((float)pb_Viewport_width); h=-0.5f*((float)pb_Viewport_height); *((float *)&dwzminscaled)=zmin*pb_ZScale; *((float *)&dwzmaxscaled)=zmax*pb_ZScale; /* p=pb_begin(); pb_push4f(p,NV20_TCL_PRIMITIVE_3D_VIEWPORT_OX,x+0.53125f,y+0.53125f,0.0f,0.0f); p+=5; pb_push2(p,NV20_TCL_PRIMITIVE_3D_DEPTH_RANGE_NEAR,dwzminscaled,dwzmaxscaled); p+=3; pb_end(p); */ p=pb_begin(); pb_push4f(p,NV20_TCL_PRIMITIVE_3D_VIEWPORT_OX,x+w,y-h,zmin*pb_ZScale,0.0f); p+=5; pb_push4f(p,NV20_TCL_PRIMITIVE_3D_VIEWPORT_PX_DIV2,w,h,(zmax-zmin)*pb_ZScale,0.0f); p+=5; pb_push2(p,NV20_TCL_PRIMITIVE_3D_DEPTH_RANGE_NEAR,dwzminscaled,dwzmaxscaled); p+=3; pb_end(p); } void pb_fill(int x, int y, int w, int h, DWORD color) { DWORD *p; int x1,y1,x2,y2; x1=x; y1=y; x2=x+w; y2=y+h; //if you supply 32 bits color and res is 16 bits, apply function 
below //color=((color>>8)&0xF800)|((color>>5)&0x07E0)|((color>>3)&0x001F); p=pb_begin(); pb_push(p++,NV20_TCL_PRIMITIVE_3D_CLEAR_VALUE_HORIZ,2); //sets rectangle coordinates *(p++)=((x2-1)<<16)|x1; *(p++)=((y2-1)<<16)|y1; pb_push(p++,NV20_TCL_PRIMITIVE_3D_CLEAR_VALUE_DEPTH,3); //sets data used to fill in rectangle *(p++)=0; //(depth<<8)|stencil *(p++)=color; //color *(p++)=0xF0; //triggers the HW rectangle fill (0x03 for D&S) pb_end(p); } //ALWAYS use this at beginning of frame or you may lose one third of performance because //automatic compression algorithm for tile #1 can't afford any garbage left behind... //Also, try to draw from closest distance to farest distance to help algorithm //Depth is set to max and stencil is set to 0. We assume D24S8 format is used. //Implies that depth test function is set to "less or equal" void pb_erase_depth_stencil_buffer(int x, int y, int w, int h) { DWORD *p; int x1,y1,x2,y2; x1=x; y1=y; x2=x+w; y2=y+h; p=pb_begin(); pb_push(p++,NV20_TCL_PRIMITIVE_3D_CLEAR_VALUE_HORIZ,2); //sets rectangle coordinates *(p++)=((x2-1)<<16)|x1; *(p++)=((y2-1)<<16)|y1; pb_push(p++,NV20_TCL_PRIMITIVE_3D_CLEAR_VALUE_DEPTH,3); //sets data used to fill in rectangle *(p++)=0xffffff00; //(depth<<8)|stencil *(p++)=0; //color *(p++)=0x03; //triggers the HW rectangle fill (only on D&S) pb_end(p); } //returns 1 if we have to retry later (means no free buffer, draw more details next time) int pb_finished(void) { DWORD *p; if (pb_BackBufferbReady[pb_BackBufferNxt]) return 1; //table is full, retry later //insert in push buffer the commands to trigger screen swapping at next VBlank p=pb_begin(); pb_push1(p,NV20_TCL_PRIMITIVE_3D_ASK_FOR_IDLE,0); p+=2; //ask for idle pb_push1(p,NV20_TCL_PRIMITIVE_3D_NOP,0); p+=2; //wait for idle pb_push1(p,NV20_TCL_PRIMITIVE_3D_WAIT_MAKESPACE,0); p+=2; //wait/makespace (obtains null status) pb_push1(p,NV20_TCL_PRIMITIVE_3D_PARAMETER_A,pb_back_index); p+=2; //set param=back buffer index to show up 
pb_push1(p,NV20_TCL_PRIMITIVE_3D_FIRE_INTERRUPT,PB_FINISHED); p+=2; //subprogID PB_FINISHED: gets frame ready to show up soon // pb_push1(p,NV20_TCL_PRIMITIVE_3D_STALL_PIPELINE,0); p+=2; //stall gpu pipeline (not sure it's needed in triple buffering technic) pb_end(p); //insert in push buffer the commands to trigger selection of next back buffer //(because previous ones may not have finished yet, so need to use 0x0100 call) pb_back_index=(pb_back_index+1)%3; pb_target_back_buffer(); return 0; } void pb_kill(void) { void *pSavedData; int i; DWORD old_caches,old_push,old_pull; DWORD *p; DWORD TimeStampTicks; int counter; #ifdef DBG // debugPrint("Waiting until Dma is not busy\n"); #endif if (pb_Put) { pb_start(); pb_wait_until_gr_not_busy(); *(pb_Put)=(((DWORD)pb_Head)&0x0FFFFFFF)+1; //writes a jump to push buffer head pb_Put=pb_Head; pb_start(); TimeStampTicks=KeTickCount; while(1) { if ((*(pb_DmaUserAddr+0x44/4))>0x04000000) { debugPrint("pb_kill: Bad get addr\n"); break; } //did GetAddr reach push buffer head as planned? if (((*(pb_DmaUserAddr+0x44/4))&0x0FFFFFFF)==(((DWORD)pb_Head)&0x0FFFFFFF)) break; if (KeTickCount-TimeStampTicks>TICKSTIMEOUT) { debugPrint("pb_kill: Dma busy for too long\n"); break; } } } #ifdef DBG // if (KeTickCount-TimeStampTicks<=TICKSTIMEOUT) debugPrint("Dma not busy. 
All is ok.\n"); #endif //wait until screen swapping is finished (if one is on its way) while(pb_BackBufferbReady[pb_BackBufferNxt]); pb_running=0; if (pb_ExtraBuffersCount) MmFreeContiguousMemory((PVOID)pb_EXAddr[0]); if (pb_DepthStencilAddr) MmFreeContiguousMemory((PVOID)pb_DepthStencilAddr); if (pb_FrameBuffersAddr) MmFreeContiguousMemory((PVOID)pb_FrameBuffersAddr); if (pb_DmaBuffer8) MmFreeContiguousMemory(pb_DmaBuffer8); if (pb_DmaBuffer2) MmFreeContiguousMemory(pb_DmaBuffer2); if (pb_DmaBuffer7) MmFreeContiguousMemory(pb_DmaBuffer7); if (pb_Head) MmFreeContiguousMemory(pb_Head); //eventually restore a previously saved video mode pSavedData=AvGetSavedDataAddress(); if (pSavedData==0) AvSendTVEncoderOption((PVOID)VIDEO_BASE,VIDEO_ENC_VIDEOENABLE,1,NULL); //restore system completely for(i=0;i<8;i++) pb_release_tile(i,1); VIDEOREG(NV_PFIFO_DMA_TIMESLICE)=NV_PFIFO_DMA_TIMESLICE_ALL_DISABLE; while ( ((VIDEOREG8(NV_PFIFO_CACHE1_STATUS)&NV_PFIFO_CACHE1_STATUS_LOW_MARK_EMPTY)==0)|| ((VIDEOREG8(NV_PFIFO_RUNOUT_STATUS)&NV_PFIFO_RUNOUT_STATUS_LOW_MARK_EMPTY)==0)|| ((VIDEOREG8(NV_PFIFO_CACHE1_DMA_PUSH)&NV_PFIFO_CACHE1_DMA_PUSH_STATE_BUSY)!=0) ) { pb_fifo_handler(); if (VIDEOREG(NV_PGRAPH_INTR)!=NV_PGRAPH_INTR_NOT_PENDING) pb_gr_handler(); if (VIDEOREG(NV_PMC_INTR_0)&NV_PMC_INTR_0_PCRTC_PENDING) pb_vbl_handler(); } VIDEOREG(NV_PFIFO_CACHE1_DMA_PUSH)=NV_PFIFO_CACHE1_DMA_PUSH_ACCESS_DISABLE; while((VIDEOREG8(NV_PFIFO_CACHE1_DMA_PUSH)&NV_PFIFO_CACHE1_DMA_PUSH_STATE_BUSY)!=0); VIDEOREG(NV_PFIFO_CACHES)=NV_PFIFO_CACHES_ALL_DISABLE; VIDEOREG(NV_PFIFO_CACHE0_PUSH0)=NV_PFIFO_CACHE0_PUSH0_ACCESS_DISABLE; VIDEOREG(NV_PFIFO_CACHE0_PULL0)=NV_PFIFO_CACHE0_PULL0_ACCESS_DISABLE; VIDEOREG(NV_PFIFO_CACHE1_PUSH0)=NV_PFIFO_CACHE1_PUSH0_ACCESS_DISABLE; VIDEOREG(NV_PFIFO_CACHE1_PULL0)=NV_PFIFO_CACHE1_PULL0_ACCESS_DISABLE; pb_set_fifo_channel(1); VIDEOREG(NV_PFIFO_CACHE1_PUT)=0; VIDEOREG(NV_PFIFO_CACHE1_GET)=0; old_caches=VIDEOREG(NV_PFIFO_CACHES); old_push=VIDEOREG(NV_PFIFO_CACHE1_PUSH0); 
old_pull=VIDEOREG(NV_PFIFO_CACHE1_PULL0); VIDEOREG(NV_PFIFO_CACHES)=NV_PFIFO_CACHES_ALL_DISABLE; VIDEOREG(NV_PFIFO_CACHE1_PUSH0)=NV_PFIFO_CACHE1_PUSH0_ACCESS_DISABLE; VIDEOREG(NV_PFIFO_CACHE1_PULL0)=NV_PFIFO_CACHE1_PULL0_ACCESS_DISABLE; //Neutralize DMA (for channels 0 and 1) for(i=0;i<2;i++) { if (pb_FifoChannelsReady) //any active channel? { p=(DWORD *)(VIDEO_BASE+pb_FifoFCAddr+i*64); *(p+1)=*(p+0); //DMA_GET=DMA_PUT *(p+4)=0; //DMA_STATE=0 } } VIDEOREG(NV_PFIFO_CACHE1_PULL0)=old_pull; VIDEOREG(NV_PFIFO_CACHE1_PUSH0)=old_push; VIDEOREG(NV_PFIFO_CACHES)=old_caches; VIDEOREG(NV_PFIFO_DMA)=NV_PFIFO_DMA_NOT_PENDING; VIDEOREG(NV_PFIFO_INTR_EN_0)=NV_PFIFO_INTR_EN_0_ALL_DISABLE; pb_load_gr_ctx(NONE); //restore most essential outer registers VIDEOREG(NV_PFB_CFG0)=pb_OldFBConfig0; VIDEOREG(NV_PFB_CFG1)=pb_OldFBConfig1; VIDEOREG(NV_PMC_ENABLE)=pb_OldMCEnable; VIDEOREG(NV_PMC_INTR_EN_0)=pb_OldMCInterrupt; VIDEOREG(PCRTC_START)=pb_OldVideoStart; pb_uninstall_gpu_interrupt(); NtClose(pb_VBlankEvent); } int pb_init(void) { DWORD old; DWORD mdiv,ndiv,odiv,pdiv,result; BYTE old_color_31; BYTE old_color_82; DWORD baseaddr,baseaddr2; int i,j,k; DWORD *p; struct s_CtxDma sDmaObject2; struct s_CtxDma sDmaObject3; struct s_CtxDma sDmaObject4; struct s_CtxDma sDmaObject5; struct s_CtxDma sDmaObject6; struct s_CtxDma sDmaObject7; struct s_CtxDma sDmaObject8; struct s_CtxDma sDmaObject9; struct s_CtxDma sDmaObject10; struct s_CtxDma sDmaObject11; struct s_CtxDma sDmaObject12; struct s_CtxDma sGrObject13; struct s_CtxDma sGrObject14; struct s_CtxDma sGrObject16; struct s_CtxDma sGrObject17; DWORD UserAddr; DWORD TimeStamp1; DWORD TimeStamp2; DWORD GetAddr; DWORD PutAddr; //Dma channel properties int dma_trig=128; //min 8 max 256 int dma_size=128; //min 32 max 256 int dma_max_reqs=8; //min 0 max 15 DWORD dummy; DWORD channel; DWORD *pGrCtxTable; VIDEO_MODE vm; DWORD format; DWORD BackBufferCount; DWORD BackBufferFormat; DWORD DepthStencilFormat; DWORD Width; DWORD Height; DWORD 
FrameBufferCount; DWORD HScale; DWORD VScale; DWORD HSize; DWORD VSize; DWORD Pitch; DWORD Addr; DWORD Size; DWORD FBAddr; DWORD FBSize; DWORD DSAddr; DWORD DSSize; DWORD EXAddr; DWORD EXSize; int n; DWORD value; if (pb_running) return -8; //reset global vars (except pb_Size) pb_3DGrCtxInst[0]=0; pb_3DGrCtxInst[1]=0; pb_FifoChannelsReady=0; pb_FifoChannelsMode=NV_PFIFO_MODE_ALL_PIO; pb_FifoChannelID=0; pb_GammaRampIdx=0; for(i=0;i<3;i++) pb_GammaRampbReady[i]=0; for(k=0;k<3;k++) for(i=0;i<3;i++) for(j=0;j<256;j++) pb_GammaRamp[k][i][j]=j; pb_BackBufferNxt=0; for(i=0;i<5;i++) pb_BackBufferbReady[i]=0; pb_Put=NULL; pb_PutRunSize=0; pb_FrameBuffersAddr=0; pb_DmaBuffer8=MmAllocateContiguousMemoryEx(32,0,MAXRAM,0,4); pb_DmaBuffer2=MmAllocateContiguousMemoryEx(32,0,MAXRAM,0,4); pb_DmaBuffer7=MmAllocateContiguousMemoryEx(32,0,MAXRAM,0,4); //NumberOfBytes,LowestAcceptableAddress,HighestAcceptableAddress,Alignment,ProtectionType if ((pb_DmaBuffer8==NULL)||(pb_DmaBuffer2==NULL)||(pb_DmaBuffer7==NULL)) return -2; memset(pb_DmaBuffer8,0,32); memset(pb_DmaBuffer2,0,32); memset(pb_DmaBuffer7,0,32); pb_Head=MmAllocateContiguousMemoryEx(pb_Size+8*1024,0,MAXRAM,0,0x404); //NumberOfBytes,LowestAcceptableAddress,HighestAcceptableAddress,Alignment OPTIONAL,ProtectionType if (pb_Head==NULL) return -3; memset(pb_Head,0,pb_Size+8*1024); pb_Tail=pb_Head+pb_Size/4; pb_Put=pb_Head; pb_BackBufferNxt=0; //increments when we finish drawing a frame pb_BackBufferbReady[0]=0; pb_BackBufferbReady[1]=0; pb_BackBufferbReady[2]=0; pb_BackBufferNxtVBL=0; //increments when VBlank event fires //initialize push buffer DMA engine //DMA=Direct Memory Access (means CPU is not involved in the data transfert) NtCreateEvent(&pb_VBlankEvent, NULL, NotificationEvent, FALSE); VIDEOREG(NV_PBUS_PCI_NV_1)|=NV_PBUS_PCI_NV_1_BUS_MASTER_ENABLED; VIDEOREG(PCRTC_INTR_EN)=PCRTC_INTR_EN_VBLANK_DISABLED; VIDEOREG(NV_PTIMER_INTR_EN_0)=NV_PTIMER_INTR_EN_0_ALARM_DISABLED; if (pb_install_gpu_interrupt()==0) { if (pb_DmaBuffer8) 
MmFreeContiguousMemory(pb_DmaBuffer8); if (pb_DmaBuffer2) MmFreeContiguousMemory(pb_DmaBuffer2); if (pb_DmaBuffer7) MmFreeContiguousMemory(pb_DmaBuffer7); if (pb_Head) MmFreeContiguousMemory(pb_Head); NtClose(pb_VBlankEvent); return -4; //OpenXDK probably hooked IRQ3 already } //backup of the most essential outer registers (pb_kill will restore them) pb_OldMCEnable=VIDEOREG(NV_PMC_ENABLE); pb_OldMCInterrupt=VIDEOREG(NV_PMC_INTR_EN_0); pb_OldFBConfig0=VIDEOREG(NV_PFB_CFG0); pb_OldFBConfig1=VIDEOREG(NV_PFB_CFG1); pb_OldVideoStart=((DWORD)XVideoGetFB())&0x03FFFFFF; VIDEOREG(NV_PBUS_PCI_NV_12)=NV_PBUS_PCI_NV_12_ROM_DECODE_DISABLED; VIDEOREG(NV_PBUS_PCI_NV_3)=NV_PBUS_PCI_NV_3_LATENCY_TIMER_248_CLOCKS; VIDEOREG(NV_PMC_ENABLE)=NV_PMC_ENABLE_ALL_ENABLE; VIDEOREG(NV_PMC_INTR_EN_0)=NV_PMC_INTR_EN_0_INTA_HARDWARE; mdiv=(VIDEOREG(NV_PRAMDAC_NVPLL_COEFF)&NV_PRAMDAC_NVPLL_COEFF_MDIV); ndiv=(VIDEOREG(NV_PRAMDAC_NVPLL_COEFF)&NV_PRAMDAC_NVPLL_COEFF_NDIV)>>8; odiv=1; pdiv=(VIDEOREG(NV_PRAMDAC_NVPLL_COEFF)&NV_PRAMDAC_NVPLL_COEFF_PDIV)>>16; if (mdiv) { //Xtal in Xbox is at 16.666 Mhz but we want 31.25Mhz for GPU... 
if (((DW_XTAL_16MHZ*ndiv)/(odiv<>8)&NV_PFIFO_RAMHT_BASE_ADDRESS)|NV_PFIFO_RAMHT_SEARCH_128; // =NV_PFIFO_RAMHT_BASE_ADDRESS_10000 //FC (size 0x80) pb_FifoFCAddr=baseaddr+NV_PRAMIN+0x1000;//=0x11000+NV_PRAMIN //U1 (size 0x20) Unknown1 pb_FifoU1Addr=baseaddr+NV_PRAMIN+0x1080;//=0x11080+NV_PRAMIN //FC (dwFifoFCAddr, but 128 bytes aligned, with flag 0x200) baseaddr2=((pb_FifoFCAddr+0x80)&0x1FC00)|0x200; //0x11200 VIDEOREG(NV_PFIFO_RAMFC)=baseaddr2<<7|((pb_FifoFCAddr>>8)&NV_PFIFO_RAMFC_BASE_ADDRESS); // |NV_PFIFO_RAMFC_BASE_ADDRESS_11000 //=0x00890110 (theoretical value) //=0x008A0110 (current value read under openxdk : |0x400 instead of |0x200) pb_FreeInst=(pb_FifoU1Addr-NV_PRAMIN+0x20)>>4; // =0x110A (unit=16 bytes block) VIDEOREG(NV_PFB_NVM)=VIDEOREG(NV_PFB_NVM)&NV_PFB_NVM_MODE_DISABLE; //zeroes whole GPU instance memory for(i=0;i>8)&0xFF))*(XTAL_16MHZ/((float)(value&0xFF))); else pb_CpuFrequency=733.33f; //Mhz, theoretically pb_create_dma_ctx(3,DMA_CLASS_3D,0,MAXRAM,&sDmaObject3); pb_create_dma_ctx(5,DMA_CLASS_2,0,MAXRAM,&sDmaObject5); pb_create_dma_ctx(4,DMA_CLASS_3,0,MAXRAM,&sDmaObject4); pb_create_dma_ctx(9,DMA_CLASS_3D,0,MAXRAM,&sDmaObject9); pb_create_dma_ctx(10,DMA_CLASS_3D,0,MAXRAM,&sDmaObject10); pb_create_dma_ctx(11,DMA_CLASS_3D,0,MAXRAM,&sDmaObject11); pb_DmaChID9Inst=sDmaObject9.Inst; pb_DmaChID10Inst=sDmaObject10.Inst; pb_DmaChID11Inst=sDmaObject11.Inst; pb_create_dma_ctx(2,DMA_CLASS_3,(DWORD)pb_DmaBuffer2,0x1F,&sDmaObject2); pb_create_dma_ctx(7,DMA_CLASS_3D,(DWORD)pb_DmaBuffer7,0x1F,&sDmaObject7); //this one is damn important. memory address 0x80000000 acts as a trigger. 
pb_create_dma_ctx(12,DMA_CLASS_3D,0x80000000,0x10000000,&sDmaObject12); pb_create_dma_ctx(8,DMA_CLASS_3D,(DWORD)pb_DmaBuffer8,0x20,&sDmaObject8); pb_create_dma_ctx(6,DMA_CLASS_2,0,MAXRAM,&sDmaObject6); //we initialized channel 0 first, that will match graphic context 0 pb_FifoChannelID=0; pb_FifoChannelsMode=NV_PFIFO_MODE_ALL_PIO; pb_FifoBigInst=pb_FreeInst; pb_FreeInst+=0x37F; //895 blocks=14320 bytes=0x37F0 bytes dummy=VIDEOREG(NV_PFIFO_CACHES); channel=pb_FifoChannelID; VIDEOREG(NV_PFIFO_CACHES)=NV_PFIFO_CACHES_ALL_DISABLE; //zeroes 0x37F0 bytes (0xDFC/4=0x37F blocks, 4 dwords in 1 block) for(i=0;i<0xDFC;i++) VIDEOREG(NV_PRAMIN+(pb_FifoBigInst<<4)+i*4)=0; //here we go, we initialize first graphic context pointer pGrCtxTable=(DWORD *)(VIDEO_BASE+NV_PRAMIN+(pb_GrCtxTableInst<<4)); *(pGrCtxTable+channel)=pb_FifoBigInst; pb_GrCtxInst[channel]=pb_FifoBigInst; //points at channel details in PRAMIN area p=(DWORD *)(VIDEO_BASE+pb_FifoFCAddr+channel*64); //zeroes details for(i=0;i<16;i++) *(p+i)=0; //set dma instance, future value for VIDEOREG(NV_PFIFO_CACHE1_DMA_INSTANCE) *(p+3)=sDmaObject6.Inst; //encode trig & size dma_trig=(dma_trig>>3)-1; dma_size=(dma_size>>5)-1; //set dma fetch, future value for VIDEOREG(NV_PFIFO_CACHE1_DMA_FETCH) *(p+5)= ((dma_trig<<3)&NV_PFIFO_CACHE1_DMA_FETCH_TRIG)| ((dma_size<<13)&NV_PFIFO_CACHE1_DMA_FETCH_SIZE)| ((dma_max_reqs<<16)&NV_PFIFO_CACHE1_DMA_FETCH_MAX_REQS); pb_FifoChannelsMode|=(1<0x04000000) { debugPrint("pb_init: Bad getaddr\n"); pb_kill(); return -9; } PutAddr=((DWORD)pb_Put); if (((GetAddr^PutAddr)&0x0FFFFFFF)==0) break; //means same addresses (Dma is ready) TimeStamp2=KeTickCount; if (TimeStamp2-TimeStamp1>TICKSTIMEOUT) { debugPrint("pb_init: Dma didn't get ready in time\n"); pb_kill(); return -10; } } #ifdef DBG // debugPrint("Dma is ready!!!\n"); #endif *((DWORD *)0x80000000)=0xFFFFFFFF; //Let's start initializing inner GPU registers!!! 
//These commands assign DMA channels to push buffer subchannels //and associate some specific GPU parts to specific Dma channels p=pb_begin(); pb_push1to(SUBCH_2,p,NV20_TCL_PRIMITIVE_SET_MAIN_OBJECT,14); p+=2; pb_push1to(SUBCH_3,p,NV20_TCL_PRIMITIVE_SET_MAIN_OBJECT,16); p+=2; pb_push1to(SUBCH_4,p,NV20_TCL_PRIMITIVE_SET_MAIN_OBJECT,17); p+=2; pb_push1to(SUBCH_3D,p,NV20_TCL_PRIMITIVE_SET_MAIN_OBJECT,13); p+=2; pb_push1to(SUBCH_2,p,NV20_TCL_PRIMITIVE_3D_SET_OBJECT0,7); p+=2; pb_push1to(SUBCH_3,p,NV20_TCL_PRIMITIVE_3D_SET_OBJECT5,17); p+=2; pb_push1to(SUBCH_3,p,NV20_TCL_PRIMITIVE_3D_SET_OBJECT_UNKNOWN,3); p+=2; pb_push2to(SUBCH_4,p,NV20_TCL_PRIMITIVE_3D_SET_OBJECT1,3,11); p+=3; pb_end(p); //calls pb_start() which will trigger the reading and sending to GPU (asynchronous, no waiting) //setup needed for color computations p=pb_begin(); pb_push(p++,NV20_TCL_PRIMITIVE_3D_SET_OBJECT0,3); *(p++)=2; *(p++)=3; *(p++)=3; pb_push(p++,NV20_TCL_PRIMITIVE_3D_SET_OBJECT2A,6); *(p++)=4; *(p++)=9; *(p++)=10; *(p++)=3; *(p++)=3; *(p++)=8; pb_push(p++,NV20_TCL_PRIMITIVE_3D_SET_OBJECT8,1); *(p++)=12; pb_push(p++,NV20_TCL_PRIMITIVE_3D_ACTIVATE_COLORS,1); *(p++)=0; pb_end(p); p=pb_begin(); pb_push1(p,0x09FC,1); p+=2; pb_push4f(p,0x0A50,0.0f,0.0f,0.0f,1.0f); p+=5; pb_push1(p,NV20_TCL_PRIMITIVE_3D_EDGE_FLAG,1); p+=2; pb_push1(p,NV20_TCL_PRIMITIVE_3D_TX_SHADER_PREVIOUS,0x00210000); p+=2; //(PSTextureInput) What previous stage is used at each stage pb_push1(p,0x1D80,1); p+=2; pb_push1(p,0x1E68,0x7F800000); p+=2; pb_push1(p,0x1D78,1); p+=2; pb_end(p); p=pb_begin(); pb_push_transposed_matrix(p,NV20_TCL_PRIMITIVE_3D_CLIP_PLANE_A(0),pb_IdentityMatrix); p+=17; pb_push_transposed_matrix(p,NV20_TCL_PRIMITIVE_3D_CLIP_PLANE_A(4),pb_IdentityMatrix); p+=17; pb_push_transposed_matrix(p,NV20_TCL_PRIMITIVE_3D_CLIP_PLANE_A(8),pb_IdentityMatrix); p+=17; pb_push_transposed_matrix(p,NV20_TCL_PRIMITIVE_3D_CLIP_PLANE_A(12),pb_IdentityMatrix); p+=17; /* 
pb_push1(p,NV20_TCL_PRIMITIVE_3D_CLIP_PLANE_ENABLE(0),0x2202); p+=2; pb_push1(p,NV20_TCL_PRIMITIVE_3D_CLIP_PLANE_ENABLE(1),0x2202); p+=2; pb_push1(p,NV20_TCL_PRIMITIVE_3D_CLIP_PLANE_ENABLE(2),0x2202); p+=2; pb_push1(p,NV20_TCL_PRIMITIVE_3D_CLIP_PLANE_ENABLE(3),0x2202); p+=2; */ pb_push4f(p,0x09D0,0.0f,0.0f,1.0f,0.0f); p+=5; pb_push1(p,NV20_TCL_PRIMITIVE_3D_VP_UPLOAD_CONST_ID,0x0000003C); p+=2; //set shader constants cursor at C-36 pb_push(p++,NV20_TCL_PRIMITIVE_3D_VP_UPLOAD_CONST_X,12); //loads C-36, C-35 & C-34 memcpy(p,pb_FixedPipelineConstants,12*4); p+=12; //used by common xbox shaders, but I doubt we will use them. //(also usually C-37 is screen center offset Decals vector & c-38 is Scales vector) pb_end(p); //Frame buffers creation //So far, tested only with 640*480 32 bits (default openxdk res) //Even if it's a waste of memory, for now, we will leave the openxdk (& SDL) //default frame buffer untouched. debugPrint (& SDL) will still target it. //We will provide functions pb_show_debug_screen() and pb_show_front_screen() //in order to let user (developper) toggle between screens at will. pb_FrameBuffersAddr=0; pb_DepthStencilAddr=0; pb_DepthStencilLast=-2; vm=XVideoGetMode(); if (vm.bpp==32) pb_GPUFrameBuffersFormat=0x128;//A8R8G8B8 else pb_GPUFrameBuffersFormat=0x113; //R5G6B5 (0x123 if D24S8 used, bpp 16 untested) pb_ZScale=16777215.0f; //D24S8 Width=vm.width; Height=vm.height; BackBufferCount=2; //triple buffering technic! 
//allows dynamic details adjustment pb_FrameBuffersCount=BackBufferCount+1; //front buffer + back buffers pb_FrameBuffersWidth=Width; pb_FrameBuffersHeight=Height; HScale=1; VScale=1; HSize=HScale*Width; //Total width VSize=VScale*Height; //Total height //Front and back buffers (tile #0) FrameBufferCount=BackBufferCount+1; //pitch is the gap between start of a pixel line and start of next pixel line //(not necessarily the size of a pixel line, because of hardware optimization) Pitch=(((vm.bpp*HSize)>>3)+0x3F)&0xFFFFFFC0; //64 units aligned pb_FrameBuffersPitch=Pitch; //look for a standard listed pitch value greater or equal to theoretical one for(i=0;i<16;i++) { if (pb_TilePitches[i]>=Pitch) { Pitch=pb_TilePitches[i]; break; } } Size=Pitch*VSize; //verify 64 bytes alignment for size of a frame buffer if (Size&(64-1)) debugPrint("pb_init: FBSize is not well aligned.\n"); pb_FBSize=Size; //multiply size by number of physical frame buffers in order to obtain global size FBSize=Size*FrameBufferCount; //Huge alignment enforcement (16 Kb aligned!) for the global size FBSize=(FBSize+0x3FFF)&0xFFFFC000; FBAddr=(DWORD)MmAllocateContiguousMemoryEx(FBSize,0,0x03FFB000,0x4000,0x404); //NumberOfBytes,LowestAcceptableAddress,HighestAcceptableAddress,Alignment OPTIONAL,ProtectionType pb_FBGlobalSize=FBSize; pb_FrameBuffersAddr=FBAddr; if (!FBAddr) { pb_kill(); return -11; } for(i=0;i>3)+0x3F)&0xFFFFFFC0; //64 units aligned pb_DepthStencilPitch=Pitch; //look for a standard listed pitch value greater or equal to theoretical one for(i=0;i<16;i++) { if (pb_TilePitches[i]>=Pitch) { Pitch=pb_TilePitches[i]; break; } } Size=Pitch*VSize; //verify 64 bytes alignment for size of a frame buffer if (Size&(64-1)) debugPrint("pb_init: DSSize is not well aligned.\n"); pb_DSSize=Size; //multiply size by number of physical frame buffers in order to obtain global size DSSize=Size*FrameBufferCount; //Huge alignment enforcement (16 Kb aligned!) 
for the global size DSSize=(DSSize+0x3FFF)&0xFFFFC000; DSAddr=(DWORD)MmAllocateContiguousMemoryEx(FBSize,0,0x03FFB000,0x4000,0x404); //NumberOfBytes,LowestAcceptableAddress,HighestAcceptableAddress,Alignment OPTIONAL,ProtectionType pb_DepthStencilAddr=DSAddr; if (!DSAddr) { pb_kill(); return -11; } pb_DSAddr=DSAddr; pb_assign_tile( 1, //int tile_index, pb_DepthStencilAddr&0x03FFFFFF, //DWORD tile_addr, DSSize, //DWORD tile_size, Pitch, //DWORD tile_pitch, 0, //DWORD tile_z_start_tag, 0, //DWORD tile_z_offset, 0x84000001 //DWORD tile_flags (0x04000000 for 32 bits) ); if (pb_ExtraBuffersCount) { //Extra back buffers (tile #2) //pitch is the gap between start of a pixel line and start of next pixel line //(not necessarily the size of a pixel line, because of hardware optimization) Pitch=(((vm.bpp*HSize)>>3)+0x3F)&0xFFFFFFC0; //64 units aligned //look for a standard listed pitch value greater or equal to theoretical one for(i=0;i<16;i++) { if (pb_TilePitches[i]>=Pitch) { Pitch=pb_TilePitches[i]; break; } } Size=Pitch*VSize; //verify 64 bytes alignment for size of a frame buffer if (Size&(64-1)) debugPrint("pb_init: EXSize is not well aligned.\n"); //multiply size by number of physical frame buffers in order to obtain global size EXSize=Size*pb_ExtraBuffersCount; //Huge alignment enforcement (16 Kb aligned!) 
for the global size EXSize=(EXSize+0x3FFF)&0xFFFFC000; EXAddr=(DWORD)MmAllocateContiguousMemoryEx(EXSize,0,0x03FFB000,0x4000,0x404); //NumberOfBytes,LowestAcceptableAddress,HighestAcceptableAddress,Alignment OPTIONAL,ProtectionType if (!EXAddr) { pb_kill(); return -11; } for(i=0;i8 (0-511) pb_push1(p,NV20_TCL_PRIMITIVE_3D_WAIT_MAKESPACE,0); p+=2; //prepare subprogram call (wait/makespace, will obtain null status) pb_push1(p,NV20_TCL_PRIMITIVE_3D_PARAMETER_A,1); p+=2; //set parameter for subprogram (TRUE) pb_push1(p,NV20_TCL_PRIMITIVE_3D_FIRE_INTERRUPT,PB_SETNOISE); p+=2; //call subprogID PB_SETNOISE: Dxt1NoiseEnable=TRUE pb_push1(p,NV20_TCL_PRIMITIVE_3D_CULL_ENABLE,3); p+=2; //bit0:OcclusionCullEnable=TRUE & bit1:StencilCullEnable=TRUE pb_push1(p,NV20_TCL_PRIMITIVE_3D_WAIT_MAKESPACE,0); p+=2; //prepare subprogram call (wait/makespace, will obtain null status) pb_push2(p,NV20_TCL_PRIMITIVE_3D_PARAMETER_A,NV_PGRAPH_DEBUG_5,NV_PGRAPH_DEBUG_5_ZCULL_SPARE2_ENABLED); p+=3; //set parameters A & B: DoNotCullUncompressed=FALSE (|8 otherwise) pb_push1(p,NV20_TCL_PRIMITIVE_3D_FIRE_INTERRUPT,PB_SETOUTER); p+=2; //calls subprogID PB_SETOUTER: does VIDEOREG(ParamA)=ParamB if (VIDEOREG(NV_PBUS_ROM_VERSION)&NV_PBUS_ROM_VERSION_MASK) pb_push2(p,NV20_TCL_PRIMITIVE_3D_PARAMETER_A,NV_PGRAPH_UNKNOWN_400B80,(0x45EAD10F&~0x18100000)); //RopZCmpAlwaysRead=FALSE (bit27) & RopZRead=FALSE (bit20) else pb_push2(p,NV20_TCL_PRIMITIVE_3D_PARAMETER_A,NV_PGRAPH_UNKNOWN_400B80,(0x45EAD10E&~0x18100000)); p+=3; pb_push1(p,NV20_TCL_PRIMITIVE_3D_FIRE_INTERRUPT,PB_SETOUTER); p+=2; //calls subprogID PB_SETOUTER: does VIDEOREG(ParamA)=ParamB pb_end(p); //various intial settings (texture stages states) p=pb_begin(); pb_push1(p,0x1b68,0); p+=2; //texture stage 1 BumpEnvMat00=0.0f (stage +1 because no pixel shader used yet) pb_push1(p,0x1b6c,0); p+=2; //texture stage 1 BumpEnvMat01=0.0f pb_push1(p,0x1b70,0); p+=2;//texture stage 1 BumpEnvMat11=0.0f pb_push1(p,0x1b74,0); p+=2; //texture stage 1 
BumpEnvMat10=0.0f pb_push1(p,0x1b78,0); p+=2; //texture stage 1 BumpEnvMatLightScale=0.0f pb_push1(p,0x1b7c,0); p+=2; //texture stage 1 BumpEnvMatLightOffset=0.0f pb_push3(p,0x03c0,0,0,0); p+=4; //texture stages 0 TexCoordIndex="passthru" pb_push1(p,0x1b24,0); p+=2; //texture stage 0 BorderColor=0x000000 pb_push1(p,0x0ae0,0); p+=2; //texture stage 0 ColorKeyColor=0x000000 pb_push1(p,0x1ba8,0); p+=2; //texture stage 2 BumpEnvMat00=0.0f (stage +1 because no pixel shader used yet) pb_push1(p,0x1bac,0); p+=2; //texture stage 2 BumpEnvMat01=0.0f pb_push1(p,0x1bb0,0); p+=2;//texture stage 2 BumpEnvMat11=0.0f pb_push1(p,0x1bb4,0); p+=2; //texture stage 2 BumpEnvMat10=0.0f pb_push1(p,0x1bb8,0); p+=2; //texture stage 2 BumpEnvMatLightScale=0.0f pb_push1(p,0x1bbc,0); p+=2; //texture stage 2 BumpEnvMatLightOffset=0.0f pb_push3(p,0x03d0,0,0,0); p+=4; //texture stages 1 TexCoordIndex="passthru" pb_push1(p,0x1b64,0); p+=2; //texture stage 1 BorderColor=0x000000 pb_push1(p,0x0ae4,0); p+=2; //texture stage 1 ColorKeyColor=0x000000 pb_push1(p,0x1be8,0); p+=2; //texture stage 3 BumpEnvMat00=0.0f (stage +1 because no pixel shader used yet) pb_push1(p,0x1bec,0); p+=2; //texture stage 3 BumpEnvMat01=0.0f pb_push1(p,0x1bf0,0); p+=2;//texture stage 3 BumpEnvMat11=0.0f pb_push1(p,0x1bf4,0); p+=2; //texture stage 3 BumpEnvMat10=0.0f pb_push1(p,0x1bf8,0); p+=2; //texture stage 3 BumpEnvMatLightScale=0.0f pb_push1(p,0x1bfc,0); p+=2; //texture stage 3 BumpEnvMatLightOffset=0.0f pb_push3(p,0x03e0,0,0,0); p+=4; //texture stages 2 TexCoordIndex="passthru" pb_push1(p,0x1ba4,0); p+=2; //texture stage 2 BorderColor=0x000000 pb_push1(p,0x0ae8,0); p+=2; //texture stage 2 ColorKeyColor=0x000000 pb_push3(p,0x03f0,0,0,0); p+=4; //texture stages 3 TexCoordIndex="passthru" pb_push1(p,0x1be4,0); p+=2; //texture stage 3 BorderColor=0x000000 pb_push1(p,0x0aec,0); p+=2; //texture stage 3 ColorKeyColor=0x000000 pb_end(p); memset((DWORD *)pb_FBAddr[0],0,pb_FBSize); memset((DWORD *)pb_DSAddr,0,pb_DSSize); 
pb_back_index=1; //frame buffer #1 is the back buffer for now pb_target_back_buffer(); //tells GPU what is the frame buffer target pb_front_index=0; //frame buffer #0 is the front buffer for now pb_show_front_screen(); //show it return 0; } //enqueues shaders micro-code into push buffer stream //(not recommended for pixel shader: slow and redundant) DWORD *pb_push_mcode(DWORD *p,DWORD *mcode) { DWORD size; if (((*mcode)&0xFFFF0000)!=0x43210000) //pixel shader registers values { //Pixel shader initialization (on xbox it's just registers initialization) //1-8 stages where (alpha and rgb processed in parallel) //2x4 inputs redirected to (a,b,c,d) can produce 2x3 outputs (a*b,c*d or a*b+c*d) //redirected to v0-v1, t0-t3, or r0-r1 (r0=final result at final stage) pb_push2(p,NV20_TCL_PRIMITIVE_3D_RC_COLOR0,pb_gpu_registers[48],pb_gpu_registers[49]); p+=3; //PSFinalCombinerC0 & C1 pb_push1(p,NV20_TCL_PRIMITIVE_3D_TX_SHADER_CULL_MODE,pb_gpu_registers[50]); p+=2; //PSCompareMode (0 means fragment killed if r<0 or s<0 or t<0 or q<0, used in clipplane mode) pb_push1(p,NV20_TCL_PRIMITIVE_3D_TX_SHADER_OP,pb_gpu_registers[51]); p+=2; //PSTextureModes=1 (1<<(stage*5) is project 2D: argb=texture(r/q,s/q) usually q=1.0f) pb_push1(p,NV20_TCL_PRIMITIVE_3D_TX_SHADER_DOTMAPPING,pb_gpu_registers[52]); p+=2; //PSDotMapping (0 means [0,255]argb from texture=>[0.0,1.0](r,g,b)) pb_push1(p,NV20_TCL_PRIMITIVE_3D_TX_SHADER_PREVIOUS,pb_gpu_registers[53]); p+=2; //PSInputTextureSource (usual value for 4 stages: 0x00210000, what previous stage each stage uses) pb_push1(p,NV20_TCL_PRIMITIVE_3D_RC_ENABLE,pb_gpu_registers[54]); p+=2; //PSCombinerCount (stages usage count=1, r0.a LSB controls mux, C0's & C1's may be different) pb_push(p++,NV20_TCL_PRIMITIVE_3D_RC_IN_ALPHA(0),8); memcpy(p,&pb_gpu_registers[0],8*4); p+=8; //8 PSAlphaInputs //Inputs: 8x 0xaabbccdd //0=0 1=c0 2=c1 3=fog.rgb 4=v0 5=v1 8=t0 0xb=t3 0xc=r0 0xd=r1 0x10=x.a default=|0.rgb| //0x20=1-|x| 0x40=2*max(0,x)-1("_bx2") 
0x60=1-2*max(0,x) 0x80=max(0,x)-0.5f("_bias") 0xa0=0.5f-max(0,x) 0xc0=x 0xf0=-x pb_push(p++,NV20_TCL_PRIMITIVE_3D_RC_OUT_ALPHA(0),8); memcpy(p,&pb_gpu_registers[8],8*4); p+=8; //8 PSAlphaOutputs pb_push(p++,NV20_TCL_PRIMITIVE_3D_RC_IN_RGB(0),8); memcpy(p,&pb_gpu_registers[16],8*4); p+=8; //8 PSRGBInputs pb_push(p++,NV20_TCL_PRIMITIVE_3D_RC_OUT_RGB(0),8); memcpy(p,&pb_gpu_registers[24],8*4); p+=8; //8 PSRGBOutputs //Outputs: 8x 0xFlags+<> <:a*b dest >:c*d dest +:a*b+c*d dest with 0xc=r0 0=discared, i.e no destination //Flags: 2(ab)/1(cd)="* is replaced with dot product", 4="+ is replaced with (r0.a LSB or MSB not set)?(a*b):(c*d)" //Flags: 8=-0.5f (then) 0x10=*2.0f 0x20=*4.0f 0x40=*0.5f //Flags: 0x80(ab)/0x40(cd)=result.b propagates to result.a on rgb side (case of dp3 r0,?n,?n for example) pb_push(p++,NV20_TCL_PRIMITIVE_3D_RC_CONSTANT_COLOR0(0),16); memcpy(p,&pb_gpu_registers[32],16*4); p+=16; //8 C0's 8 C1's return p; } //enqueues a vertex shader setup: size=(*(mcode++))&0xFFFF; if (size>136*5+96*7+8) { debugPrint("pb_push_mcode: Wrong vertex shader size\n"); return NULL; } memcpy(p,mcode,size*4); p+=size; return p; } //converts pseudo-code register into encoded xbox gpu pixel shader input register static int pb_preg2psreg(struct s_PseudoReg *pReg) { int reg=0xc; //r0 switch(pReg->reg) { case 8: reg=0xc+pReg->num; break; //r0-r1 (side effect: r2=0(0) r3=fog.rgb r4=v0 r5=v1 r6=v1r0sum(0xe) r7=EFprod(0xf)) case 9: reg=4+pReg->num; break; //v0-v1 (side effect: v2=v1r0sum(0xe) v3=EFprod(0xf) v4=c0 v5=c1 v6=0 v7=0) case 0xa: reg=1+pReg->num; //c0-c1 (ps constants Cn are 0xaarrggbb dwords) //Pseudo code created by psa.exe allows to define C0-C7 but //NVidia pixel shaders only refers to C0-C1, but they may be different //at each stage. So there is not only one way to map them. 
//Since this function supports only 1 stage, we use only c0-c1 (c2-c3 for 2nd stage, later, eventually) //thus, we can choose to have c4-c7 match non standard xbox gpu specific registers at any stage if (pReg->num==4) reg=0; //c4=zero if (pReg->num==5) reg=3; //c5=fog.rgb if (pReg->num==6) reg=0xe; //c6=v1r0sum if (pReg->num==7) reg=0xf; //c7=EFprod (see final combiner comment below) break; case 0xb: reg=8+pReg->num; break; //t0-t3 } switch(pReg->mod) { case 0: reg|=0xc0; break; //x case 1: reg|=0xe0; break; //-x case 2: reg|=0x80; break; //x_bias (x-0.5f) case 3: reg|=0xa0; break; //-x_bias -(x-0.5f) case 4: reg|=0x40; break; //x_bx2 (|x|*2.0f-1.0f) case 5: reg|=0x60; break; //-x_bx2 -(|x|*2.0f-1.0f) case 6: reg|=0x20; break; //1-|x| (0x00=|x|) case 7: debugPrint("pb_preg2psreg: ?n_x2 modifier is not supported\n"); break; //x_x2 (|x|*2) is not supported default: debugPrint("pb_preg2psreg: Unrecognized modifier %d\n",pReg->mod); break; } return reg; } //reads data from pseudo-code stream and fills in structure static void pb_read_pregs(DWORD *pcode, struct s_PseudoRegs *pRegs, int n) { DWORD code; struct s_PseudoReg *pReg; pRegs->n=n; if (n>=1) //dest { code=*(pcode++); pReg=&pRegs->dest; //ps: 8=r 9=v 0xa=c 0xb=t pReg->reg=(code>>28)&0xf; //vs: 8=r 0xa=c 0xb=a 0xc=oP(oP0=oPos oP1=oFog oP2=oPts) 0xd=oD 0xe=oT pReg->num=(code>> 0)&0xf; pReg->msk=(code>>16)&0xf; //bit0=x/r bit1=y/g bit2=z/b bit3=w/a (need to reverse order for xbox gpu) pReg->msk=((pReg->msk&8)>>3)|((pReg->msk&4)>>1)|((pReg->msk&2)<<1)|((pReg->msk&1)<<3); if (pReg->reg==8) pb_tmp_registers[pReg->num]=1; //markup for actually used temporary registers } if (n>=2) //src0 { code=*(pcode++); pReg=&pRegs->src0; //ps: 8=r 9=v 0xa=c 0xb=t pReg->reg=(code>>28)&0xf; //vs: 8=r 9=v 0xa=c 0xb=a pReg->num=(code>> 0)&0xf; pReg->mod=(code>>24)&0xf; //0=x 1=-x (ps: 2=x_bias 3=-x_bias 4=x_bx2 5=-x_bx2 6=1-x 7=x_x2(not supported)) pReg->swz=(code>>16)&0xff; //.p0p1p2p3=>p3p2p1p0 with 00=x/r 01=y/g 10=z/b 11=w/a (need to 
reverse order for xbox gpu) pReg->swz=((pReg->swz&0xc0)>>6)|((pReg->swz&0x30)>>2)|((pReg->swz&0xc)<<2)|((pReg->swz&3)<<6); pReg->idx=(code>>13)&1; //vs: if set, means cn to be replaced with c[a0.x+n] } if (n>=3) //src1 { code=*(pcode++); pReg=&pRegs->src1; //ps: 8=r 9=v 0xa=c 0xb=t pReg->reg=(code>>28)&0xf; //vs: 8=r 9=v 0xa=c 0xb=a pReg->num=(code>> 0)&0xf; pReg->mod=(code>>24)&0xf; //0=x 1=-x (ps: 2=x_bias 3=-x_bias 4=x_bx2 5=-x_bx2 6=1-x 7=x_x2(not supported)) pReg->swz=(code>>16)&0xff; //.p0p1p2p3=>p3p2p1p0 with 00=x/r 01=y/g 10=z/b 11=w/a (need to reverse order for xbox gpu) pReg->swz=((pReg->swz&0xc0)>>6)|((pReg->swz&0x30)>>2)|((pReg->swz&0xc)<<2)|((pReg->swz&3)<<6); pReg->idx=(code>>13)&1; //vs: if set, means cn to be replaced with c[a0.x+n] } if (n>=4) //src2 { code=*(pcode++); pReg=&pRegs->src2; //ps: 8=r 9=v 0xa=c 0xb=t pReg->reg=(code>>28)&0xf; //vs: 8=r 9=v 0xa=c 0xb=a pReg->num=(code>> 0)&0xf; pReg->mod=(code>>24)&0xf; //0=x 1=-x (ps: 2=x_bias 3=-x_bias 4=x_bx2 5=-x_bx2 6=1-x 7=x_x2(not supported)) pReg->swz=(code>>16)&0xff; //.p0p1p2p3=>p3p2p1p0 with 00=x/r 01=y/g 10=z/b 11=w/a (need to reverse order for xbox gpu) pReg->swz=((pReg->swz&0xc0)>>6)|((pReg->swz&0x30)>>2)|((pReg->swz&0xc)<<2)|((pReg->swz&3)<<6); pReg->idx=(code>>13)&1; //vs: if set, means cn to be replaced with c[a0.x+n] } } //sets usual parts of vertex shader micro-code (instruction independant parts) static int pb_set_mcode(DWORD *p,struct s_PseudoRegs *pRegs) { //xbox gpu micro-code format: //renouveau constants: //| | | | | | | | | DWORD#0 (0) //| |scalar#|vector#|(0-95)const_src|inp_src| source0_high | DWORD#1 //|source0_low| source1 | source2_high | DWORD#2 //|src2low|vtmpmsk|temp_id|stmpmsk|destmsk|x| (const) dest |p|i| | DWORD#3 //'x' bit allows to choose a constant as destination. //Shader must be declared with a special type previously //in order to get this priviledge and runs much slower. 
//x=1 : destination is not a constant register //x=0 : destination is a constant register (4 bits dest field becomes 8 bits const dest field) //The way I describe things (using c,v,r characters): //| | | | | | | | | DWORD#0 (0) //| |sc_code|op_code|(0-191) c_numbr|v_numbr|m|source0_swizzle| DWORD#1 (96=>C0 on xbox) //|r_numbr|cvr|m|source1_swizzle|r_numbr|cvr|m|source2_swizzle|r_n DWORD#2 //r? dest: //umbr|cvr|dst_msk|r_numbr|sdstmsk|0 0 0 0|1|1 1 1 1 1 1 1 1|0|i| | DWORD#3 //o? dest: (o0=oPos o1-2=oT6-7(n/a) o3-4=oD0-1(ff) o5=oFog o6=oPts o7-8=oT4-5(bf) o9-12=oT0-3) //umbr|cvr|0 0 0 0|0 1 1 1|0 0 0 0|dst_msk|1|0 0 0 0|o_numbr|s|i| | DWORD#3 //c? dest: (shaders that can write into constants run slower and have special type) //umbr|cvr|0 0 0 0|0 1 1 1|0 0 0 0|dst_msk|0|(0-191) c_numbr|s|i| | DWORD#3 (96=>C0 on xbox) //a0 dest: (only allowed in instruction mov a0.x,...) //| |cvr|0 0 0 0|0 1 1 1|0 0 0 0|0 0 0 0|1 1 1 1 1 1 1 1 1|0|i| | DWORD#3 //i: 0=cn 1=c[a0.x+n] (if any constant is used as any of the sources) //s: set if scalar function result is expected in destination //no c: c_numbr=0 //no v: v_numbr=0 //m: 0=x 1=-x //cvr: (can't set more than 1 c and more than 1 v as src) //01=r //10=v //11=c //missing src: m=0(x) swizzle=00011011(.xyzw) r_numbr=0(0) cvr=10(v) DWORD src0,src1,src2; *(p+0)=NV20_VP_INST0_KNOWN; //always 0 *(p+1)=0; *(p+2)=0; *(p+3)=0; if (pRegs->n<2) //it's a nop { //src0, src1 & src2 are missing (set them to v0.xyzw) *(p+1)|=0x1b; *(p+2)|=(NV20_VP_SRC_REG_TYPE_INPUT<>NV20_VP_SRC2_HIGH_SHIFT)<dest.reg) //8=r 0xa=c 0xb=a 0xc=oP(oP0=oPos oP1=oFog oP2=oPts) 0xd=oD 0xe=oT { case 8 : 
*(p+3)|=0x00000ff8|(pRegs->dest.msk<dest.num<dest.msk<dest.num<dest.msk<dest.num?(pRegs->dest.num==1?NV20_VP_INST_DEST_FOG:NV20_VP_INST_DEST_PTS):NV20_VP_INST_DEST_POS)<dest.msk<dest.num?NV20_VP_INST_DEST_COL1:NV20_VP_INST_DEST_COL0)<dest.msk<dest.num<4)?NV20_VP_INST_DEST_TC(pRegs->dest.num):((pRegs->dest.num<6)?pRegs->dest.num+3:pRegs->dest.num-5))<src0.mod*NV20_VP_SRC_REG_NEGATE)|(pRegs->src0.swz<src0.reg) //8=r 9=v 0xa=c 0xb=a { case 8 : src0|=(NV20_VP_SRC_REG_TYPE_TEMP<src0.num<src0.num<src0.num+96)<>NV20_VP_SRC0_HIGH_SHIFT)<src0.idx*NV20_VP_INST_INDEX_CONST; if (pRegs->n==2) { //src1 & src2 are missing (set them to v0.xyzw) *(p+2)|=((0x1b<>NV20_VP_SRC2_HIGH_SHIFT)<src1.mod*NV20_VP_SRC_REG_NEGATE)|(pRegs->src1.swz<src1.reg) //8=r 9=v 0xa=c 0xb=a { case 8 : src1|=(NV20_VP_SRC_REG_TYPE_TEMP<src1.num<src1.num<src1.num+96)<src1.idx*NV20_VP_INST_INDEX_CONST; if (pRegs->n==3) { //src2 is missing (set it to v0.xyzw) *(p+2)|=((0x1b<>NV20_VP_SRC2_HIGH_SHIFT)<src2.mod*NV20_VP_SRC_REG_NEGATE)|(pRegs->src2.swz<src2.reg) //8=r 9=v 0xa=c 0xb=a { case 8 : src2|=(NV20_VP_SRC_REG_TYPE_TEMP<src2.num<src2.num<src2.num+96)<>NV20_VP_SRC2_HIGH_SHIFT)<src2.idx*NV20_VP_INST_INDEX_CONST; return 0; } //converts shaders pseudo-code into xbox gpu micro-code //(not recommended for pixel shader: slow and incomplete) //pseudocode: token stream starting with 0xffff0101 (ps_1_1) or 0xfffe0101 (vs_1_1) and ending with 0x0000ffff //returns &pb_gpu_registers[0] for a pixel shader, &pb_gpu_programnc[0] for a vertex shader //returns NULL (after a debugPrint message) on NULL parameter, unsupported shader version or unsupported/unrecognized token DWORD *pb_pcode2mcode(const DWORD *pseudocode) { DWORD *p; DWORD constant; DWORD size; DWORD *pcode; int i,n; struct s_PseudoRegs sRegs; pcode=(DWORD *)pseudocode; if (pcode==NULL) { debugPrint("pb_pcode2mcode: NULL parameter\n"); return NULL; } //pb_tmp_registers will tell us unused registers. 
//this array is updated by pb_read_pregs() when tmp registers are detected as destination memset(pb_tmp_registers,0,sizeof(pb_tmp_registers)); if (*pcode==0xffff0101) //ps_1_1 { pcode++; //currently supported (not a lot, but manual ps registers setting is possible): //- only 1 stage (1 or 2 instructions to set r0, with or without 1 'tex t0' instruction) //- modifier -?n //- modifier ?n_bias (-0.5f) //- modifier ?n_bx2 (*2.0f) //- modifier 1-|?n| //- def cn, r, g, b, a //- nop //- tex t0 //- mov r0, ?n (r0=?n) //- mul r0, ?n, ?n (r0=?n*?n) //- dp3 r0, ?n, ?n (r0=?n.?n) //- add r0, ?n, ?n (r0=?n+?n) //- sub r0, ?n, ?n (r0=?n-?n) //- mad r0, ?n, ?n, ?n (r0=?n*?n+?n) //- lrp r0, src0, src1, src2 (r0=src0*src1+(1-src0)*src2) //- cnd r0, r0.a, src1, src2 (r0=(r0.a>0.5f)?src1:src2) (if r0.a MSB is used for mux) //- coherent destination mask & swizzle (no swizzle or .rgba, .xyzw, .a, .x, .rgb, .xyz for separate rgb/alpha processing) p=&pb_gpu_registers[0]; //It's recommended to learn initializing registers oneself //in order to avoid resetting most of these -probably useless- default values memset(&pb_gpu_registers[0],0,sizeof(pb_gpu_registers)); p[0] =0xd4301010; //PSAlphaInput for stage 0: a.a=v0.a b.a=1.a-|0.a| p[8] =0x000000c0; //PSAlphaOutput for stage 0: r0.a=a*b p[16]=0xc4200000; //PSRGBInput for stage 0: a.rgb=v0.rgb b.rgb=1.rgb-|0.rgb| p[24]=0x000000c0; //PSRGBOutput for stage 0: r0.rgb=a*b //p[32] //C0's constants //p[40] //C1's constants //p[48] //final combiner C0 constant //p[49] //final combiner C1 constant //p[50] //PSCompareMode (used only for texture mode clipplane) //p[51] //PSTextureModes (1 is project 2D: argb=texture(r/q,s/q) usually q=1.0f) //p[52] //PSDotMapping (0 means [0,255]argb from texture=>[0.0,1.0](r,g,b)) //p[53] //PSInputTextureSource (most logical value is 0x00210000 when texture stages 2 & 3 are used) p[54]=0x11101; //PSCombinerCount ("stages usage count" | "C0 & C1 may be different from stage to stage" | "r0.a MSB used for mux") //These 
default settings do "mov r0,v0" //'final combiner' is an additional invisible (free) stage doing this: //final pixel.rgb = A * B + (1 - A) * C + D //final pixel.alpha = G.b or G.a (.a modifier must be used if you want .a) //Also all values are clamped to 0..1 (negative values become zero) //Inner registers NV20_TCL_PRIMITIVE_3D_RC_FINAL0 and following one //define inputs and modifiers for the 7 parameters A,B,C,D and E,F,G,? (?=0x80, unknown) //Here are a few useful values depending what you want to do: //fog on & specular on : 0x130e0300,0x00001c80 (means pixel.rgb=fog.a * (r0.rgb + v1.rgb) + (1 - fog.a) * fog.rgb & pixel.a=r0.a) //fog on & specular off : 0x130c0300,0x00001c80 (means pixel.rgb=fog.a * r0.rgb + (1 - fog.a) * fog.rgb & pixel.a=r0.a) //fog off & specular on : 0x0000000e,0x00001c80 (means pixel.rgb=r0.rgb + v1.rgb & pixel.a=r0.a) //fog off & specular off : 0x0000000c,0x00001c80 (means D=r0.rgb & G=r0.a, so final pixel.rgb=r0.rgb & pixel.a=r0.a) //These special read-only registers are also available at final combiner stage (maybe also at any stage?): //zero = 0 (0x0 is the numeric code for this register, modifier is bits 7-4, mapped to C4) //fog = fog (0x3, fog.rgb returns the fog color inner register value, mapped to pseudocode C5 -fog.a is fog transparency, coming from fog table, I guess-) //v1r0sum = r0 + v1 (0xe, I've mapped it to pseudocode C6 in pcode2mcode, useful when specular v1 is to be used) //EFprod = E * F (0xf, I've mapped it to pseudocode C7 in pcode2mcode, useful for pixel shader optimization, i.e reduce number of stages) //Codes for normal registers: //C0 => 0x1 //C1 => 0x2 //v0 => 0x4 //v1 => 0x5 //t0 => 0x8 //t1 => 0x9 //t2 => 0xa //t3 => 0xb //r0 => 0xc //r1 => 0xd //Modifiers (Or it to code above): //default 0x00=|0.rgb| 0x10=x.a //0x20=1-|x| 0x40=2*max(0,x)-1("_bx2") 0x60=1-2*max(0,x) 0x80=max(0,x)-0.5f("_bias") 0xa0=0.5f-max(0,x) 0xc0=x 0xf0=-x while (*pcode!=0x0000ffff) { switch(*(pcode++)) { case 0x00000000: //nop case 
0x40000000: //+nop... break; case 0x00000001: //mov r0, ?n (r0=?n) case 0x40000001: //+mov... pb_read_pregs(pcode,&sRegs,2); pcode+=2; if ((sRegs.dest.reg!=8)||(sRegs.dest.num!=0)) { debugPrint("pb_pcode2mcode: Unsupported destination register\n"); return NULL; } if (sRegs.dest.msk&1) p[0]=0x10301010|(pb_preg2psreg(&sRegs.src0)<<24); //PSAlphaInput for stage 0: a.a=?.a b.a=1-|0.a| if ((sRegs.dest.msk&0xe)==0xe) p[16]=0x00200000|(pb_preg2psreg(&sRegs.src0)<<24); //PSRGBInput for stage 0: a.rgb=?.rgb b.rgb=1.rgb-|0.rgb| break; case 0x00000002: //add r0, ?n, ?n (r0=?n+?n) case 0x40000002: //+add... pb_read_pregs(pcode,&sRegs,3); pcode+=3; if ((sRegs.dest.reg!=8)||(sRegs.dest.num!=0)) { debugPrint("pb_pcode2mcode: Unsupported destination register\n"); return NULL; } if (sRegs.dest.msk&1) { p[0]=0x10301030|(pb_preg2psreg(&sRegs.src0)<<24)|(pb_preg2psreg(&sRegs.src1)<<8); //PSAlphaInput for stage 0: a.a=?.a b.a=1.a-|0.a| c.a=?.a d.a=1.a-|0.a| p[8]=0x00000c00; //PSAlphaOutput for stage 0: r0.a=a*b+c*d } if ((sRegs.dest.msk&0xe)==0xe) { p[16]=0x00200020|(pb_preg2psreg(&sRegs.src0)<<24)|(pb_preg2psreg(&sRegs.src1)<<8); //PSRGBInput for stage 0: a.rgb=?.rgb b.rgb=1.rgb-|0.rgb| c.rgb=?.rgb d.rgb=1.rgb-|0.rgb| p[24]=0x00000c00; //PSRGBOutput for stage 0: r0.rgb=a*b+c*d } break; case 0x00000003: //sub r0, ?n, ?n (r0=?n-?n) case 0x40000003: //+sub... 
pb_read_pregs(pcode,&sRegs,3); pcode+=3; if ((sRegs.dest.reg!=8)||(sRegs.dest.num!=0)) { debugPrint("pb_pcode2mcode: Unsupported destination register\n"); return NULL; } if (sRegs.src1.mod<6) sRegs.src1.mod^=1; //inverts src1 sign else { debugPrint("pb_pcode2mcode: sub not supported if src1 has 1-|x| modifier\n"); return NULL; } if (sRegs.dest.msk&1) { p[0]=0x10301030|(pb_preg2psreg(&sRegs.src0)<<24)|(pb_preg2psreg(&sRegs.src1)<<8); //PSAlphaInput for stage 0: a.a=?.a b.a=1.a-|0.a| c.a=?.a d.a=1.a-|0.a| p[8]=0x00000c00; //PSAlphaOutput for stage 0: r0.a=a*b+c*d } if ((sRegs.dest.msk&0xe)==0xe) { p[16]=0x00200020|(pb_preg2psreg(&sRegs.src0)<<24)|(pb_preg2psreg(&sRegs.src1)<<8); //PSRGBInput for stage 0: a.rgb=?.rgb b.rgb=1.rgb-|0.rgb| c.rgb=?.rgb d.rgb=1.rgb-|0.rgb| p[24]=0x00000c00; //PSRGBOutput for stage 0: r0.rgb=a*b+c*d } break; case 0x00000004: //mad r0, ?n, ?n, ?n (r0=?n*?n+?n) case 0x40000004: //+mad... pb_read_pregs(pcode,&sRegs,4); pcode+=4; if ((sRegs.dest.reg!=8)||(sRegs.dest.num!=0)) { debugPrint("pb_pcode2mcode: Unsupported destination register\n"); return NULL; } if (sRegs.dest.msk&1) { p[0]=0x10101030|(pb_preg2psreg(&sRegs.src0)<<24)|(pb_preg2psreg(&sRegs.src1)<<16)|(pb_preg2psreg(&sRegs.src2)<<8); //PSAlphaInput for stage 0: a.a=?.a b.a=?.a c.a=?.a d.a=1-|0.a| p[8]=0x00000c00; //PSAlphaOutput for stage 0: r0.a=a*b+c*d } if ((sRegs.dest.msk&0xe)==0xe) { p[16]=0x00000020|(pb_preg2psreg(&sRegs.src0)<<24)|(pb_preg2psreg(&sRegs.src1)<<16)|(pb_preg2psreg(&sRegs.src2)<<8); //PSRGBInput for stage 0: a.rgb=?.rgb b.rgb=?.rgb c.rgb=?.rgb d.rgb=1-|0.rgb| p[24]=0x00000c00; //PSRGBOutput for stage 0: r0.rgb=a*b+c*d } break; case 0x00000005: //mul r0, ?n, ?n (r0=?n*?n) case 0x40000005: //+mul... 
pb_read_pregs(pcode,&sRegs,3); pcode+=3; if ((sRegs.dest.reg!=8)||(sRegs.dest.num!=0)) { debugPrint("pb_pcode2mcode: Unsupported destination register\n"); return NULL; } if (sRegs.dest.msk&1) p[0]=0x10101010|(pb_preg2psreg(&sRegs.src0)<<24)|(pb_preg2psreg(&sRegs.src1)<<16); //PSAlphaInput for stage 0: a.a=?.a b.a=?.a if ((sRegs.dest.msk&0xe)==0xe) p[16]=0x00000000|(pb_preg2psreg(&sRegs.src0)<<24)|(pb_preg2psreg(&sRegs.src1)<<16); //PSRGBInput for stage 0: a.rgb=?.rgb b.rgb=?.rgb break; case 0x00000008: //dp3 r0, ?n, ?n (r0=?n.?n) case 0x40000008: //+dp3... pb_read_pregs(pcode,&sRegs,3); pcode+=3; if ((sRegs.dest.reg!=8)||(sRegs.dest.num!=0)) { debugPrint("pb_pcode2mcode: Unsupported destination register\n"); return NULL; } if ((sRegs.dest.msk&0xf)==0xe) //dp3 r0.xyz, ... { p[16]=0x00000000|(pb_preg2psreg(&sRegs.src0)<<24)|(pb_preg2psreg(&sRegs.src1)<<16); //PSRGBInput for stage 0: a.rgb=?.rgb b.rgb=?.rgb p[24]=0x000020c0; //PSRGBOutput for stage 0: r0.rgb=a.b (dot product) } if ((sRegs.dest.msk&0xf)==0xf) //dp3 r0, ... { p[0]=0x10101010; p[8]=0x00000000; //PSAlphaOutput for stage 0: discarded (we will use the b->a propagate bit on rgb side) p[16]=0x00000000|(pb_preg2psreg(&sRegs.src0)<<24)|(pb_preg2psreg(&sRegs.src1)<<16); //PSRGBInput for stage 0: a.rgb=?.rgb b.rgb=?.rgb p[24]=0x000820c0; //PSRGBOutput for stage 0: r0.rgb=a.b (dot product) (and r0.b propagates to r0.a) } break; case 0x00000012: //lrp r0, src0, src1, src2 (r0=src0*src1+(1-src0)*src2) case 0x40000012: //+lrp... 
pb_read_pregs(pcode,&sRegs,4); pcode+=4; if ((sRegs.dest.reg!=8)||(sRegs.dest.num!=0)) { debugPrint("pb_pcode2mcode: Unsupported destination register\n"); return NULL; } if (sRegs.src0.mod) { debugPrint("pb_pcode2mcode(lrp): Unsupported source 0 modifier\n"); return NULL; } if (sRegs.dest.msk&1) { p[0]=0x10101030|(pb_preg2psreg(&sRegs.src0)<<24)|(pb_preg2psreg(&sRegs.src1)<<16)|(pb_preg2psreg(&sRegs.src2)<<8)|(pb_preg2psreg(&sRegs.src0)&0xf); //PSAlphaInput for stage 0: a.a=src0.a b.a=src1.a c.a=src2.a d.a=1-|src0.a| p[8]=0x00000c00; //PSAlphaOutput for stage 0: r0.a=a*b+c*d } if ((sRegs.dest.msk&0xe)==0xe) { p[16]=0x00000020|(pb_preg2psreg(&sRegs.src0)<<24)|(pb_preg2psreg(&sRegs.src1)<<16)|(pb_preg2psreg(&sRegs.src2)<<8)|(pb_preg2psreg(&sRegs.src0)&0xf); //PSRGBInput for stage 0: a.rgb=src0.rgb b.rgb=src1.rgb c.rgb=src2.rgb d.rgb=1-|src0.rgb| p[24]=0x00000c00; //PSRGBOutput for stage 0: r0.rgb=a*b+c*d } break; case 0x00000042: //tex t0 case 0x40000042: //+tex... //We assume tn has been replaced with texture color //because of a previous correct texture stage initialization pb_read_pregs(pcode,&sRegs,1); pcode+=1; if (sRegs.dest.num) { debugPrint("pb_pcode2mcode: Only 'tex t0' is supported\n"); return NULL; } p[51]=0x00000001; //PSTextureModes (1<<(stage*5) is project 2D: argb=texture(r/q,s/q) usually q=1.0f) break; case 0x00000050: //cnd r0, r0.a, src1, src2 (r0=(r0.a>0.5f)?src1:src2) (if r0.a MSB used for mux) case 0x40000050: //+cnd... 
pb_read_pregs(pcode,&sRegs,4); pcode+=4; if ((sRegs.dest.reg!=8)||(sRegs.dest.num!=0)) { debugPrint("pb_pcode2mcode: Unsupported destination register\n"); return NULL; } if (sRegs.dest.msk&1) { p[0]=0x10301030|(pb_preg2psreg(&sRegs.src2)<<24)|(pb_preg2psreg(&sRegs.src1)<<8); //PSAlphaInput for stage 0: a.a=src2.a b.a=1-|0.a| c.a=src1.a d.a=1-|0.a| p[8]=0x00004c00; //PSAlphaOutput for stage 0: r0.a=(r0.a MSB not set)?(a*b):(c*d)=(r0.a<=0.5f)?src2.a:src1.a } if ((sRegs.dest.msk&0xe)==0xe) { p[16]=0x00200020|(pb_preg2psreg(&sRegs.src2)<<24)|(pb_preg2psreg(&sRegs.src1)<<8); //PSRGBInput for stage 0: a.rgb=src2.rgb b.rgb=1.rgb-|0.rgb| c.rgb=src1.rgb d.rgb=1.rgb-|0.rgb| p[24]=0x00004c00; //PSRGBOutput for stage 0: r0.rgb=(r0.a MSB not set)?(a*b):(c*d)=(r0.a<=0.5f)?src2.rgb:src1.rgb } break; case 0x00000051: //def cn, r, g, b, a pb_read_pregs(pcode,&sRegs,1); pcode+=1; //converts 4 floats (r,g,b,a) into 1 dword 0xaarrggbb ([0,1.0f]=>[0,0xff]) //NOTE(review): no clamping is done here, so r,g,b,a are presumably expected to already lie in [0,1.0f] -TODO confirm constant=0; constant|=((DWORD)(255.0f*(*((float *)(pcode+3)))))<<24; constant|=((DWORD)(255.0f*(*((float *)(pcode+0)))))<<16; constant|=((DWORD)(255.0f*(*((float *)(pcode+1)))))<<8; constant|=((DWORD)(255.0f*(*((float *)(pcode+2)))))<<0; //distribute c0=>c0 stage 0, c1=>c1 stage 0, c2=>c0 stage 1, etc... //NOTE(review): dest.num is not range checked in this function before being used as an index -TODO confirm pb_read_pregs() bounds it p[32+8*(sRegs.dest.num&1)+(sRegs.dest.num>>1)]=constant; pcode+=4; break; default: debugPrint("pb_pcode2mcode: Unrecognized ps token #%08x\n",*(pcode-1)); return NULL; } } return &pb_gpu_registers[0]; } if (*pcode!=0xfffe0101) //vs_1_1 { debugPrint("pb_pcode2mcode: Shader version not supported\n"); return NULL; } //it's a vertex shader! 
(vs_1_1 should be entirely supported by code below -report any issue-) pcode++; pb_exp_constflag=0; //in order to not set taylor series exp macro constants up more than once pb_log_constflag=0; //in order to not set taylor series log macro constants up more than once n=0; //instructions counter (can't exceed 136 on xbox) p=&pb_gpu_programnc[1]; //push buffer compatible sequence setting up program and constants pb_push(p++,NV20_TCL_PRIMITIVE_3D_VP_PROGRAM_START_ID,1); *(p++)=0; //set run address of shader pb_push(p++,NV20_TCL_PRIMITIVE_3D_SHADER_TYPE,2); *(p++)=SHADER_TYPE_EXTERNAL; *(p++)=SHADER_SUBTYPE_REGULAR; //set shader vertex type (external shader, regular: not allowed to write into constants -faster-) pb_push(p++,NV20_TCL_PRIMITIVE_3D_VP_UPLOAD_FROM_ID,1); *(p++)=0; //set cursor in order to load data into program area while(*pcode!=0x0000ffff) { if (n==136) { debugPrint("pb_pcode2mcode: Too many instructions: max=136 (including expanded macros)\n"); return NULL; } switch(*(pcode++)) { //standard pseudo-code: case 0x00000000: //nop case 0x40000000: //+nop pb_push(p++,NV20_TCL_PRIMITIVE_3D_VP_UPLOAD_INST0,4); n++; pb_read_pregs(pcode,&sRegs,0); pcode+=0; if (pb_set_mcode(p,&sRegs)) { debugPrint("pb_pcode2mcode: Unrecognized token\n"); return NULL; } *(p+1)|=NV20_VP_INST_OPCODE_NOP<>(NV20_VP_INST_VTEMP_WRITEMASK_SHIFT-NV20_VP_INST_STEMP_WRITEMASK_SHIFT); *(p+3)&=~NV20_VP_INST_VTEMP_WRITEMASK_MASK; } p+=4; break; case 0x00000007: //rsq dest,src0 (scalar 1/sqrt(x) function) case 0x40000007: //+rsq pb_push(p++,NV20_TCL_PRIMITIVE_3D_VP_UPLOAD_INST0,4); n++; pb_read_pregs(pcode,&sRegs,2); pcode+=2; //src2 is used instead of src0 in scalar functions sRegs.n=4; sRegs.src2=sRegs.src0; sRegs.src0.reg=9; //v0.xyzw for unused src sRegs.src0.num=0; sRegs.src0.mod=0; sRegs.src0.swz=0x1b; sRegs.src0.idx=0; sRegs.src1.reg=9; //v0.xyzw for unused src sRegs.src1.num=0; sRegs.src1.mod=0; sRegs.src1.swz=0x1b; sRegs.src1.idx=0; if (pb_set_mcode(p,&sRegs)) { 
debugPrint("pb_pcode2mcode: Unrecognized token\n"); return NULL; } *(p+1)|=NV20_VP_INST_OPCODE_RSQ<>(NV20_VP_INST_VTEMP_WRITEMASK_SHIFT-NV20_VP_INST_STEMP_WRITEMASK_SHIFT); *(p+3)&=~NV20_VP_INST_VTEMP_WRITEMASK_MASK; } p+=4; break; case 0x00000008: //dp3 dest,src0,src1 case 0x40000008: //+dp3 pb_push(p++,NV20_TCL_PRIMITIVE_3D_VP_UPLOAD_INST0,4); n++; pb_read_pregs(pcode,&sRegs,3); pcode+=3; if (pb_set_mcode(p,&sRegs)) { debugPrint("pb_pcode2mcode: Unrecognized token\n"); return NULL; } *(p+1)|=NV20_VP_INST_OPCODE_DP3<=src1) case 0x4000000d: //+sge pb_push(p++,NV20_TCL_PRIMITIVE_3D_VP_UPLOAD_INST0,4); n++; pb_read_pregs(pcode,&sRegs,3); pcode+=3; if (pb_set_mcode(p,&sRegs)) { debugPrint("pb_pcode2mcode: Unrecognized token\n"); return NULL; } *(p+1)|=NV20_VP_INST_OPCODE_SGE<>(NV20_VP_INST_VTEMP_WRITEMASK_SHIFT-NV20_VP_INST_STEMP_WRITEMASK_SHIFT); *(p+3)&=~NV20_VP_INST_VTEMP_WRITEMASK_MASK; p+=4; //mov ri.w, C-1.w pb_push(p++,NV20_TCL_PRIMITIVE_3D_VP_UPLOAD_INST0,4); n++; if (n==136) { debugPrint("pb_pcode2mcode(exp): Too many instructions: max=136 (including expanded macros)\n"); return NULL; } sRegs.n=2; sRegs.dest.msk=1; //.w sRegs.src0.reg=0xa; //c sRegs.src0.num=-1; sRegs.src0.swz=0xff; //.wwww sRegs.src0.mod=0; sRegs.src0.idx=0; if (pb_set_mcode(p,&sRegs)) { debugPrint("pb_pcode2mcode(exp): Unrecognized token\n"); return NULL; } *(p+1)|=NV20_VP_INST_OPCODE_MOV<>(NV20_VP_INST_VTEMP_WRITEMASK_SHIFT-NV20_VP_INST_STEMP_WRITEMASK_SHIFT); *(p+3)&=~NV20_VP_INST_VTEMP_WRITEMASK_MASK; p+=4; //mul dest, ri.w, ri.x pb_push(p++,NV20_TCL_PRIMITIVE_3D_VP_UPLOAD_INST0,4); n++; if (n==136) { debugPrint("pb_pcode2mcode(exp): Too many instructions: max=136 (including expanded macros)\n"); return NULL; } pb_read_pregs(pcode,&sRegs,2); pcode+=2; //read dest again and preserve it sRegs.n=3; sRegs.src0.reg=8; //r sRegs.src0.num=i; sRegs.src0.swz=0xff; //.wwww sRegs.src0.mod=0; sRegs.src0.idx=0; sRegs.src1.reg=8; //r sRegs.src1.num=i; sRegs.src1.swz=0; //.xxxx sRegs.src1.mod=0; 
sRegs.src1.idx=0; if (pb_set_mcode(p,&sRegs)) { debugPrint("pb_pcode2mcode(exp): Unrecognized token\n"); return NULL; } *(p+1)|=NV20_VP_INST_OPCODE_MUL<>(NV20_VP_INST_VTEMP_WRITEMASK_SHIFT-NV20_VP_INST_STEMP_WRITEMASK_SHIFT); *(p+3)&=~NV20_VP_INST_VTEMP_WRITEMASK_MASK; p+=4; //sub ri.x, ri.x, C-5.x (x=y-1) pb_push(p++,NV20_TCL_PRIMITIVE_3D_VP_UPLOAD_INST0,4); n++; if (n==136) { debugPrint("pb_pcode2mcode(log): Too many instructions: max=136 (including expanded macros)\n"); return NULL; } sRegs.n=3; sRegs.dest.msk=8; //.x sRegs.src0.reg=8; sRegs.src0.num=i; sRegs.src0.swz=0; //.xxxx sRegs.src0.mod=0; sRegs.src0.idx=0; //src2 is used instead of src1 for add sRegs.n=4; sRegs.src2.reg=0xa; //c sRegs.src2.num=-5; sRegs.src2.swz=0; //.xxxx sRegs.src2.mod=1; //- sRegs.src2.idx=0; sRegs.src1.reg=9; //v0.xyzw for unused src sRegs.src1.num=0; sRegs.src1.mod=0; sRegs.src1.swz=0x1b; sRegs.src1.idx=0; if (pb_set_mcode(p,&sRegs)) { debugPrint("pb_pcode2mcode(log): Unrecognized token\n"); return NULL; } *(p+1)|=NV20_VP_INST_OPCODE_ADD<>(NV20_VP_INST_VTEMP_WRITEMASK_SHIFT-NV20_VP_INST_STEMP_WRITEMASK_SHIFT); *(p+3)&=~NV20_VP_INST_VTEMP_WRITEMASK_MASK; } p+=4; break; case 0x00000011: //dst dest,src0,src1 (calculates distance) case 0x40000011: //+dst pb_push(p++,NV20_TCL_PRIMITIVE_3D_VP_UPLOAD_INST0,4); n++; pb_read_pregs(pcode,&sRegs,3); pcode+=3; if (pb_set_mcode(p,&sRegs)) { debugPrint("pb_pcode2mcode: Unrecognized token\n"); return NULL; } *(p+1)|=NV20_VP_INST_OPCODE_DST<>(NV20_VP_INST_VTEMP_WRITEMASK_SHIFT-NV20_VP_INST_STEMP_WRITEMASK_SHIFT); *(p+3)&=~NV20_VP_INST_VTEMP_WRITEMASK_MASK; } p+=4; break; case 0x00000014: //m4x4 dest, src0, ?i (matrix multiply) case 0x40000014: //+m4x4 //dp4 dest.x, src0, ?i pb_push(p++,NV20_TCL_PRIMITIVE_3D_VP_UPLOAD_INST0,4); n++; if (n==136) { debugPrint("pb_pcode2mcode(m4x4): Too many instructions: max=136 (including expanded macros)\n"); return NULL; } pb_read_pregs(pcode,&sRegs,3); pcode+=3; if ( (sRegs.src0.swz!=0x1b)|| 
(sRegs.src1.swz!=0x1b)|| (sRegs.src0.mod)|| (sRegs.src1.mod) ) { debugPrint("pb_pcode2mcode: Modifiers or swizles not allowed in matrices multiplication macros\n"); return NULL; } sRegs.dest.msk=8; //.x if (pb_set_mcode(p,&sRegs)) { debugPrint("pb_pcode2mcode: Unrecognized token\n"); return NULL; } *(p+1)|=NV20_VP_INST_OPCODE_DP4<>(NV20_VP_INST_VTEMP_WRITEMASK_SHIFT-NV20_VP_INST_STEMP_WRITEMASK_SHIFT); *(p+3)&=~NV20_VP_INST_VTEMP_WRITEMASK_MASK; } p+=4; break; case 0x0000004f: //logp dest,src0 (scalar partial precision logarithm function) case 0x4000004f: //+logp pb_push(p++,NV20_TCL_PRIMITIVE_3D_VP_UPLOAD_INST0,4); n++; pb_read_pregs(pcode,&sRegs,2); pcode+=2; //src2 is used instead of src0 in scalar functions sRegs.n=4; sRegs.src2=sRegs.src0; sRegs.src0.reg=9; //v0.xyzw for unused src sRegs.src0.num=0; sRegs.src0.mod=0; sRegs.src0.swz=0x1b; sRegs.src0.idx=0; sRegs.src1.reg=9; //v0.xyzw for unused src sRegs.src1.num=0; sRegs.src1.mod=0; sRegs.src1.swz=0x1b; sRegs.src1.idx=0; if (pb_set_mcode(p,&sRegs)) { debugPrint("pb_pcode2mcode: Unrecognized token\n"); return NULL; } *(p+1)|=NV20_VP_INST_OPCODE_LOG<>(NV20_VP_INST_VTEMP_WRITEMASK_SHIFT-NV20_VP_INST_STEMP_WRITEMASK_SHIFT); *(p+3)&=~NV20_VP_INST_VTEMP_WRITEMASK_MASK; } p+=4; break; case 0x00000051: //def cn x, y, z, w or def cn r, g, b, a pb_push(p++,NV20_TCL_PRIMITIVE_3D_VP_UPLOAD_CONST_ID,1); *(p++)=((*(pcode++))&0xff)+96; //set cursor in order to load data into Cn pb_push(p++,NV20_TCL_PRIMITIVE_3D_VP_UPLOAD_CONST_X,4); *(p++)=*(pcode++); *(p++)=*(pcode++); *(p++)=*(pcode++); *(p++)=*(pcode++); break; //non standard pseudo-code: nvidia-specific (vsa.exe won't accept these assembler instructions) //workaround : use dp4 and rcp, then, in pseudo code, replace 9 with 0x100 and 6 with 0x101 case 0x00000100: //dph dest,src0,src1 (homogeneous dot product: same as dp4 but src0.w is seen as 1.0f) case 0x40000100: //+dph pb_push(p++,NV20_TCL_PRIMITIVE_3D_VP_UPLOAD_INST0,4); n++; pb_read_pregs(pcode,&sRegs,3); pcode+=3; if 
(pb_set_mcode(p,&sRegs)) { debugPrint("pb_pcode2mcode: Unrecognized token\n"); return NULL; } *(p+1)|=NV20_VP_INST_OPCODE_DPH<>(NV20_VP_INST_VTEMP_WRITEMASK_SHIFT-NV20_VP_INST_STEMP_WRITEMASK_SHIFT); *(p+3)&=~NV20_VP_INST_VTEMP_WRITEMASK_MASK; } p+=4; break; default: debugPrint("pb_pcode2mcode: Unrecognized vs token #%08x\n",*(pcode-1)); return NULL; } } *(p-1)|=NV20_VP_INST_LAST_INST; //bit 0 of 4th dword means end of shader //NOTE(review): assumes the last dword written through p belongs to an instruction (looks untrue if the shader ends with 'def' or holds no instruction) -TODO confirm pb_gpu_programnc[0]=p-&pb_gpu_programnc[1]; //size (number of dwords written after pb_gpu_programnc[0]) pb_gpu_programnc[0]|=0x43210000; //personal vs marker return &pb_gpu_programnc[0]; }