diff --git a/stb_voxel_render.h b/stb_voxel_render.h index 9654713..0b5728a 100644 --- a/stb_voxel_render.h +++ b/stb_voxel_render.h @@ -341,16 +341,22 @@ enum enum { - STBVOX_TEXLERP4_0_8, - STBVOX_TEXLERP4_1_8, - STBVOX_TEXLERP4_2_8, - STBVOX_TEXLERP4_3_8, - STBVOX_TEXLERP4_4_8, - STBVOX_TEXLERP4_5_8, - STBVOX_TEXLERP4_6_8, - STBVOX_TEXLERP4_7_8, + STBVOX_TEXLERP_BASE_0, // 0.0 + STBVOX_TEXLERP_BASE_2_7, // 2/7 + STBVOX_TEXLERP_BASE_5_7, // 5/7 + STBVOX_TEXLERP_BASE_1 // 1.0 +}; - STBVOX_TEXLERP4_use_vert=15, +enum +{ + STBVOX_TEXLERP3_0_8, + STBVOX_TEXLERP3_1_8, + STBVOX_TEXLERP3_2_8, + STBVOX_TEXLERP3_3_8, + STBVOX_TEXLERP3_4_8, + STBVOX_TEXLERP3_5_8, + STBVOX_TEXLERP3_6_8, + STBVOX_TEXLERP3_7_8, }; enum @@ -365,6 +371,7 @@ enum STBVOX_FACE__count, }; +#define STBVOX_FACE_NONE 7 #define STBVOX_BLOCKTYPE_EMPTY 0 @@ -379,6 +386,7 @@ enum #define STBVOX_MAKE_MATROT(block, overlay, tex2, color) ((block) + (overlay)*4 + (tex2)*16 + (color)*64) #define STBVOX_MAKE_TEX2_REPLACE(tex2, tex2_replace_face) ((tex2) + ((tex2_replace_face) & 3)*64) #define STBVOX_MAKE_TEXLERP(ns2, ew2, ud2, vert) ((ew2) + (ns2)*4 + (ud2)*16 + (vert)*64) +#define STBVOX_MAKE_TEXLERP_SIMPLE(baselerp,vert,face) ((vert)*32 + (face)*4 + (baselerp)) #define STBVOX_MAKE_TEXLERP1(vert,e2,n2,w2,s2,u4,d2) STBVOX_MAKE_TEXLERP(s2, w2, d2, vert) #define STBVOX_MAKE_TEXLERP2(vert,e2,n2,w2,s2,u4,d2) ((u2)*16 + (n2)*4 + (s2)) #define STBVOX_MAKE_FACE_MASK(e,n,w,s,u,d) ((e)+(n)*2+(w)*4+(s)*8+(u)*16+(d)*32) @@ -419,6 +427,7 @@ struct stbvox_input_description unsigned char *vheight; // STBVOX_MAKE_VHEIGHT -- sw:2, se:2, nw:2, ne:2, doesn't rotate unsigned char *texlerp; // STBVOX_MAKE_TEXLERP -- vert:2, ud:2, ew:2, ns:2 unsigned char *texlerp2; // STBVOX_MAKE_TEXLERP2 (and use STBVOX_MAKE_TEXLERP1 for 'texlerp' -- e:2, n:2, u:3, unused:1 + unsigned char *texlerp_simple; // STBVOX_MAKE_TEXLERP_SIMPLE -- vert_lerp:3 (bits 5-7), face_to_use_vert_lerp:3 (bits 2-4), baselerp:2 (bits 0-1) unsigned short *texlerp_vert3; // e:3,n:3,w:3,s:3,u:3 (down 
comes from 'texlerp') unsigned short *texlerp_face3; // e:3,n:3,w:3,s:3,u:2,d:2 unsigned char *lighting; // lighting:8 @@ -944,7 +953,7 @@ static char *stbvox_fragment_program = #endif " if (texblend_mode)\n" - " albedo = tex1.xyz * rlerp(tex2.a, 2.0*tex2.xyz, vec3(1.0,1.0,1.0));\n" + " albedo = tex1.xyz * rlerp(tex2.a, vec3(1.0,1.0,1.0), 2.0*tex2.xyz);\n" " else {\n" #ifdef STBVOX_CONFIG_PREMULTIPLIED_ALPHA " albedo = (1.0-tex2.a)*tex1.xyz + tex2.xyz;\n" @@ -1314,10 +1323,10 @@ stbvox_mesh_face stbvox_compute_mesh_face_value(stbvox_mesh_maker *mm, stbvox_ro static unsigned char stbvox_face_lerp[6] = { 0,2,0,2,4,4 }; static unsigned char stbvox_vert3_lerp[6] = { 0,3,6,9,12,12 }; -static unsigned char stbvox_vert_lerp_for_face_lerp[6] = { 0, 4, 7 }; +static unsigned char stbvox_vert_lerp_for_face_lerp[4] = { 0, 4, 7, 7 }; static unsigned char stbvox_face3_lerp[6] = { 0,3,6,9,12,14 }; static unsigned char stbvox_face3_updown[8] = { 0,2,4,7,0,2,4,7 }; - +static unsigned char stbvox_vert_lerp_for_simple[4] = { 0,2,5,7 }; // vertex offsets for face vertices static unsigned char stbvox_vertex_vector[6][4][3]; static stbvox_mesh_vertex stbvox_vmesh_delta_normal[6][4]; @@ -1376,6 +1385,22 @@ void stbvox_make_mesh_for_face(stbvox_mesh_maker *mm, stbvox_rotate rot, int fac if (face >= 4) val = stbvox_face3_updown[val]; p1[0] = p1[1] = p1[2] = p1[3] = stbvox_vertex_encode(0,0,0,0,val); + } else if (mm->input.texlerp_simple) { + unsigned char val = mm->input.texlerp_simple[v_off]; + unsigned char lerp_face = (val >> 2) & 7; + if (lerp_face == face) { + p1[0] = (mm->input.texlerp_simple[v_off + mm->cube_vertex_offset[face][0]] >> 5) & 7; + p1[1] = (mm->input.texlerp_simple[v_off + mm->cube_vertex_offset[face][1]] >> 5) & 7; + p1[2] = (mm->input.texlerp_simple[v_off + mm->cube_vertex_offset[face][2]] >> 5) & 7; + p1[3] = (mm->input.texlerp_simple[v_off + mm->cube_vertex_offset[face][3]] >> 5) & 7; + p1[0] = stbvox_vertex_encode(0,0,0,0,p1[0]); + p1[1] = 
stbvox_vertex_encode(0,0,0,0,p1[1]); + p1[2] = stbvox_vertex_encode(0,0,0,0,p1[2]); + p1[3] = stbvox_vertex_encode(0,0,0,0,p1[3]); + } else { + unsigned char base = stbvox_vert_lerp_for_simple[val&3]; + p1[0] = p1[1] = p1[2] = p1[3] = stbvox_vertex_encode(0,0,0,0,base); + } } else if (mm->input.texlerp) { unsigned char facelerp = (mm->input.texlerp[v_off] >> stbvox_face_lerp[face]) & 3; if (facelerp == STBVOX_TEXLERP_use_vert) { @@ -1407,6 +1432,7 @@ void stbvox_make_mesh_for_face(stbvox_mesh_maker *mm, stbvox_rotate rot, int fac stbvox_get_quad_vertex_pointer(mm, mesh, mv, face_data); if (mm->input.lighting) { + // @TODO: lighting at block centers, but not gathered, instead constant-per-face if (mm->input.lighting_at_vertices) { int i; for (i=0; i < 4; ++i) { @@ -2128,7 +2154,7 @@ int stbvox_make_mesh(stbvox_mesh_maker *mm) int x,y; stbvox_bring_up_to_date(mm); mm->full = 0; - if (mm->cur_x || mm->cur_y || mm->cur_z) { + if (mm->cur_x > mm->x0 || mm->cur_y > mm->y0 || mm->cur_z > mm->z0) { stbvox_make_mesh_for_column(mm, mm->cur_x, mm->cur_y, mm->cur_z); if (mm->full) return 0; @@ -2139,8 +2165,9 @@ int stbvox_make_mesh(stbvox_mesh_maker *mm) return 0; ++mm->cur_y; } + ++mm->cur_x; } - for (x=mm->x0; x < mm->x1; ++x) { + for (x=mm->cur_x; x < mm->x1; ++x) { for (y=mm->y0; y < mm->y1; ++y) { stbvox_make_mesh_for_column(mm, x, y, mm->z0); if (mm->full) { @@ -2805,8 +2832,6 @@ static stbvox_face_up_normal_012[4][4][4] = { STBVF_sw_u, STBVF_sw_u, STBVF_sw_u, STBVF_u , }, } }; -// 013[3][3][1] -// 023[3][1][1] static stbvox_face_up_normal_013[4][4][4] = { diff --git a/tests/caveview/cave_main.c b/tests/caveview/cave_main.c index 6b3a047..4670123 100644 --- a/tests/caveview/cave_main.c +++ b/tests/caveview/cave_main.c @@ -341,10 +341,12 @@ int screen_x,screen_y; float carried_dt = 0; #define TICKRATE 60 +float tex2_alpha = 1.0; int raw_level_time; float global_timer; +int global_hack; int loopmode(float dt, int real, int in_client) { @@ -360,6 +362,11 @@ int 
loopmode(float dt, int real, int in_client) carried_dt += dt; while (carried_dt > 1.0/TICKRATE) { + if (global_hack) { + tex2_alpha += global_hack / 60.0f; + if (tex2_alpha < 0) tex2_alpha = 0; + if (tex2_alpha > 1) tex2_alpha = 1; + } //update_input(); // if the player is dead, stop the sim carried_dt -= 1.0/TICKRATE; @@ -432,6 +439,8 @@ void process_event(SDL_Event *e) if (s == SDL_SCANCODE_LCTRL) active_control_set(5); if (s == SDL_SCANCODE_S) active_control_set(6); if (s == SDL_SCANCODE_D) active_control_set(7); + if (k == '1') global_hack = !global_hack; + if (k == '2') global_hack = -1; #if 0 if (game_mode == GAME_editor) { @@ -568,7 +577,7 @@ int SDL_main(int argc, char **argv) } #endif - SDL_GL_SetSwapInterval(0); // only when profiling + SDL_GL_SetSwapInterval(1); render_init(); mesh_init(); diff --git a/tests/caveview/cave_mesher.c b/tests/caveview/cave_mesher.c index 7731c70..a22215b 100644 --- a/tests/caveview/cave_mesher.c +++ b/tests/caveview/cave_mesher.c @@ -32,11 +32,11 @@ extern void ods(char *fmt, ...); -#define FANCY_LEAVES // nearly 2x the triangles when enabled (if underground is filled) +//#define FANCY_LEAVES // nearly 2x the triangles when enabled (if underground is filled) #define FAST_CHUNK #define IN_PLACE -#define SKIP_TERRAIN 0 // use to avoid building underground stuff +#define SKIP_TERRAIN 48 // use to avoid building underground stuff // allows you to see what perf would be like if underground was efficiently culled, // or if you were making a game without underground @@ -594,6 +594,7 @@ void make_map_segment_for_superchunk_preconvert(int chunk_x, int chunk_y, int se } } +// build 1 mesh covering 2x2 chunks void build_chunk(int chunk_x, int chunk_y, fast_chunk *fc_table[4][4], raw_mesh *rm) { int a,b,z; @@ -603,6 +604,10 @@ void build_chunk(int chunk_x, int chunk_y, fast_chunk *fc_table[4][4], raw_mesh unsigned char vheight[34][34][18]; #endif + #ifndef STBVOX_CONFIG_DISABLE_TEX2 + unsigned char tex2_choice[34][34][18]; + #endif + 
assert((chunk_x & 1) == 0); assert((chunk_y & 1) == 0); @@ -618,7 +623,6 @@ void build_chunk(int chunk_x, int chunk_y, fast_chunk *fc_table[4][4], raw_mesh map->block_color_face = minecraft_color_for_blocktype; map->block_geometry = minecraft_geom_for_blocktype; - // we're going to build 4 meshes in parallel, each covering 2x2 chunks stbvox_reset_buffers(&rm->mm); stbvox_set_buffer(&rm->mm, 0, 0, rm->build_buffer, BUILD_BUFFER_SIZE); stbvox_set_buffer(&rm->mm, 0, 1, rm->face_buffer , FACE_BUFFER_SIZE); @@ -636,6 +640,25 @@ void build_chunk(int chunk_x, int chunk_y, fast_chunk *fc_table[4][4], raw_mesh } } + #ifndef STBVOX_CONFIG_DISABLE_TEX2 + for (a=0; a < 34; ++a) { + for (b=0; b < 34; ++b) { + int px = chunk_x*16 + a - 1; + int py = chunk_y*16 + b - 1; + float dist = (float) sqrt(px*px + py*py); + float s1 = (float) sin(dist / 16), s2, s3; + dist = (float) sqrt((px-80)*(px-80) + (py-50)*(py-50)); + s2 = (float) sin(dist / 11); + for (z=0; z < 18; ++z) { + s3 = (float) sin(z * 3.141592 / 8); + + s3 = s1*s2*s3; + tex2_choice[a][b][z] = 63 & (int) stb_linear_remap(s3,-1,1, -20,83); + } + } + } + #endif + for (z=256-16; z >= SKIP_TERRAIN; z -= 16) { int z0 = z; @@ -646,6 +669,9 @@ void build_chunk(int chunk_x, int chunk_y, fast_chunk *fc_table[4][4], raw_mesh map->blocktype = &rm->sv_blocktype[1][1][1-z]; // specify location of 0,0,0 so that accessing z0..z1 gets right data map->lighting = &rm->sv_lighting[1][1][1-z]; + #ifndef STBVOX_CONFIG_DISABLE_TEX2 + map->tex2 = &tex2_choice[1][1][1-z]; + #endif #ifdef VHEIGHT_TEST // hacky test of vheight @@ -844,3 +870,58 @@ void mesh_init(void) remap_in_place(54, 9); remap_in_place(146, 10); } + +// Timing stats while optimizing the single-threaded builder + +// 32..-32, 32..-32, SKIP_TERRAIN=0, !FANCY_LEAVES on 'mcrealm' data set + +// 6.27s - reblocked to do 16 z at a time instead of 256 (still using 66x66x258), 4 meshes in parallel +// 5.96s - reblocked to use FAST_CHUNK (no intermediate data structure) +// 5.45s - 
unknown change, or previous measurement was wrong + +// 6.12s - use preconverted data, not in-place +// 5.91s - use preconverted, in-place +// 5.34s - preconvert, in-place, avoid dependency chain (suggested by ryg) +// 5.34s - preconvert, in-place, avoid dependency chain, use bit-table instead of byte-table +// 5.50s - preconvert, in-place, branchless + +// 6.42s - non-preconvert, avoid dependency chain (not an error) +// 5.40s - non-preconvert, w/dependency chain (same as earlier) + +// 5.50s - non-FAST_CHUNK, reblocked outer loop for better cache reuse +// 4.73s - FAST_CHUNK non-preconvert, reblocked outer loop +// 4.25s - preconvert, in-place, reblocked outer loop +// 4.18s - preconvert, in-place, unrolled again +// 4.10s - 34x34 1 mesh instead of 66x66 and 4 meshes (will make it easier to do multiple threads) + +// 4.83s - building bitmasks but not using them (2 bits per block, one if empty, one if solid) + +// 5.16s - using empty bitmasks to early out +// 5.01s - using solid & empty bitmasks to early out - "foo" +// 4.64s - empty bitmask only, test 8 at a time, then test geom +// 4.72s - empty bitmask only, 8 at a time, then test bits +// 4.46s - split bitmask building into three loops (each byte is separate) +// 4.42s - further optimize computing bitmask + +// 4.58s - using solid & empty bitmasks to early out, same as "foo" but faster bitmask building +// 4.12s - using solid & empty bitmasks to efficiently test neighbors +// 4.04s - using 16-bit fetches (not endian-independent) +// - note this is first place that beats previous best '4.10s - 34x34 1 mesh' + +// 4.30s - current time with bitmasks disabled again (note was 4.10s earlier) +// 3.95s - bitmasks enabled again, no other changes +// 4.00s - current time with bitmasks disabled again, no other changes -- wide variation that is time dependent? 
+// (note that most of the numbers listed here are median of 3 values already) +// 3.98s - bitmasks enabled + +// Bitmasks removed from the code as not worth the complexity increase + + + +// Raw data for Q&A: +// +// 26% parsing & loading minecraft files (4/5ths of which is zlib decode) +// 39% building mesh from stb input format +// 18% converting from minecraft blocks to stb blocks +// 9% reordering from minecraft axis order to stb axis order +// 7% uploading vertex buffer to OpenGL diff --git a/tests/caveview/cave_render.c b/tests/caveview/cave_render.c index 7419f26..7ac96ec 100644 --- a/tests/caveview/cave_render.c +++ b/tests/caveview/cave_render.c @@ -364,6 +364,42 @@ int num_meshes_started; // stats int request_chunk(int chunk_x, int chunk_y); void update_meshes_from_render_thread(void); +unsigned char tex2_data[64][4]; + +void init_tex2_gradient(void) +{ + int i; + for (i=0; i < 16; ++i) { + tex2_data[i+ 0][0] = 64 + 12*i; + tex2_data[i+ 0][1] = 32; + tex2_data[i+ 0][2] = 64; + + tex2_data[i+16][0] = 255; + tex2_data[i+16][1] = 32 + 8*i; + tex2_data[i+16][2] = 64; + + tex2_data[i+32][0] = 255; + tex2_data[i+32][1] = 160; + tex2_data[i+32][2] = 64 + 12*i; + + tex2_data[i+48][0] = 255; + tex2_data[i+48][1] = 160 + 6*i; + tex2_data[i+48][2] = 255; + } +} + +void set_tex2_alpha(float fa) +{ + int i; + int a = (int) stb_lerp(fa, 0, 255); + if (a < 0) a = 0; else if (a > 255) a = 255; + glBindTexture(GL_TEXTURE_2D_ARRAY_EXT, voxel_tex[1]); + for (i=0; i < 64; ++i) { + tex2_data[i][3] = a; + glTexSubImage3DEXT(GL_TEXTURE_2D_ARRAY_EXT, 0, 0,0,i, 1,1,1, GL_RGBA, GL_UNSIGNED_BYTE, tex2_data[i]); + } +} + void render_init(void) { int i; @@ -424,64 +460,21 @@ void render_init(void) glBindTexture(GL_TEXTURE_2D_ARRAY_EXT, voxel_tex[1]); glTexImage3DEXT(GL_TEXTURE_2D_ARRAY_EXT, 0, GL_RGBA, - TEX_SIZE,TEX_SIZE,128, + 1,1,64, 0,GL_RGBA,GL_UNSIGNED_BYTE,NULL); + init_tex2_gradient(); + set_tex2_alpha(0.0); + #if 0 for (i=0; i < 128; ++i) { - build_overlay_texture(i); + 
//build_overlay_texture(i); glTexSubImage3DEXT(GL_TEXTURE_2D_ARRAY_EXT, 0, 0,0,i, TEX_SIZE,TEX_SIZE,1, GL_RGBA, GL_UNSIGNED_BYTE, texture[0]); } + #endif glTexParameteri(GL_TEXTURE_2D_ARRAY_EXT, GL_TEXTURE_MIN_FILTER, GL_LINEAR_MIPMAP_LINEAR); glTexParameteri(GL_TEXTURE_2D_ARRAY_EXT, GL_TEXTURE_MAG_FILTER, GL_LINEAR); glGenerateMipmapEXT(GL_TEXTURE_2D_ARRAY_EXT); } - -// Timing stats while optimizing the single-threaded builder - -// 32..-32, 32..-32, !FILL_TERRAIN, !FANCY_LEAVES on 'mcrealm' data set - -// 6.27s - reblocked to do 16 z at a time instead of 256 (still using 66x66x258), 4 meshes in parallel -// 5.96s - reblocked to use FAST_CHUNK (no intermediate data structure) -// 5.45s - unknown change, or previous measurement was wrong - -// 6.12s - use preconverted data, not in-place -// 5.91s - use preconverted, in-place -// 5.34s - preconvert, in-place, avoid dependency chain (suggested by ryg) -// 5.34s - preconvert, in-place, avoid dependency chain, use bit-table instead of byte-table -// 5.50s - preconvert, in-place, branchless - -// 6.42s - non-preconvert, avoid dependency chain (not an error) -// 5.40s - non-preconvert, w/dependency chain (same as earlier) - -// 5.50s - non-FAST_CHUNK, reblocked outer loop for better cache reuse -// 4.73s - FAST_CHUNK non-preconvert, reblocked outer loop -// 4.25s - preconvert, in-place, reblocked outer loop -// 4.18s - preconvert, in-place, unrolled again -// 4.10s - 34x34 1 mesh instead of 66x66 and 4 meshes (will make it easier to do multiple threads) - -// 4.83s - building bitmasks but not using them (2 bits per block, one if empty, one if solid) - -// 5.16s - using empty bitmasks to early out -// 5.01s - using solid & empty bitmasks to early out - "foo" -// 4.64s - empty bitmask only, test 8 at a time, then test geom -// 4.72s - empty bitmask only, 8 at a time, then test bits -// 4.46s - split bitmask building into three loops (each byte is separate) -// 4.42s - further optimize computing bitmask - -// 4.58s - using 
solid & empty bitmasks to early out, same as "foo" but faster bitmask building -// 4.12s - using solid & empty bitmasks to efficiently test neighbors -// 4.04s - using 16-bit fetches (not endian-independent) -// - note this is first place that beats previous best '4.10s - 34x34 1 mesh' - -// 4.30s - current time with bitmasks disabled again (note was 4.10s earlier) -// 3.95s - bitmasks enabled again, no other changes -// 4.00s - current time with bitmasks disabled again, no other changes -- wide variation that is time dependent? -// (note that most of the numbers listed here are median of 3 values already) -// 3.98s - bitmasks enabled - -// Bitmasks removed from the code as not worth the complexity increase - - void world_init(void) { int a,b,x,y; @@ -751,6 +744,8 @@ void update_meshes_from_render_thread(void) } } +extern float tex2_alpha; +extern int global_hack; int num_threads_active; float chunk_server_activity; @@ -790,6 +785,8 @@ void render_caves(float campos[3]) stbglUniform3fv(stbgl_find_uniform(main_prog, "light_source"), 2, lighting[0]); } + if (global_hack) + set_tex2_alpha(tex2_alpha); num_meshes_uploaded = 0; update_meshes_from_render_thread(); @@ -952,16 +949,3 @@ void render_caves(float campos[3]) num_threads_active += (mesh_data[i].state == WSTATE_running); } } - -// Raw data for Q&A: -// -// 26% parsing & loading minecraft files (4/5ths of which is zlib decode) -// 39% building mesh from stb input format -// 18% converting from minecraft blocks to stb blocks -// 9% reordering from minecraft axis order to stb axis order -// 7% uploading vertex buffer to OpenGL - - - - -