02_RotoMonkey - Wokwi ESP32, STM32, Arduino Simulator

//============================================================================
// 02_RotoMonkey - Rotating monkey head example
//
// Copyright (c) 2022, Jarkko Lempiainen
// All rights reserved.
//----------------------------------------------------------------------------
// This example shows the basics of rendering 3D objects with EmberGL. There
// can be quite a bit of concepts to digest, particularly if you are not already
// familiar with 3D rendering and other graphics APIs. However, just getting the
// example running and starting to play around with different parameters and
// modify things is a good way to get a good grasp on how it all work.
//
// From this example you will learn how to...
// - Configure the rasterizer for the graphics device
// - Convert OBJ/FBX/etc. 3D objects to P3G and import to your program
// - Initialize the 3D mesh and select proper run-time vertex format
// - Write simple vertex and pixel shaders
// - Basic 3D camera and object transformation setup
// - Setup PSO for rendering and dispatch to graphics device
//============================================================================

#include "egl_device_lib.h"
#include "egl_mesh.h"
#include "egl_vertex.h"
EGL_USING_NAMESPACE
#ifndef SAMPLE_DEVICE_DEFINED
typedef graphics_device_ili9341 gfx_device_t;
#endif
//----------------------------------------------------------------------------


//============================================================================
// 3D geometry
//============================================================================
// Here we define a 3D geometry P3G asset in an array s_p3g_monkey_data, where
// the mesh data resides in "p3g_monkey.h" file, that's generated with
// "Meshlete" tool. The tool converts OBJ/FBX/DAE/etc. mesh files to P3G format
// using required vertex format and other parameters. Note that P3G files
// contain only 3D geometry data and no materials/textures, which needs to be
// handled separately.
// This specific file was generated with the following command line:
//   meshlete -i monkey.dae -o p3g_monkey.h -hexd -vf p48n32 -mb -mc
// The used "monkey.dae" (Collada) file can be found in the 02_RotoMonkey
// example dir. There are couple of other models as well "p3g_cube.h" and
// "p3g_torus.h" you can try out, and the default monkey 3D model could be too
// heavy for some MCUs.
//
// The tool splits 3D geometry to geometry "clusters", where each cluster
// contains some small number of triangles and vertices. By default the tool
// uses <=64 vertices and <=128 triangles per cluster but these defaults can
// be overridden with "-mv" and "-mt" switches respectively. The clustered
// geometry processing enables various optimizations in the rasterizer which
// is why this process is done.
// 
// The "-vf" switch specifies the vertex format used for the geometry. For all
// P3G files in this example we use "p48n32" format, which means 48-bit position
// and 32-bit normal data, which adds up to 10 bytes per vertex. The used vertex
// format for .h file can be found in the comment of the file. Depending on the
// use-case you may want to use different format, for example to add texture
// coordinates, colors or tangent space. The available default vertex formats
// can be found in "vfmt.xml" file of Meshlete tool, but you can add your own
// formats as well (either to this file, or to separate file that you can tell
// the tool to use with "-vc" switch). The used vertex format in P3G must match
// the format used by the vertex shader that reads the data. For the default
// vertex formats you can find the accompanying C++ vertex types in egl_vertex.h
// file, so for this choise we define the used vertex type as vertex_p48n32 type.
//
// The "-mb" switch tells the tool to calculate bounding volumes for each
// cluster. This switch should be always used, because the rasterizer uses the
// volumes for tile binning and occlusion testing, which are improtant for
// performance. You can see in "Options" comment in the generated .h file if
// this switch was used.
//
// The "-mc" switch tell the tool to generate "visibility cones" for the
// clusters, which are used for run-time view (camera) dependent culling of the
// geometry to further optimize performance. Visibility cones define conical
// regions in space for each cluster where the cluster is visible and if the
// view isn't in this region, the cluster is culled early in the rendering
// pipeline improving the performance. For simple geometry it's not worth to use
// -mc (it's not used for p3g_cube or p3g_torus) because all clusters are
// visible all around the object, but this can save quite a bit of processing
// for more complex geometry. The memory and performance overhead of the
// visibility cones is quite small though, so it's better to use the switch in
// doubt.
static const EGL_ALIGN(4) uint32_t PROGMEM s_p3g_monkey_data[]=
{
  #include "p3g_monkey.h"
};
static p3g_mesh s_p3g_monkey;
typedef vertex_p48n32 vtx_monkey_t;
//----------------------------------------------------------------------------


//============================================================================
// rasterizer_memory_cfg
//============================================================================
// This config struct defines the global rasterizer settings for buffers that
// are needed by the rasterizer. The rasterizer doesn't allocate any memory
// dynamically but all the buffers are allocated from static/global memory with
// this struct.
//
// The struct should be derived from rasterizer_memory_cfg_base, which defines
// the default rasterizer settings, but they can be individually overridden in
// the struct. While the default settings would work for this example, we orride
// them all to explain their meaning in more details. It can be quite overwhelming
// to understand all the settings at this point because it requires deeper
// understanding of the rasterizer, so it's probably better to just skim them
// through and return back to them later.
//
// Allocating too small buffers results in run-time error with an error message
// for debug and release builds, while for retail it just crashes because buffer
// overflows, so it's good to use debug/release while developing. To optimize
// the buffers and RAM usage it's good to first allocate larger buffers and then
// use egl_device::::log_rasterizer_stats() to print out how much of the buffers
// were actually used, and then trim down the buffers using the provided numbers.
struct rasterizer_memory_cfg: rasterizer_memory_cfg_base
{
  // The value below defines the used depth buffer format. For this example we
  // use 16bpp depth buffer, which is enough depth precision in this case. You
  // can also use 8bpp format or 32bpp format depending on the depth precision
  // requirements.
  enum {depth_format=depthfmt_uint16};

  // Hi-Z is a low-resolution representation of the depth buffer to enable fast
  // region depth testing. This can be used for example for conservative cluster
  // occlusion culling to skip processing clusters if they are completely
  // occluded by geometry rasterized earlier in the frame. Because of this it's
  // good to rasterize opaque geometry in the order of increasing distance from
  // camera to increase the occlusion opportunities. There is a cost associated
  // in both updating and testing against Hi-Z (and some minor memory overhead),
  // but usually it's good to have Hi-Z enabled.
  enum {depth_hiz=true};

  // Here we specify the intermediate tile pixel format to be the same as the
  // native display pixel format. Everything is first rendered to the tile
  // buffer of this format that resides in RAM and after the tile has been
  // completed, the pixels are converted and transferred to the native display
  // frame buffer. Why would you use anything but the native format you ask?
  // You could do frame buffer blending (e.g. alpha or additive) which might
  // require higher precision than the native format, or want to perform some
  // operations with the rendered result (e.g. deferred lighting) using tile
  // shaders once the tile rendering is complete. Also for plain grayscale
  // rendering 8bpp pixel format could be used to really squeeze the RAM usage
  // and expand the final result to the native format upon pixel transfer.
  // You can also use up to 4 multiple render targets (MRTs) by defining the
  // format rtile*_fmt for each. By default the other render tile formats are
  // set to none and thus don't allocate memory.
  enum {rt0_fmt=gfx_device_t::fb_format};

  // The value below defines how many PSO dispatches can be done between in a
  // frame (between commit() calls). In this example we render only a single
  // object with single material per frame so this is all we need. For better
  // explanation about PSO's check out README.md. This value must be at least
  // "Max dispatches" reported by the rasterizer stats.
  enum {max_dispatches=1};

  // This defines how much memory is allocated for all the dispatched PSO's
  // between commits. The size depends how big PSO's are used to render the
  // frame. This value must be at least "Max PSO store" reported by the
  // rasterizer stats.
  enum {pso_store_size=128};

  // This value defines the maximum memory required by a PSO state. This buffer
  // is allocated temporarily from the stack and setting this value to the
  // smallest required value reduces the stack usage. This value must be at
  // least "Max PSO state" reported by the rasterizer stats.
  enum {max_pso_state_size=128};

  // The frame is rendered in tiles and here We can define the tile size.
  // Larger tiles require larger memory buffer for intermediate results, while
  // for smaller tiles a single cluster needs to be binned to more tiles and
  // thus use more memory for the binning. From the performance point of view
  // larger tiles require processing more clusters per tile, while with smaller
  // tiles there are more tiles to process in total. It's really a balancing
  // act which tile size is the best fit, but this is a good starting point.
  enum {tile_width=64, tile_height=64};

  // This defines the order in which the tiles are rendered. Below we specify
  // Morton order which walks through the tiles in more local manner. The main
  // benefit is that the post-transform vertex cache is likely better utilized
  // because the same cluster is likely processed for consecutively rendered
  // tiles, thus reducing the vertex transform cost.
  enum {tile_order=tileorder_morton};

  // This is the maximum number of total clusters binned to tiles between
  // commits. Note that if a cluster is culled before binning (e.g. with
  // v-cones, or because it's out of screen) it doesn't count towards the
  // total cluster count. This value must be at least "Max clusters" reported
  // by the rasterizer stats.
  enum {max_clusters=256};

  // For binning clusters to tiles we build a linked list of cluster index
  // strips for each tile that holds indices to the clusters binned to the
  // tile. Each strip can contain some number of cluster indices. Depending
  // how the clusters are binned we need different amount of strips and for
  // the same object this varies depending how the object is rendered.
  // It's not clear yet what's the best way to adjust this value but it's
  // better to be quite conservative of the reported value. This value must
  // be at least "Max cluster strips" reported by the rasterizer stats.
  enum {max_cluster_strips=256};

  // This is the size of a temporal post-transform vertex buffer (PTV buffer)
  // to hold the vertex transform result of a single cluster for the PSC
  // tform_cluster() function. The required size depends on how many vertices
  // are in the clusters and what's the size of the PTVs (ptv_fmt) of the used
  // PSCs. Because for P3G generation we use <=64 vertices in a cluster and the
  // sizeof(ps_simple_ndotl::psin) is 20, we use value 64*20. This value must
  // be at least "Max PTV buffer size" reported by the rasterizer stats.
  enum {max_vout_size=64*20};

  // This is the size of the PTV cache (or v-cache for short). The cache is
  // used to reduce cluster vertex transform cost across tiles, i.e. if the
  // same cluster is binned to multiple tiles, the transform result could be
  // stored in the cache and reused when the other tiles are rendered. Here
  // we just disable the v-cache by setting the size to 0. There is no minimum
  // value but a larger v-cache reduces vertex transforms.
  enum {vcache_size=0};

  // The below value defines how many tiles worth of memory we want to allocate
  // for DMA transfers. The DMA buffers are allocated in native pixel format.
  // If tiles are rasterized faster than DMA has time to transfer them to the
  // display they are queued to the buffers for transfer. Note that some tiles
  // may be faster to rasterize than others so using more buffers can help to
  // utilize wait times of faster tiles to rasterize slower tiles. Note that if
  // the device doesn't support DMA no buffers are allocated regardless of this
  // value.
  enum {max_dma_transfers=0};
};
//----------------------------------------------------------------------------


//============================================================================
// setup
//============================================================================
static gfx_device_t s_gfx_device;
static rasterizer_memory<gfx_device_t, rasterizer_memory_cfg> s_gfx_device_mem;
static cameraf s_camera;
static vec3f s_light_dir;
//----

void setup()
{
  // Init serial and graphics device
  init_serial();
#ifndef SAMPLE_DEVICE_DEFINED
  s_gfx_device.init(10, 9, 13, 11, 12, 8);
#endif
  s_gfx_device_mem.init(s_gfx_device);

  // After the initialization we need to clear the content of the depth buffer.
  // It depends on the case to which value you should clear the depth. Here we
  // clear the depth to max depth value because we use regular depth buffer
  // where smaller values mean close to the camera. However, for reverse depth
  // buffer rendering you should clear the depth to min instead.
  // Once the depth is cleared it doesn't need to be cleared again for every
  // frame, because the rasterizer keeps automatically clearing it to the set
  // value after each tile.
  s_gfx_device.clear_depth(cleardepth_max);

  // Initialize the P3G mesh with the geometry data defined earlier. It's
  // good to initialize the mesh after init_serial() because in case of errors
  // we can see the error messages in the serial monitor. After initialization
  // the geometry is ready for rendering.
  s_p3g_monkey.init(s_p3g_monkey_data);

  // Setup camera projection with camera FoV, aspect ratio and near & far planes.
  // This is usually one-time setup after which only the camera position and
  // orientation are animated.
  float cam_fov=60.0f; // the FOV in degrees
  float cam_aspect_ratio=float(gfx_device_t::fb_width)/float(gfx_device_t::fb_height);
  float cam_near_plane=0.1f;
  float cam_far_plane=100.0f;
  mat44f view2proj=perspective_matrix<float>(cam_fov*mathf::deg_to_rad, cam_aspect_ratio, cam_near_plane, cam_far_plane);
  s_camera.set_view_to_proj(view2proj, cam_near_plane, cam_far_plane);

  // Because we don't move the camera in this example, we can setup the camera
  // transform in the setup() as well. The below code places the camera to
  // [0, 0, 0] and sets the rotation to identity matrix (X=right, Y=up, Z=forward)
  vec3f cam_pos(0.0f, 0.0f, 0.0f);
  mat33f cam_rot(1.0f, 0.0f, 0.0f,
                 0.0f, 1.0f, 0.0f,
                 0.0f, 0.0f, 1.0f);
  s_camera.set_view_to_world(tform3f(cam_rot, cam_pos));

  // Lets setup the light direction (unit length in world space)
  s_light_dir=unit(vec3f(1.0f, 1.0f, -1.0f));
}
//----------------------------------------------------------------------------


//============================================================================
// vs_simple_ndotl
//============================================================================
// This defines a vertex shader used for rendering the geometry. This vertex
// shader implements the interface expected by psc_p3g_mesh and in theory you
// could have any kind of shader interface for custom PSCs. The rasterizer
// dictates only the PSC interface as defined by rasterizer_psc_base but how
// those requirements are "forwarded" is up to the PSC.
// That said, it's good to standardize the shader interface across PSCs so that
// shaders can be more freely shared across different PSCs.
struct vs_simple_ndotl
{
  //==========================================================================
  // vsout
  //==========================================================================
  // This struct defines the vertex attributes outputted by the vertex shader,
  // except for vertex position. 
  struct vsout
  {
    float ndotl;
  };
  //--------------------------------------------------------------------------

  template<class PSIn, class PSOState, class VIn>
  EGL_INLINE void exec(PSIn &psin_, const PSOState &pso_state_, const typename VIn::transform_state &tstate_, const VIn &vin_, uint8_t) const
  {
    // Get position attribute from the input vertex. All the vertex attributes
    // should be fetched with get_*() functions so that the data can be
    // transparently transformed. For example the position data in input vertex
    // could be stored as vec3f, or it could be stored as quantized vec3s in
    // object bounds and get_pos() would dequantize the position. From the
    // shader code point of view there is no difference thus the shader is
    // compatible with any vertex input which provides the data, regardless of
    // the format.
    vec3f pos=get_pos(vin_, tstate_);

    // Transform the object space position from object space to projection/clip
    // space. Note that the matrix convention is row-major and we multiply from
    // left-to-right order. The rasterizer expects the pos value to be in
    // projection/clip space like with other graphics APIs.
    psin_.pos=vec4f(pos, 1.0f)*pso_state_.obj_to_proj;

    // Let's also fetch normal from the input vertex. Just like for position,
    // we use a get-function to fetch the data to allow transform of the data
    // if necessary.
    vec3f normal=get_normal(vin_, tstate_);

    // For vertex output we fill the vsout struct defined above. We could write
    // the attributes directly psin_ like the position, but there's a reason
    // for this indirection: Vertex shader may output more data than expected
    // by the pixel shader, which would result in compilation error. By using
    // this indirection we can avoid the error and the unnecessary data and
    // related calculations will be optimized out by the C++ compiler (due to
    // having no side-effects). This way we expand the combinatory power of the
    // shaders without sacrificing performance.
    // The indirection doesn't make a difference in this example because the
    // shaders are limited to the scope of this example, but for building a
    // shader library it's very useful.
    // Here we output single attribute "ndotl", which is the dot product
    // between the object space vertex normal and light direction in object
    // space.
    vsout vo;
    vo.ndotl=dot(normal, light_obj_dir);
    psin_.set_attribs(vo);
  }
  //--------------------------------------------------------------------------

  vec3f light_obj_dir;
};
//----------------------------------------------------------------------------


//============================================================================
// ps_simple_ndotl
//============================================================================
struct ps_simple_ndotl
{
  //==========================================================================
  // psin
  //==========================================================================
  // This struct defines the actual post-transform vertex type. The only
  // mandatory attribute is "vec4f pos" that's expected by the rasterizer.
  // Vertex shader must write clip/projection space vertex coordinates to pos,
  // which the  rasterizer transforms to normalized device coordinates (NDC).
  // The rest of the attributes can be freely defined.
  // Here we expect the associated vertex shader to provide argument "ndotl".
  // Violating the requirement results in compilation error. If the vertex
  // shader provides other attributes, they are omitted by the set_attribs()
  // function and eliminated by the compiler.
  struct psin
  {
    vec4f pos;
    float ndotl;
    //------------------------------------------------------------------------

    template<class VSOut>
    EGL_INLINE void set_attribs(const VSOut &vo_)
    {
      ndotl=vo_.ndotl;
    }
  };
  //--------------------------------------------------------------------------

  template<class PSC>
  EGL_INLINE void exec(rasterizer_pixel_out<PSC> &psout_, const typename PSC::pso_state&, const psin &v0_, const psin &v1_, const psin &v2_, const vec3f &bc_, uint8_t) const
  {
    // The following line performs vertex attribute linear interpolation on the
    // rasterized triangle using the passed barycentric coordinates. This same
    // interpolation code can be used for any vertex attribute (colors, texture
    // coordinates, tangent space, etc.)
    float ndotl=v0_.ndotl*bc_.x+v1_.ndotl*bc_.y+v2_.ndotl*bc_.z;

    // Export the result to the render target 0. We use abs() here to have
    // double-sided lighting and multiply the result with object color.
    psout_.export_rt0(color*abs(ndotl));
  }
  //--------------------------------------------------------------------------

  color_rgbaf color;
};
//----------------------------------------------------------------------------


//============================================================================
// rstate_opaque
//============================================================================
// This struct defines the render state for opaque rendering we can use in
// this example. The default render state is fine for this, except we need
// to override the rt0_fmt and depth_format to match the format we chose in
// the rasterizer_memory_cfg.
struct rstate_opaque: rasterizer_psc_base
{
  enum {rt0_fmt=rasterizer_memory_cfg::rt0_fmt};
  enum {depth_format=rasterizer_memory_cfg::depth_format};
};
//----------------------------------------------------------------------------


//============================================================================
// loop
//============================================================================
void loop()
{
  // Here we setup the object transformation. The position of the object is
  // [0, 0, 3], i.e. in the front of the camera that we set up in the setup()
  // function. For the rotation we use two angles s_rot_x and s_rot_y that we
  // animate, and setup 3x3 rotation matrix with the angles. Note that the
  // angles are in radians not degrees.
  static float s_rot_x=-1.2f, s_rot_y=-0.5f;
  vec3f obj_pos(0.0f, 0.0f, 3.0f);
  mat33f obj_rot;
  set_rotation_xyz(obj_rot, s_rot_x, s_rot_y, 0.0f);
  s_rot_x+=0.0123f;
  s_rot_y+=0.0321f;

  // Here we create a PSO by instantiating psc_p3g_mesh class for vtx_monkey_t
  // input vertex format, and the above vertex and pixel shaders. It also uses
  // the previously defined rstate_opaque render state struct to setup the
  // rendering pipeline for opaque rendering for this PSO.
  psc_p3g_mesh<rstate_opaque, vtx_monkey_t, vs_simple_ndotl, ps_simple_ndotl> pso;

  // Lets now setup the object transform to the PSO. psc_p3g_mesh provides
  // set_transform() function to setup all the necessary matrices needed to
  // render the mesh. Besides transforming the vertices in the vertex shader
  // with the matrix, psc_p3g_mesh must also handle cluster culling and binning
  // to tiles. Otherwise we could just have the obj->proj matrix in the vertex
  // shader.
  pso.set_transform(s_camera, tform3f(obj_rot, obj_pos));

  // To setup the vertex shader we need to setup the light direction in object
  // space. To properly rotate a vector with a 3x3 matrix we need to multiply
  // the vector by inverse-transpose of the rotation matrix. We have obj_rot
  // which transform from object->world space, but we need world->object space
  // instead to transform the light direction in world space to the object
  // space, which means we need to invert obj_rot matrix. However, this means
  // we need to invert the matrix twice which cancels out, so we only need to
  // transpose the matrix instead. However, multiplying the vector from the
  // left instead of right does the trick without the transpose.
  pso.vshader().light_obj_dir=unit(obj_rot*s_light_dir);

  // Let's set a light cyan color of the object
  pso.pshader().color.set(0.5f, 1.0f, 1.0f);

  // Here we setup the P3G geometry to render. The second parameter defines
  // the geometry "segment" index, which can be used to render objects with
  // multiple materials. The P3G files in this example have only single
  // segment so we just use index 0, and dispatch only one PSO for the mesh.
  // For multi-segment objects we could simply loop over the segments
  // for(uint16_t seg_idx=0; seg_idx<s_p3g_monkey.num_segments(); ++seg_idx)
  pso.set_geometry(s_p3g_monkey, 0);

  // And finally we can dispatch the PSO for rasterization. The dispatch
  // doesn't yet actually rasterize the geometry, but only bins the clusters
  // to the tiles for rasterization. The rasterization is deferred until call
  // to commit(), which does rasterize all the tiles and submit the rasterized
  // tiles to the display frame buffer.
  // So for the frame we first call dispatch_pso() for all the objects we want
  // to render and then at the end of the frame call commit()
  s_gfx_device.dispatch_pso(pso);

  // And there you go, we commit the dispatched PSOs for rasterization. It's
  // also possible to pass "tile shader" to the commit, which can be used to
  // to perform custom tile=>frame buffer pixel transforms.
  s_gfx_device.commit();

  // Here we print out the rasterizer statistics to the log every 100 frames,
  // which can be used to trim the rasterizer memory configuration values.
  // Remember to enable either EGL_DEBUG or EGL_RELEASE build in egl_core.h to
  // have the statistics gathered. The statistics are disabled in EGL_RETAIL
  // to optimize performance.
  static unsigned s_log_counter=0;
  if(s_log_counter==0)
  {
    s_gfx_device.log_rasterizer_stats();
    s_log_counter=100;
  }
}
//----------------------------------------------------------------------------