Re: Re: [PATCH 5/6] media: chips-media: wave5: refine SRAM usage
From: Ivan Bornyakov
Date: Thu Mar 21 2024 - 12:54:52 EST
On Thu, Mar 21, 2024 at 11:14:05AM -0500, Brandon Brnich wrote:
> Hi Ivan,
>
> On 13:52-20240321, Ivan Bornyakov wrote:
> > Hi!
> >
> > On Thu, Mar 21, 2024 at 09:29:04AM +0000, Nas Chung wrote:
> > > Hi, Ivan and Brandon.
> > >
> > > >-----Original Message-----
> > > >On 14:24-20240319, Ivan Bornyakov wrote:
> > > >> Hello, Nas
> > > >>
> > > >> On Tue, Mar 19, 2024 at 10:56:22AM +0000, Nas Chung wrote:
> > > >> > Hi, Ivan.
> > > >> >
> > > >> > >
> > > >> > >Allocate SRAM memory on module probe, free on remove. There is no
> > > >need
> > > >> > >to allocate on device open, free on close, the memory is the same
> > > >every
> > > >> > >time.
> > > >> >
> > > >> > If there is no decoder/encoder instance, the driver doesn't need to
> > > >allocate SRAM memory.
> > > >> > The main reason of allocating the memory in open() is to allow other
> > > >modules to
> > > >> > use more SRAM memory, if wave5 is not working.
> > > >
> > > >I have to agree with this statement. Moving allocation to probe results
> > > >in wasting SRAM when VPU is not in use. VPU should only be allocating
> > > >SRAM
> > > >when a stream instance is running and free that back once all instances
> > > >close.
> > > >
> > > >> > >
> > > >> > >Also use gen_pool_size() to determine SRAM memory size to be
> > > >allocated
> > > >> > >instead of separate "sram-size" DT property to reduce duplication.
> > > >> > >
> > > >> > >Signed-off-by: Ivan Bornyakov <brnkv.i1@xxxxxxxxx>
> > > >> > >---
> > > >> > > .../platform/chips-media/wave5/wave5-helper.c | 3 ---
> > > >> > > .../platform/chips-media/wave5/wave5-vdi.c | 21 ++++++++++-------
> > > >--
> > > >> > > .../chips-media/wave5/wave5-vpu-dec.c | 2 --
> > > >> > > .../chips-media/wave5/wave5-vpu-enc.c | 2 --
> > > >> > > .../platform/chips-media/wave5/wave5-vpu.c | 12 +++++------
> > > >> > > .../platform/chips-media/wave5/wave5-vpuapi.h | 1 -
> > > >> > > 6 files changed, 16 insertions(+), 25 deletions(-)
> > > >> > >
> > > >> > >diff --git a/drivers/media/platform/chips-media/wave5/wave5-helper.c
> > > >> > >b/drivers/media/platform/chips-media/wave5/wave5-helper.c
> > > >> > >index 8433ecab230c..ec710b838dfe 100644
> > > >> > >--- a/drivers/media/platform/chips-media/wave5/wave5-helper.c
> > > >> > >+++ b/drivers/media/platform/chips-media/wave5/wave5-helper.c
> > > >> > >@@ -29,9 +29,6 @@ void wave5_cleanup_instance(struct vpu_instance
> > > >*inst)
> > > >> > > {
> > > >> > > int i;
> > > >> > >
> > > >> > >- if (list_is_singular(&inst->list))
> > > >> > >- wave5_vdi_free_sram(inst->dev);
> > > >> > >-
> > > >> > > for (i = 0; i < inst->fbc_buf_count; i++)
> > > >> > > wave5_vpu_dec_reset_framebuffer(inst, i);
> > > >> > >
> > > >> > >diff --git a/drivers/media/platform/chips-media/wave5/wave5-vdi.c
> > > >> > >b/drivers/media/platform/chips-media/wave5/wave5-vdi.c
> > > >> > >index 3809f70bc0b4..ee671f5a2f37 100644
> > > >> > >--- a/drivers/media/platform/chips-media/wave5/wave5-vdi.c
> > > >> > >+++ b/drivers/media/platform/chips-media/wave5/wave5-vdi.c
> > > >> > >@@ -174,16 +174,19 @@ int wave5_vdi_allocate_array(struct vpu_device
> > > >> > >*vpu_dev, struct vpu_buf *array,
> > > >> > > void wave5_vdi_allocate_sram(struct vpu_device *vpu_dev)
> > > >> > > {
> > > >> > > struct vpu_buf *vb = &vpu_dev->sram_buf;
> > > >> > >+ dma_addr_t daddr;
> > > >> > >+ void *vaddr;
> > > >> > >+ size_t size;
> > > >> > >
> > > >> > >- if (!vpu_dev->sram_pool || !vpu_dev->sram_size)
> > > >> > >+ if (!vpu_dev->sram_pool || vb->vaddr)
> > > >> > > return;
> > > >> > >
> > > >> > >- if (!vb->vaddr) {
> > > >> > >- vb->size = vpu_dev->sram_size;
> > > >> > >- vb->vaddr = gen_pool_dma_alloc(vpu_dev->sram_pool, vb->size,
> > > >> > >- &vb->daddr);
> > > >> > >- if (!vb->vaddr)
> > > >> > >- vb->size = 0;
> > > >> > >+ size = gen_pool_size(vpu_dev->sram_pool);
> > > >> > >+ vaddr = gen_pool_dma_alloc(vpu_dev->sram_pool, size, &daddr);
> > > >> > >+ if (vaddr) {
> > > >> > >+ vb->vaddr = vaddr;
> > > >> > >+ vb->daddr = daddr;
> > > >> > >+ vb->size = size;
> > > >> > > }
> > > >> > >
> > > >> > > dev_dbg(vpu_dev->dev, "%s: sram daddr: %pad, size: %zu, vaddr:
> > > >> > >0x%p\n",
> > > >> > >@@ -197,9 +200,7 @@ void wave5_vdi_free_sram(struct vpu_device
> > > >*vpu_dev)
> > > >> > > if (!vb->size || !vb->vaddr)
> > > >> > > return;
> > > >> > >
> > > >> > >- if (vb->vaddr)
> > > >> > >- gen_pool_free(vpu_dev->sram_pool, (unsigned long)vb->vaddr,
> > > >> > >- vb->size);
> > > >> > >+ gen_pool_free(vpu_dev->sram_pool, (unsigned long)vb->vaddr, vb-
> > > >> > >>size);
> > > >> > >
> > > >> > > memset(vb, 0, sizeof(*vb));
> > > >> > > }
> > > >> > >diff --git a/drivers/media/platform/chips-media/wave5/wave5-vpu-
> > > >dec.c
> > > >> > >b/drivers/media/platform/chips-media/wave5/wave5-vpu-dec.c
> > > >> > >index aa0401f35d32..84dbe56216ad 100644
> > > >> > >--- a/drivers/media/platform/chips-media/wave5/wave5-vpu-dec.c
> > > >> > >+++ b/drivers/media/platform/chips-media/wave5/wave5-vpu-dec.c
> > > >> > >@@ -1854,8 +1854,6 @@ static int wave5_vpu_open_dec(struct file
> > > >*filp)
> > > >> > > goto cleanup_inst;
> > > >> > > }
> > > >> > >
> > > >> > >- wave5_vdi_allocate_sram(inst->dev);
> > > >> > >-
> > > >> > > return 0;
> > > >> > >
> > > >> > > cleanup_inst:
> > > >> > >diff --git a/drivers/media/platform/chips-media/wave5/wave5-vpu-
> > > >enc.c
> > > >> > >b/drivers/media/platform/chips-media/wave5/wave5-vpu-enc.c
> > > >> > >index 8bbf9d10b467..86ddcb82443b 100644
> > > >> > >--- a/drivers/media/platform/chips-media/wave5/wave5-vpu-enc.c
> > > >> > >+++ b/drivers/media/platform/chips-media/wave5/wave5-vpu-enc.c
> > > >> > >@@ -1727,8 +1727,6 @@ static int wave5_vpu_open_enc(struct file
> > > >*filp)
> > > >> > > goto cleanup_inst;
> > > >> > > }
> > > >> > >
> > > >> > >- wave5_vdi_allocate_sram(inst->dev);
> > > >> > >-
> > > >> > > return 0;
> > > >> > >
> > > >> > > cleanup_inst:
> > > >> > >diff --git a/drivers/media/platform/chips-media/wave5/wave5-vpu.c
> > > >> > >b/drivers/media/platform/chips-media/wave5/wave5-vpu.c
> > > >> > >index f3ecadefd37a..2a0a70dd7062 100644
> > > >> > >--- a/drivers/media/platform/chips-media/wave5/wave5-vpu.c
> > > >> > >+++ b/drivers/media/platform/chips-media/wave5/wave5-vpu.c
> > > >> > >@@ -178,16 +178,11 @@ static int wave5_vpu_probe(struct
> > > >platform_device
> > > >> > >*pdev)
> > > >> > > return ret;
> > > >> > > }
> > > >> > >
> > > >> > >- ret = of_property_read_u32(pdev->dev.of_node, "sram-size",
> > > >> > >- &dev->sram_size);
> > > >> > >- if (ret) {
> > > >> > >- dev_warn(&pdev->dev, "sram-size not found\n");
> > > >> > >- dev->sram_size = 0;
> > > >> > >- }
> > > >> > >-
> > > >> >
> > > >> > Required SRAM size is different from each wave5 product.
> > > >> > And, SoC vendor also can configure the different SRAM size
> > > >> > depend on target SoC specification even they use the same wave5
> > > >product.
> > > >> >
> > > >>
> > > >> One can limit iomem address range in SRAM node. Here is the example of
> > > >> how I setup Wave515 with SRAM:
> > > >>
> > > >> sram@2000000 {
> > > >> compatible = "mmio-sram";
> > > >> reg = <0x0 0x2000000 0x0 0x80000>;
> > > >> #address-cells = <1>;
> > > >> #size-cells = <1>;
> > > >> ranges = <0x0 0x0 0x2000000 0x80000>;
> > > >>
> > > >> wave515_vpu_sram: wave515-vpu-sram@0 {
> > > >> reg = <0x0 0x80000>;
> > > >> pool;
> > > >> };
> > > >> };
> > > >>
> > > >> wave515@410000 {
> > > >> compatible = "cnm,wave515";
> > > >> reg = <0x0 0x410000 0x0 0x10000>;
> > > >> clocks = <&clk_ref1>;
> > > >> clock-names = "videc";
> > > >> interrupt-parent = <&wave515_intc>;
> > > >> interrupts = <16 IRQ_TYPE_LEVEL_HIGH>;
> > > >> resets = <&wave515_reset 0>,
> > > >> <&wave515_reset 4>,
> > > >> <&wave515_reset 8>,
> > > >> <&wave515_reset 12>;
> > > >> sram = <&wave515_vpu_sram>;
> > > >> };
> > > >>
> > > >> gen_pool_size() returns size of wave515_vpu_sram, no need for extra
> > > >> "sram-size" property.
> > >
> > > Thanks for sharing the example.
> > > I agree that the "sram-size" property is not needed.
> > >
> > > >
> > > >"sram-size" property does need to be removed, as this was the consensus
> > > >gathered from my patch[0]. However, I think your method is still taking
> > >
> > > I missed the previous consensus for the sram-size property.
> > > Thanks for letting me know.
> > >
> > > >a more static approach. One of the recommendations in my thread[1] was
> > > >making a list of known SRAM sizes given typical resolutions and
> > > >iterating through until a valid allocation is done. I don't think this
> > > >is the correct approach either based on Nas's comment that each Wave5
> > > >has different SRAM size requirement. It would clutter up the file too
> > > >much if each wave5 product had its own SRAM size mapping.
> > > >
> > > >Could another approach be to change Wave5 dts node to have property set
> > > >as "sram = <&sram>;" in your example, then driver calls
> > > >gen_pool_avail() to get the size remaining? From there, a check could be
> > > >put in place to make sure an unnecessary amount is not being allocated.
> > >
> > > Ivan's approach looks good to me.
> > > It is similar to your first patch, which adds the sram-size property
> > > to configure different SRAM sizes for each device.
> > > And, the driver won't know whether an unnecessary amount is allocated
> > > before parsing the bitstream header.
>
> I am aware of this, I should have been more specific. By unnecessary
> amount, I meant something greater than the max use case for device.
> Could we populate some macros that have max SRAM required for 4K stream?
> There's never a need to allocate more SRAM than that for a particular
> instance. If the amount available is less than that, then fine. But it
> should never be greater.
>
> > >
> >
> > To sum up, there are 2 favourable approaches:
> >
> > 1) to have dedicated SRAM partition for Wave5 VPU as suggested in this
> > patchset. In this approach SoC vendor can setup address range of said
> > partition to their needs, but other devices won't be able to use SRAM
> > memory reserved for Wave5 VPU, unless other device's SRAM memory needs
> > don't exceed the size of reserved partition.
> >
> > Therefore it is sensible to substitute alloc/free on open/close with
> > alloc/free on open/close.
>
> Not sure what you mean here. Were you trying to refer to your
> substitution of alloc/free from open/close to probe/remove?
>
> If that is what you mean, and the decision is a specific carveout for
> SRAM, then I don't see a point in having allocation in open and close
> either since Wave5 would be the only IP that could use the pool.
>
> >
> > Advantages: driver code is simpler, no need for platform-specific defines
> > or DT properties. Wave5 is guaranteed to get SRAM memory.
> >
> > Disadvantage: waste of SRAM memory while VPU is not in use
> >
> > 2) allocate all available SRAM memory on open (free on close) from the
> > common SRAM pool, but limit maximum amount with SoC-specific define.
> >
>
> Why does it have to be on SoC specific define?
Well, if I understood correctly, in [1] Nas said that SRAM usage is
SoC-specific even with same Wave5 IP.
[1] https://lore.kernel.org/linux-media/SL2P216MB1246F7FA7E95896AA2409C90FB2C2@xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx/
> Max size required for SRAM in a 4K case is known.
From docs I have for Wave515 it _seems_ to be about 64K, but it's not
too clear.
> A call can be made to get the size of the
> pool and from there the driver can take a portion. Just make sure that
> portion is less than known value for 4K.
>
Yeah, I did exactly that in v2, was about to send, until I got
"Ivan's approach looks good to me" :)
> > Advantage: less memory waste
> >
> > Disadvantages: still need SoC-specific define or DT property, not much
> > different from the current state.
> >
>
> Wave5 does not need SRAM to function properly so it doesn't have to be
> guaranteed.
>
True.
> > Which of these approaches would be preferable?
> >
> > > >
> > > >
> > > >[0]:
> > > >https://lore.kernel.org/lkml/99bf4d6d988d426492fffc8de9015751c323bd8a.cam
> > > >el@xxxxxxxxxxxx/
> > > >[1]: https://lore.kernel.org/lkml/9c5b7b2c-8a66-4173-dfe9-
> > > >5724ec5f733d@xxxxxx/
> > > >
> > > >Thanks,
> > > >Brandon
> > > >>
> > > >> > Thanks.
> > > >> > Nas.
> > > >> >
> > > >> > > dev->sram_pool = of_gen_pool_get(pdev->dev.of_node, "sram", 0);
> > > >> > > if (!dev->sram_pool)
> > > >> > > dev_warn(&pdev->dev, "sram node not found\n");
> > > >> > >+ else
> > > >> > >+ wave5_vdi_allocate_sram(dev);
> > > >> > >
> > > >> > > dev->product_code = wave5_vdi_read_register(dev,
> > > >> > >VPU_PRODUCT_CODE_REGISTER);
> > > >> > > ret = wave5_vdi_init(&pdev->dev);
> > > >> > >@@ -259,6 +254,8 @@ static int wave5_vpu_probe(struct
> > > >platform_device
> > > >> > >*pdev)
> > > >> > > err_clk_dis:
> > > >> > > clk_bulk_disable_unprepare(dev->num_clks, dev->clks);
> > > >> > >
> > > >> > >+ wave5_vdi_free_sram(dev);
> > > >> > >+
> > > >> > > return ret;
> > > >> > > }
> > > >> > >
> > > >> > >@@ -275,6 +272,7 @@ static void wave5_vpu_remove(struct
> > > >platform_device
> > > >> > >*pdev)
> > > >> > > v4l2_device_unregister(&dev->v4l2_dev);
> > > >> > > wave5_vdi_release(&pdev->dev);
> > > >> > > ida_destroy(&dev->inst_ida);
> > > >> > >+ wave5_vdi_free_sram(dev);
> > > >> > > }
> > > >> > >
> > > >> > > static const struct wave5_match_data ti_wave521c_data = {
> > > >> > >diff --git a/drivers/media/platform/chips-media/wave5/wave5-vpuapi.h
> > > >> > >b/drivers/media/platform/chips-media/wave5/wave5-vpuapi.h
> > > >> > >index fa62a85080b5..8d88381ac55e 100644
> > > >> > >--- a/drivers/media/platform/chips-media/wave5/wave5-vpuapi.h
> > > >> > >+++ b/drivers/media/platform/chips-media/wave5/wave5-vpuapi.h
> > > >> > >@@ -749,7 +749,6 @@ struct vpu_device {
> > > >> > > struct vpu_attr attr;
> > > >> > > struct vpu_buf common_mem;
> > > >> > > u32 last_performance_cycles;
> > > >> > >- u32 sram_size;
> > > >> > > struct gen_pool *sram_pool;
> > > >> > > struct vpu_buf sram_buf;
> > > >> > > void __iomem *vdb_register;
> > > >> > >--
> > > >> > >2.44.0
> > > >> >
> > > >>