Re: [RFC PATCH v1] media: uvcvideo: Cache URB header data before processing

From: Keiichi Watanabe
Date: Tue Jul 24 2018 - 00:18:33 EST


Hi Kieran,

How is it going?
I would appreciate if you could review this patch.

Best regards,
Keiichi

On Thu, Jun 28, 2018 at 2:21 AM, Kieran Bingham
<kieran.bingham@xxxxxxxxxxxxxxxx> wrote:
> Hi Keiichi,
>
> I just wanted to reply here quickly to say this is a really interesting
> development.
>
> I would like to review and test it properly; however I am now on holiday
> until at least the 9th July and so I won't be able to look until after
> that date.
>
> I didn't want you to be left hanging though :-)
>
> Ping me if you haven't heard anything by the 20th or so. Of course if
> anyone else gets round to testing and reviewing then that's great too.
>
> Regards
> --
> Kieran
>
>
>
> On 27/06/18 11:34, Keiichi Watanabe wrote:
>> On some platforms with non-coherent DMA (e.g. ARM), USB drivers use
>> uncached memory allocation methods. In such situations, it sometimes
>> takes a long time to access URB buffers. This can be a cause of video
>> flickering problems if a resolution is high and a USB controller has
>> a very tight time limit. (e.g. dwc2) To avoid this problem, we copy
>> header data from (uncached) URB buffer into (cached) local buffer.
>>
>> This change should make the elapsed time of the interrupt handler
>> shorter on platforms with non-coherent DMA. We measured the elapsed
>> time of each callback of uvc_video_complete without/with this patch
>> while capturing Full HD video in
>> https://webrtc.github.io/samples/src/content/getusermedia/resolution/.
>> I tested it on the top of Kieran Bingham's Asynchronous UVC series
>> https://www.mail-archive.com/linux-media@xxxxxxxxxxxxxxx/msg128359.html.
>> The test device was Jerry Chromebook (RK3288) with Logitech Brio 4K.
>> I collected data for 5 seconds. (There were around 480 callbacks in
>> this case.) The following result shows that this patch makes
>> uvc_video_complete about 2x faster.
>>
>> | average | median | min | max | standard deviation
>> w/o caching| 45319ns | 40250ns | 33834ns | 142625ns| 16611ns
>> w/ caching| 20620ns | 19250ns | 12250ns | 56583ns | 6285ns
>>
>> In addition, we confirmed that this patch doesn't make it worse on
>> coherent DMA architecture by performing the same measurements on a
>> Broadwell Chromebox with the same camera.
>>
>> | average | median | min | max | standard deviation
>> w/o caching| 21026ns | 21424ns | 12263ns | 23956ns | 1932ns
>> w/ caching| 20728ns | 20398ns | 8922ns | 45120ns | 3368ns
>>
>> Signed-off-by: Keiichi Watanabe <keiichiw@xxxxxxxxxxxx>
>> ---
>>
>> After applying 6 patches in
>> https://www.mail-archive.com/linux-media@xxxxxxxxxxxxxxx/msg128359.html,
>> I measured elapsed time by adding the following code to
>> /drivers/media/usb/uvc/uvc_video.c
>>
>> @@ -XXXX,6 +XXXX,9 @@ static void uvc_video_complete(struct urb *urb)
>> struct uvc_video_queue *queue = &stream->queue;
>> struct uvc_buffer *buf = NULL;
>> int ret;
>> + ktime_t start, end;
>> + int elapsed_time;
>> + start = ktime_get();
>> switch (urb->status) {
>> case 0:
>>
>> @@ -XXXX,6 +XXXX,10 @@ static void uvc_video_complete(struct urb *urb)
>>
>> INIT_WORK(&uvc_urb->work, uvc_video_copy_data_work);
>> queue_work(stream->async_wq, &uvc_urb->work);
>> +
>> + end = ktime_get();
>> + elapsed_time = ktime_to_ns(ktime_sub(end, start));
>> + pr_err("elapsed time: %d ns", elapsed_time);
>> }
>>
>> /*
>>
>>
>> drivers/media/usb/uvc/uvc_video.c | 92 +++++++++++++++----------------
>> 1 file changed, 43 insertions(+), 49 deletions(-)
>> diff --git a/drivers/media/usb/uvc/uvc_video.c b/drivers/media/usb/uvc/uvc_video.c
>> index a88b2e51a666..ff2eddc55530 100644
>> --- a/drivers/media/usb/uvc/uvc_video.c
>> +++ b/drivers/media/usb/uvc/uvc_video.c
>> @@ -391,36 +391,15 @@ static inline ktime_t uvc_video_get_time(void)
>>
>> static void
>> uvc_video_clock_decode(struct uvc_streaming *stream, struct uvc_buffer *buf,
>> - const u8 *data, int len)
>> + const u8 *data, int len, unsigned int header_size,
>> + bool has_pts, bool has_scr)
>> {
>> struct uvc_clock_sample *sample;
>> - unsigned int header_size;
>> - bool has_pts = false;
>> - bool has_scr = false;
>> unsigned long flags;
>> ktime_t time;
>> u16 host_sof;
>> u16 dev_sof;
>>
>> - switch (data[1] & (UVC_STREAM_PTS | UVC_STREAM_SCR)) {
>> - case UVC_STREAM_PTS | UVC_STREAM_SCR:
>> - header_size = 12;
>> - has_pts = true;
>> - has_scr = true;
>> - break;
>> - case UVC_STREAM_PTS:
>> - header_size = 6;
>> - has_pts = true;
>> - break;
>> - case UVC_STREAM_SCR:
>> - header_size = 8;
>> - has_scr = true;
>> - break;
>> - default:
>> - header_size = 2;
>> - break;
>> - }
>> -
>> /* Check for invalid headers. */
>> if (len < header_size)
>> return;
>> @@ -717,11 +696,10 @@ void uvc_video_clock_update(struct uvc_streaming *stream,
>> */
>>
>> static void uvc_video_stats_decode(struct uvc_streaming *stream,
>> - const u8 *data, int len)
>> + const u8 *data, int len,
>> + unsigned int header_size, bool has_pts,
>> + bool has_scr)
>> {
>> - unsigned int header_size;
>> - bool has_pts = false;
>> - bool has_scr = false;
>> u16 uninitialized_var(scr_sof);
>> u32 uninitialized_var(scr_stc);
>> u32 uninitialized_var(pts);
>> @@ -730,25 +708,6 @@ static void uvc_video_stats_decode(struct uvc_streaming *stream,
>> stream->stats.frame.nb_packets == 0)
>> stream->stats.stream.start_ts = ktime_get();
>>
>> - switch (data[1] & (UVC_STREAM_PTS | UVC_STREAM_SCR)) {
>> - case UVC_STREAM_PTS | UVC_STREAM_SCR:
>> - header_size = 12;
>> - has_pts = true;
>> - has_scr = true;
>> - break;
>> - case UVC_STREAM_PTS:
>> - header_size = 6;
>> - has_pts = true;
>> - break;
>> - case UVC_STREAM_SCR:
>> - header_size = 8;
>> - has_scr = true;
>> - break;
>> - default:
>> - header_size = 2;
>> - break;
>> - }
>> -
>> /* Check for invalid headers. */
>> if (len < header_size || data[0] < header_size) {
>> stream->stats.frame.nb_invalid++;
>> @@ -957,10 +916,41 @@ static void uvc_video_stats_stop(struct uvc_streaming *stream)
>> * to be called with a NULL buf parameter. uvc_video_decode_data and
>> * uvc_video_decode_end will never be called with a NULL buffer.
>> */
>> +static void uvc_video_decode_header_size(const u8 *data, int *header_size,
>> + bool *has_pts, bool *has_scr)
>> +{
>> + switch (data[1] & (UVC_STREAM_PTS | UVC_STREAM_SCR)) {
>> + case UVC_STREAM_PTS | UVC_STREAM_SCR:
>> + *header_size = 12;
>> + *has_pts = true;
>> + *has_scr = true;
>> + break;
>> + case UVC_STREAM_PTS:
>> + *header_size = 6;
>> + *has_pts = true;
>> + break;
>> + case UVC_STREAM_SCR:
>> + *header_size = 8;
>> + *has_scr = true;
>> + break;
>> + default:
>> + *header_size = 2;
>> + }
>> +}
>> +
>> static int uvc_video_decode_start(struct uvc_streaming *stream,
>> - struct uvc_buffer *buf, const u8 *data, int len)
>> + struct uvc_buffer *buf, const u8 *urb_data,
>> + int len)
>> {
>> u8 fid;
>> + u8 data[12];
>> + unsigned int header_size;
>> + bool has_pts = false, has_scr = false;
>> +
>> + /* Cache the header since urb_data is uncached memory. The size of
>> + * header is at most 12 bytes.
>> + */
>> + memcpy(data, urb_data, min(len, 12));
>>
>> /* Sanity checks:
>> * - packet must be at least 2 bytes long
>> @@ -983,8 +973,12 @@ static int uvc_video_decode_start(struct uvc_streaming *stream,
>> uvc_video_stats_update(stream);
>> }
>>
>> - uvc_video_clock_decode(stream, buf, data, len);
>> - uvc_video_stats_decode(stream, data, len);
>> + uvc_video_decode_header_size(data, &header_size, &has_pts, &has_scr);
>> +
>> + uvc_video_clock_decode(stream, buf, data, len, header_size, has_pts,
>> + has_scr);
>> + uvc_video_stats_decode(stream, data, len, header_size, has_pts,
>> + has_scr);
>>
>> /* Store the payload FID bit and return immediately when the buffer is
>> * NULL.
>> --
>> 2.18.0.rc2.346.g013aa6912e-goog
>>
>
> --
> Regards
> --
> Kieran