From f417942efbe5abddf902d6b0704717d3b068605f Mon Sep 17 00:00:00 2001 From: capjamesg Date: Thu, 15 Aug 2024 08:00:27 +0000 Subject: [PATCH] Update results --- index.html | 148 ++++++++++++++++++++-------------------- results/2024-08-15.json | 106 ++++++++++++++++++++++++++++ 2 files changed, 179 insertions(+), 75 deletions(-) create mode 100644 results/2024-08-15.json diff --git a/index.html b/index.html index 8e2af90..3a466d9 100644 --- a/index.html +++ b/index.html @@ -40,7 +40,7 @@

How's GPT-4o Doing?

You can contribute your own tests, too! See the GitHub README for contributing instructions.

-

Tests are run every day at 1am PT. Last updated August 14, 2024.

+

Tests are run every day at 1am PT. Last updated August 15, 2024.

Made with ❤️ by the team at Roboflow.

@@ -122,66 +122,12 @@

Prompt

Image

Image of the input into GPT-4

Result

-
7
+
There are 8 fruits in the image.

Test submitted by Roboflow

- -
-
-
-

Handwriting OCR

-

Can GPT-4V read handwriting?

-
-
-
-

Fail

-
-
-
-
-
- Last 7-Day Performance -
- -
- -
- -
- -
- -
- -
- -
- -
-
-

Of the last 7 tests, conducted daily, this test has passed 86.0% of the time.

-

Today's request cost $0.009

-
-
- -
-

Method

-
We send a image of a handwritten note to determine if it can correctly read the text. If it correctly gets the text, it gets a 100%. Otherwise, it gets a 0%.
-

Prompt

-
-                                            Read the text in the image. Return only the text, with punctuation.
-                                        
-

Image

- Image of the input into GPT-4 -

Result

-
The words of songs on the album have been echoing in my head all week. "Fades into the grey of my day oh tea."
-

Test submitted by Roboflow

-
-
-
- +
@@ -230,7 +176,7 @@

Prompt

Image

Image of the input into GPT-4

Result

-
{'x': 0.42, 'y': 0.46, 'width': 0.18, 'height': 0.28}
+
{'x': 0.345, 'y': 0.371, 'width': 0.242, 'height': 0.397}

Test submitted by Roboflow

@@ -287,19 +233,19 @@

Result

```json
 {
   "A": {
-    "quantity": 20,
+    "quantity": 15,
     "price": 10
   },
   "B": {
-    "quantity": 23,
+    "quantity": 25,
     "price": 20
   },
   "C": {
-    "quantity": 27,
+    "quantity": 30,
     "price": 30
   },
   "D": {
-    "quantity": 30,
+    "quantity": 35,
     "price": 40
   }
 }
@@ -358,9 +304,9 @@ 

Image

Image of the input into GPT-4

Result

Failed to produce a valid JSON output: {
-  "R": 80,
+  "R": 79,
   "G": 0,
-  "B": 139
+  "B": 128
 }

Test submitted by Roboflow

@@ -401,7 +347,7 @@

Annotation Quality Assurance

Of the last 7 tests, conducted daily, this test has passed 0% of the time.

-

Today's request cost $0.017

+

Today's request cost $0.016

@@ -415,15 +361,13 @@

Prompt

Image

Image of the input into GPT-4

Result

-
Based on the image provided, it appears there are two cars on the left side of the image that have not been labeled with red bounding boxes. 
-
-Hence, the JSON output for the number of missing annotations is:
-
-```json
+                                        
```json
 {
-  "missing": 2
+  "missing": 1
 }
-```
+``` + +In the provided image, there are seven cars visible, but only six are labeled with red bounding boxes. Therefore, there is one missing annotation.

Test submitted by Roboflow

@@ -604,7 +548,61 @@

Result

- + +
+
+
+

Handwriting OCR

+

Can GPT-4V read handwriting?

+
+
+
+

Pass

+
+
+
+
+
+ Last 7-Day Performance +
+ +
+ +
+ +
+ +
+ +
+ +
+ +
+ +
+
+

Of the last 7 tests, conducted daily, this test has passed 86.0% of the time.

+

Today's request cost $0.009

+
+
+ +
+

Method

+
We send a image of a handwritten note to determine if it can correctly read the text. If it correctly gets the text, it gets a 100%. Otherwise, it gets a 0%.
+

Prompt

+
+                                            Read the text in the image. Return only the text, with punctuation.
+                                        
+

Image

+ Image of the input into GPT-4 +

Result

+
The words of songs on the album have been echoing in my head all week. "Fades into the grey of my day old tea."
+

Test submitted by Roboflow

+
+
+
+
@@ -693,7 +691,7 @@

Math OCR

Of the last 7 tests, conducted daily, this test has passed 100% of the time.

-

Today's request cost $0.016

+

Today's request cost $0.015

@@ -761,7 +759,7 @@

Prompt

Image

Image of the input into GPT-4

Result

-
charybdis indubitable
+
charybdis in-dubitable

Test submitted by Charles Frye

diff --git a/results/2024-08-15.json b/results/2024-08-15.json new file mode 100644 index 0000000..1b5e88b --- /dev/null +++ b/results/2024-08-15.json @@ -0,0 +1,106 @@ +{ + "zero_shot_classification": { + "score": 1, + "success": true, + "price": 0.00481, + "pass_fail": "Pass", + "response_time": 2.920799493789673, + "result": "Toyota Camry" + }, + "count_fruit": { + "score": 0, + "success": false, + "price": 0.008110000000000001, + "pass_fail": "Fail", + "response_time": 3.9328646659851074, + "result": "There are 8 fruits in the image." + }, + "document_ocr": { + "score": 1, + "success": true, + "price": 0.008539999999999999, + "pass_fail": "Pass", + "response_time": 4.2253382205963135, + "result": "I was thinking earlier today that I have gone through, to use the lingo, eras of listening to each of Swift's Eras. Meta indeed. I started listening to Ms. Swift's music after hearing the Midnights album. A few weeks after hearing the album for the first time, I found myself playing various songs on repeat. I listened to the album in order multiple times." + }, + "handwriting_ocr": { + "score": 1, + "success": true, + "price": 0.00876, + "pass_fail": "Pass", + "response_time": 5.472809791564941, + "result": "The words of songs on the album have been echoing in my head all week. \"Fades into the grey of my day old tea.\"" + }, + "extraction_ocr": { + "score": 1.0, + "success": true, + "price": 0.00719, + "pass_fail": "Pass", + "response_time": 3.0897085666656494, + "result": "[{'name': 'Mary Thomas', 'time_per_day': 1, 'medication': 'Atenolol', 'dosage': 100, 'rx_number': '1234567-12345'}]" + }, + "math_ocr": { + "score": 1.0, + "success": true, + "price": 0.015290000000000002, + "pass_fail": "Pass", + "response_time": 3.207291603088379, + "result": "3x^2-6x+2" + }, + "object_detection": { + "score": 0.36390302373693995, + "success": false, + "price": 0.009490000000000002, + "pass_fail": "Fail", + "response_time": 3.2578036785125732, + "result": "{'x': 0.345, 'y': 0.371, 'width': 0.242, 'height': 0.397}" + }, + "graph_understanding": { + "score": 0.965, + "success": false, + "price": 0.01079, + "pass_fail": "Fail", + "response_time": 3.014864683151245, + "result": "```json\n{\n \"A\": {\n \"quantity\": 15,\n \"price\": 10\n },\n \"B\": {\n \"quantity\": 25,\n \"price\": 20\n },\n \"C\": {\n \"quantity\": 30,\n \"price\": 30\n },\n \"D\": {\n \"quantity\": 35,\n \"price\": 40\n }\n}\n```" + }, + "color_recognition": { + "score": 0, + "success": false, + "price": 0.00875, + "pass_fail": "Fail", + "response_time": 4.1007609367370605, + "result": "Failed to produce a valid JSON output: {\n \"R\": 79,\n \"G\": 0,\n \"B\": 128\n}" + }, + "annotation_qa": { + "score": 0.33333333333333337, + "success": false, + "price": 0.016200000000000003, + "pass_fail": "Fail", + "response_time": 4.372694730758667, + "result": "```json\n{\n \"missing\": 1\n}\n```\n\nIn the provided image, there are seven cars visible, but only six are labeled with red bounding boxes. Therefore, there is one missing annotation." + }, + "measurement": { + "score": 0.8571428571428572, + "success": false, + "price": 0.00877, + "pass_fail": "Fail", + "response_time": 6.369856357574463, + "result": "```json\n{\n \"length\": 3.0,\n \"width\": 3.0\n}\n```" + }, + "easy_captcha": { + "score": 1, + "success": true, + "price": 0.0048200000000000005, + "pass_fail": "Pass", + "response_time": 1.162583589553833, + "result": "charybdis in-dubitable" + }, + "easy_captcha_persuade": { + "score": 1, + "success": true, + "price": 0.00529, + "pass_fail": "Pass", + "response_time": 1.37984037399292, + "result": "charybdis indubitable" + } +} \ No newline at end of file