Skip to content

Commit 5f41113

Browse files
committed
feat: add key information extraction sample
1 parent e0f10bd commit 5f41113

12 files changed

+341
-4
lines changed

Create-TestFiles.ps1

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -47,7 +47,7 @@ param (
4747
# --- 2. 定义路径和文件名 ---
4848

4949
# 定义基础路径
50-
$basePath = "src/Cnblogs.DashScope.Tests.Shared/RawHttpData"
50+
$basePath = "test/Cnblogs.DashScope.Tests.Shared/RawHttpData"
5151

5252
# 构建文件名的基础部分
5353
$baseFileName = "$S-$P"

README.md

Lines changed: 101 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -733,6 +733,107 @@ Location: [356,175,401,175,401,207,356,207]
733733
RotateRect: [378,191,32,45,90]
734734
````
735735

736+
##### Key Information Extraction
737+
738+
When using built-in tasks, it is not recommended to enable streaming; otherwise, `completion.Output.Choices[0].Message.Content[0].OcrResult.KvResult` will be `null`.
739+
740+
To invoke this built-in task, set `Parameters.OcrOptions.Task` to `key_information_extraction`. No additional text information needs to be provided.
741+
742+
You can customize the output JSON format via `Parameters.OcrOptions.TaskConfig.ResultSchema` (with a maximum of 3 levels of nesting). If left blank, all fields will be output by default.
743+
744+
For example, suppose we want to extract objects of the following type from an image (JSON property names should, as much as possible, be based on the text present in the image):
745+
746+
```csharp
747+
internal class ReceiptModel()
748+
{
749+
[JsonPropertyName("乘车日期")]
750+
public string? Date { get; init; }
751+
752+
[JsonPropertyName("发票")]
753+
public ReceiptSerials? Serials { get; init; }
754+
}
755+
756+
internal class ReceiptSerials
757+
{
758+
[JsonPropertyName("发票代码")]
759+
public string? Code { get; init; }
760+
761+
[JsonPropertyName("发票号码")]
762+
public string? Serial { get; init; }
763+
}
764+
```
765+
766+
A property may be `null` if the model fails to extract a value for it.
767+
Example request:
768+
769+
```csharp
770+
await using var file = File.OpenRead("receipt.jpg");
771+
var ossLink = await client.UploadTemporaryFileAsync("qwen-vl-ocr-latest", file, "receipt.jpg");
772+
Console.WriteLine($"File uploaded: {ossLink}");
773+
var messages =
774+
new List<MultimodalMessage> { MultimodalMessage.User([MultimodalMessageContent.ImageContent(ossLink)]) };
775+
var completion = await client.GetMultimodalGenerationAsync(
776+
new ModelRequest<MultimodalInput, IMultimodalParameters>()
777+
{
778+
Model = "qwen-vl-ocr-latest",
779+
Input = new MultimodalInput() { Messages = messages },
780+
Parameters = new MultimodalParameters()
781+
{
782+
OcrOptions = new MultimodalOcrOptions()
783+
{
784+
Task = "key_information_extraction",
785+
TaskConfig = new MultimodalOcrTaskConfig()
786+
{
787+
ResultSchema = new Dictionary<string, object>()
788+
{
789+
{
790+
"发票",
791+
new Dictionary<string, string>()
792+
{
793+
{ "发票代码", "提取图中的发票代码,通常为一组数字或字母组合" },
794+
{ "发票号码", "提取发票上的号码,通常由纯数字组成。" }
795+
}
796+
},
797+
{ "乘车日期", "对应图中乘车日期时间,格式为年-月-日,比如2025-03-05" }
798+
}
799+
}
800+
}
801+
}
802+
});
803+
```
804+
805+
Consume:
806+
807+
`KvResult` is a `JsonElement`, so you can deserialize it to any type you need, or simply use `Dictionary<string, JsonElement?>`.
808+
809+
````csharp
810+
Console.WriteLine("Text:");
811+
Console.WriteLine(completion.Output.Choices[0].Message.Content[0].Text);
812+
Console.WriteLine("KvResults:");
813+
var model = completion.Output.Choices[0].Message.Content[0].OcrResult!.KvResult?.Deserialize<ReceiptModel>();
814+
Console.WriteLine($"Date: {model?.Date}");
815+
Console.WriteLine($"Code: {model?.Serials?.Code}");
816+
Console.WriteLine($"Serial: {model?.Serials?.Serial}");
817+
818+
/*
819+
Text:
820+
```json
821+
{
822+
"乘车日期": "2013-06-29",
823+
"发票": {
824+
"发票代码": "221021325353",
825+
"发票号码": "10283819"
826+
}
827+
}
828+
```
829+
KvResults:
830+
Date: 2013-06-29
831+
Code: 221021325353
832+
Serial: 10283819
833+
Usage: in(524)/out(65)/image(310)/total(589)
834+
*/
835+
````
836+
736837

737838

738839
## Text-to-Speech

sample/Cnblogs.DashScope.Sample/Cnblogs.DashScope.Sample.csproj

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -35,6 +35,9 @@
3535
<None Update="webpage.jpg">
3636
<CopyToOutputDirectory>PreserveNewest</CopyToOutputDirectory>
3737
</None>
38+
<None Update="receipt.jpg">
39+
<CopyToOutputDirectory>PreserveNewest</CopyToOutputDirectory>
40+
</None>
3841
</ItemGroup>
3942

4043
<ItemGroup>
Lines changed: 103 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,103 @@
1+
using System.Text.Json;
2+
using System.Text.Json.Serialization;
3+
using Cnblogs.DashScope.Core;
4+
5+
namespace Cnblogs.DashScope.Sample.Multimodal;
6+
7+
/// <summary>
/// Sample that uploads a receipt image and asks the OCR model to run the
/// <c>key_information_extraction</c> built-in task with a custom result schema.
/// </summary>
public class OcrKeyInformationExtractionSample : ISample
{
    // Model used both for the temporary upload and for the generation request.
    private const string ModelName = "qwen-vl-ocr-latest";

    // Image file that is opened from the working directory and uploaded.
    private const string ImageFileName = "receipt.jpg";

    /// <inheritdoc />
    public string Description => "OCR Key Information Extraction Sample";

    /// <inheritdoc />
    public async Task RunAsync(IDashScopeClient client)
    {
        // Upload the local image first; the returned link is what the request references.
        await using var file = File.OpenRead(ImageFileName);
        var ossLink = await client.UploadTemporaryFileAsync(ModelName, file, ImageFileName);
        Console.WriteLine($"File uploaded: {ossLink}");

        // The request carries a single user message containing only the image.
        var messages =
            new List<MultimodalMessage> { MultimodalMessage.User([MultimodalMessageContent.ImageContent(ossLink)]) };

        // Schema describing the fields to extract; the nested dictionary becomes a nested JSON object.
        var resultSchema = new Dictionary<string, object>()
        {
            ["发票"] = new Dictionary<string, string>()
            {
                ["发票代码"] = "提取图中的发票代码,通常为一组数字或字母组合",
                ["发票号码"] = "提取发票上的号码,通常由纯数字组成。"
            },
            ["乘车日期"] = "对应图中乘车日期时间,格式为年-月-日,比如2025-03-05"
        };

        var completion = await client.GetMultimodalGenerationAsync(
            new ModelRequest<MultimodalInput, IMultimodalParameters>()
            {
                Model = ModelName,
                Input = new MultimodalInput() { Messages = messages },
                Parameters = new MultimodalParameters()
                {
                    OcrOptions = new MultimodalOcrOptions()
                    {
                        Task = "key_information_extraction",
                        TaskConfig = new MultimodalOcrTaskConfig() { ResultSchema = resultSchema }
                    }
                }
            });

        var content = completion.Output.Choices[0].Message.Content[0];
        Console.WriteLine("Text:");
        Console.WriteLine(content.Text);
        Console.WriteLine("KvResults:");

        // Null-conditional access instead of '!': a response without an OCR result
        // prints empty values rather than throwing NullReferenceException.
        var model = content.OcrResult?.KvResult?.Deserialize<ReceiptModel>();
        Console.WriteLine($"Date: {model?.Date}");
        Console.WriteLine($"Code: {model?.Serials?.Code}");
        Console.WriteLine($"Serial: {model?.Serials?.Serial}");

        // Usage is only printed when the response includes it.
        if (completion.Usage is { } usage)
        {
            Console.WriteLine(
                $"Usage: in({usage.InputTokens})/out({usage.OutputTokens})/image({usage.ImageTokens})/total({usage.TotalTokens})");
        }
    }
}
66+
67+
/// <summary>
/// Deserialization target for the key-value result of the receipt sample.
/// JSON property names mirror the keys used in the request's result schema.
/// </summary>
internal class ReceiptModel
{
    /// <summary>
    /// Value of the "乘车日期" field; <c>null</c> when the field is absent from the JSON.
    /// </summary>
    [JsonPropertyName("乘车日期")]
    public string? Date { get; init; }

    /// <summary>
    /// Nested "发票" object; <c>null</c> when the field is absent from the JSON.
    /// </summary>
    [JsonPropertyName("发票")]
    public ReceiptSerials? Serials { get; init; }
}
75+
76+
/// <summary>
/// Nested part of the receipt model holding the two invoice identifiers.
/// </summary>
internal class ReceiptSerials
{
    /// <summary>Value of the "发票代码" field, or <c>null</c> when absent.</summary>
    [JsonPropertyName("发票代码")] public string? Code { get; init; }

    /// <summary>Value of the "发票号码" field, or <c>null</c> when absent.</summary>
    [JsonPropertyName("发票号码")] public string? Serial { get; init; }
}
84+
85+
86+
/*
87+
File uploaded: oss://dashscope-instant/52afe077fb4825c6d74411758cb1ab98/2025-11-29/16a422bd-811b-435a-9e2d-8538784dc64d/receipt.jpg
88+
Text:
89+
```json
90+
{
91+
"乘车日期": "2013-06-29",
92+
"发票": {
93+
"发票代码": "221021325353",
94+
"发票号码": "10283819"
95+
}
96+
}
97+
```
98+
KvResults:
99+
Date: 2013-06-29
100+
Code: 221021325353
101+
Serial: 10283819
102+
Usage: in(524)/out(65)/image(310)/total(589)
103+
*/
94.3 KB
Loading
Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,10 @@
1-
namespace Cnblogs.DashScope.Core;
1+
using System.Text.Json;
2+
3+
namespace Cnblogs.DashScope.Core;
24

35
/// <summary>
46
/// OCR result from the model.
57
/// </summary>
68
/// <param name="WordsInfo">The words that the model recognized.</param>
79
/// <param name="KvResult">Meta info extracted from the image.</param>
8-
public record MultimodalOcrResult(List<MultimodalOcrWordInfo>? WordsInfo, Dictionary<string, object?>? KvResult);
10+
public record MultimodalOcrResult(List<MultimodalOcrWordInfo>? WordsInfo, JsonElement? KvResult);

test/Cnblogs.DashScope.Sdk.UnitTests/MultimodalGenerationSerializationTests.cs

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -75,6 +75,7 @@ ModelResponse<MultimodalOutput, MultimodalTokenUsage>>> NoSseData
7575
Snapshots.MultimodalGeneration.AudioNoSse,
7676
Snapshots.MultimodalGeneration.OcrNoSse,
7777
Snapshots.MultimodalGeneration.OcrAdvancedRecognitionNoSse,
78+
Snapshots.MultimodalGeneration.OcrKeyInformationExtractionNoSse,
7879
Snapshots.MultimodalGeneration.VideoNoSse
7980
};
8081

Original file line numberDiff line numberDiff line change
@@ -0,0 +1,29 @@
1+
{
2+
"model": "qwen-vl-ocr-latest",
3+
"input":{
4+
"messages":[
5+
{
6+
"role": "user",
7+
"content": [
8+
{
9+
"image": "https://duguang-labelling.oss-cn-shanghai.aliyuncs.com/demo_ocr/receipt_zh_demo.jpg"
10+
}
11+
]
12+
}
13+
]
14+
},
15+
"parameters": {
16+
"ocr_options": {
17+
"task": "key_information_extraction",
18+
"task_config": {
19+
"result_schema": {
20+
"乘车日期": "对应图中乘车日期时间,格式为年-月-日,比如2025-03-05",
21+
"发票": {
22+
"发票代码": "提取图中的发票代码,通常为一组数字或字母组合",
23+
"发票号码": "提取发票上的号码,通常由纯数字组成。"
24+
}
25+
}
26+
}
27+
}
28+
}
29+
}
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,8 @@
1+
POST /api/v1/services/aigc/multimodal-generation/generation HTTP/1.1
2+
Content-Type: application/json
3+
Accept: */*
4+
Cache-Control: no-cache
5+
Host: dashscope.aliyuncs.com
6+
Accept-Encoding: gzip, deflate, br
7+
Connection: keep-alive
8+
Content-Length: 940
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
{"output":{"choices":[{"finish_reason":"stop","message":{"content":[{"ocr_result":{"kv_result":{"乘车日期":"2013-06-29","发票":{"发票代码":"221021325353","发票号码":"10283819"}}},"text":"```json\n{\n \"乘车日期\": \"2013-06-29\",\n \"发票\": {\n \"发票代码\": \"221021325353\",\n \"发票号码\": \"10283819\"\n }\n}\n```"}],"role":"assistant"}}]},"usage":{"image_tokens":310,"input_tokens":524,"input_tokens_details":{"image_tokens":310,"text_tokens":214},"output_tokens":65,"output_tokens_details":{"text_tokens":65},"total_tokens":589},"request_id":"5f79aafc-8749-4ea2-b122-7d8541d58b6c"}

0 commit comments

Comments
 (0)