Skip to content

Commit c46300e

Browse files
author
Leo Lindén
authored
Merge pull request #1 from superannotateai/text_annotation
Text annotation
2 parents 0bf20bc + 2c8e052 commit c46300e

16 files changed

+464
-15
lines changed

readme.md

Lines changed: 2 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -7,15 +7,14 @@ This Python package provides a set of utilities for working with SuperAnnotate d
77
### Features
88
- Process SuperAnnotate vector instance data.
99
- Convert processed data into a PySpark DataFrame.
10-
- Write DataFrame to a Delta table, creating the table if it does not exist, or updating the existing table with new data.
1110

1211

1312
### Example notebooks.
1413
Copy the notebooks in the demo folder to your databricks workspace to get started with SuperAnnotate quickly!
1514

1615
### Installation
1716
```bash
18-
pip install your-package-name
17+
pip install superannotate_databricks_connector
1918
```
2019

2120
### Tests
@@ -29,7 +28,7 @@ docker build -f Dockerfile.test -t test_package .
2928
First import the required function
3029

3130
```python
32-
from your_package_name import get_vector_dataframe, write_annotations_to_delta
31+
from superannotate_databricks_connector.vector import get_vector_dataframe
3332
from superannotate import SAClient
3433
```
3534

@@ -41,8 +40,3 @@ annotations = sa.get_annotations("<PROJECT_NAME>")
4140
df = get_vector_dataframe(annotations, spark)
4241
```
4342

44-
Finally you can write the data frame to a delta table
45-
46-
```python
47-
write_annotations_to_delta(df, "your_database", "your_table", spark)
48-
```
Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,2 +0,0 @@
1-
from .vector import *
2-
from .schemas import *
Lines changed: 55 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,55 @@
1+
from pyspark.sql.types import (
2+
StructType,
3+
StructField,
4+
StringType,
5+
IntegerType,
6+
MapType,
7+
ArrayType,
8+
TimestampType
9+
)
10+
11+
12+
def get_text_entity_schema():
    """
    Return the Spark ``StructType`` describing a single text "entity"
    instance (a labelled character span with start/end offsets).
    """
    # Attributes arrive as a list of {name: value} string maps.
    attribute_type = ArrayType(MapType(StringType(), StringType()))
    # createdBy / updatedBy are simple string-to-string maps (e.g. email, role).
    user_type = MapType(StringType(), StringType())
    fields = [
        StructField("id", StringType(), False),
        StructField("start", IntegerType(), False),
        StructField("end", IntegerType(), False),
        StructField("classId", IntegerType(), False),
        StructField("attributes", attribute_type),
        StructField("type", StringType(), False),
        StructField("creationType", StringType(), True),
        StructField("createdAt", TimestampType(), True),
        StructField("createdBy", user_type, True),
        StructField("updatedAt", TimestampType(), True),
        StructField("updatedBy", user_type, True),
        StructField("className", StringType(), False),
    ]
    return StructType(fields)
28+
29+
30+
def get_text_tag_schema():
    """
    Return the Spark ``StructType`` describing a single text "tag"
    instance (an item-level label with optional attributes).
    """
    # Attributes arrive as a list of {name: value} string maps.
    attribute_type = ArrayType(MapType(StringType(), StringType()))
    return StructType([
        StructField("type", StringType(), True),
        StructField("className", StringType(), True),
        StructField("attributes", attribute_type, True),
    ])
39+
40+
41+
def get_text_schema():
    """
    Return the Spark ``StructType`` for one annotated text item: item
    metadata plus nested arrays of entity and tag instances.
    """
    fields = [
        StructField("name", StringType(), True),
        StructField("url", StringType(), True),
        StructField("contentLength", IntegerType(), True),
        StructField("projectId", IntegerType(), True),
        StructField("status", StringType(), True),
        StructField("annotatorEmail", StringType(), True),
        StructField("qaEmail", StringType(), True),
        # Nested instance arrays reuse the per-instance schemas above.
        StructField("entities", ArrayType(get_text_entity_schema()), True),
        StructField("tags", ArrayType(get_text_tag_schema()), True),
    ]
    return StructType(fields)
Lines changed: 56 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,56 @@
1+
from datetime import datetime
2+
from .schemas.text_schema import get_text_schema
3+
4+
5+
def convert_dates(instance):
    """
    Parse the ``createdAt`` and ``updatedAt`` timestamp strings of an
    annotation instance into ``datetime`` objects, in place.

    Args:
        instance (dict): One annotation instance. When present, the date
            fields are strings like ``"2023-01-31T12:00:00.000Z"``.

    Returns:
        dict: The same instance object, with any present string date
        fields replaced by ``datetime`` values.
    """
    # SuperAnnotate timestamps look like 2023-01-31T12:00:00.000Z.
    date_format = "%Y-%m-%dT%H:%M:%S.%fZ"
    for key in ("createdAt", "updatedAt"):
        value = instance.get(key)
        # Some instance types (e.g. tags — whose schema has no date
        # columns) may omit these fields or carry None; skip those
        # instead of raising KeyError/TypeError as the original did.
        if isinstance(value, str):
            instance[key] = datetime.strptime(value, date_format)
    return instance
22+
23+
24+
def get_text_dataframe(annotations, spark):
    """
    Transform a list of SuperAnnotate annotations from a text project
    into a Spark dataframe, one row per annotated item.

    Args:
        annotations (list[dict]): The annotations in the SuperAnnotate
            format (each item has "metadata" and "instances" keys).
        spark (SparkSession): The active Spark session.

    Returns:
        pyspark.sql.DataFrame: A dataframe following ``get_text_schema()``,
        with instances split into "entities" and "tags" columns.
    """
    rows = []
    for item in annotations:
        metadata = item["metadata"]
        instances = item["instances"]
        flattened_item = {
            "name": metadata["name"],
            "url": metadata["url"],
            "contentLength": metadata["contentLength"],
            # BUG FIX: key was misspelled "projecId", which silently left
            # the schema's "projectId" column empty in every row.
            "projectId": metadata["projectId"],
            "status": metadata["status"],
            "annotatorEmail": metadata["annotatorEmail"],
            "qaEmail": metadata["qaEmail"],
            "entities": [convert_dates(instance) for instance in instances
                         if instance["type"] == "entity"],
            "tags": [convert_dates(instance) for instance in instances
                     if instance["type"] == "tag"],
        }
        rows.append(flattened_item)
    schema = get_text_schema()
    spark_df = spark.createDataFrame(rows, schema=schema)
    return spark_df

0 commit comments

Comments
 (0)