Skip to content

Commit c46300e

Browse files
author
Leo Lindén
authored
Merge pull request #1 from superannotateai/text_annotation
Text annotation
2 parents 0bf20bc + 2c8e052 commit c46300e

16 files changed

+464
-15
lines changed

readme.md

Lines changed: 2 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -7,15 +7,14 @@ This Python package provides a set of utilities for working with SuperAnnotate d
77
### Features
88
- Process SuperAnnotate vector instance data.
99
- Convert processed data into a PySpark DataFrame.
10-
- Write DataFrame to a Delta table, creating the table if it does not exist, or updating the existing table with new data.
1110

1211

1312
### Example notebooks.
1413
Copy the notebooks in the demo folder to your databricks workspace to get started with SuperAnnotate quickly!
1514

1615
### Installation
1716
```bash
18-
pip install your-package-name
17+
pip install superannotate_databricks_connector
1918
```
2019

2120
### Tests
@@ -29,7 +28,7 @@ docker build -f Dockerfile.test -t test_package .
2928
First import the required function
3029

3130
```python
32-
from your_package_name import get_vector_dataframe, write_annotations_to_delta
31+
from superannotate_databricks_connector.vector import get_vector_dataframe
3332
from superannotate import SAClient
3433
```
3534

@@ -41,8 +40,3 @@ annotations = sa.get_annotations("<PROJECT_NAME>")
4140
df = get_vector_dataframe(annotations, spark)
4241
```
4342

44-
Finally you can write the data frame to a delta table
45-
46-
```python
47-
write_annotations_to_delta(df, "your_database", "your_table", spark)
48-
```
Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,2 +0,0 @@
1-
from .vector import *
2-
from .schemas import *
Lines changed: 55 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,55 @@
1+
from pyspark.sql.types import (
2+
StructType,
3+
StructField,
4+
StringType,
5+
IntegerType,
6+
MapType,
7+
ArrayType,
8+
TimestampType
9+
)
10+
11+
12+
def get_text_entity_schema():
    """
    Return the Spark ``StructType`` describing a single text "entity"
    instance (a labelled character span with start/end offsets).
    """
    # Attributes arrive as a list of {name: value} string maps.
    attribute_type = ArrayType(MapType(StringType(), StringType()))
    # createdBy / updatedBy are simple string-to-string maps (e.g. email, role).
    user_type = MapType(StringType(), StringType())
    fields = [
        StructField("id", StringType(), False),
        StructField("start", IntegerType(), False),
        StructField("end", IntegerType(), False),
        StructField("classId", IntegerType(), False),
        StructField("attributes", attribute_type),
        StructField("type", StringType(), False),
        StructField("creationType", StringType(), True),
        StructField("createdAt", TimestampType(), True),
        StructField("createdBy", user_type, True),
        StructField("updatedAt", TimestampType(), True),
        StructField("updatedBy", user_type, True),
        StructField("className", StringType(), False),
    ]
    return StructType(fields)
28+
29+
30+
def get_text_tag_schema():
    """
    Return the Spark ``StructType`` describing a single text "tag"
    instance (an item-level label with optional attributes).
    """
    # Attributes arrive as a list of {name: value} string maps.
    attribute_type = ArrayType(MapType(StringType(), StringType()))
    return StructType([
        StructField("type", StringType(), True),
        StructField("className", StringType(), True),
        StructField("attributes", attribute_type, True),
    ])
39+
40+
41+
def get_text_schema():
    """
    Return the Spark ``StructType`` for one annotated text item: item
    metadata plus nested arrays of entity and tag instances.
    """
    fields = [
        StructField("name", StringType(), True),
        StructField("url", StringType(), True),
        StructField("contentLength", IntegerType(), True),
        StructField("projectId", IntegerType(), True),
        StructField("status", StringType(), True),
        StructField("annotatorEmail", StringType(), True),
        StructField("qaEmail", StringType(), True),
        # Nested instance arrays reuse the per-instance schemas above.
        StructField("entities", ArrayType(get_text_entity_schema()), True),
        StructField("tags", ArrayType(get_text_tag_schema()), True),
    ]
    return StructType(fields)
Lines changed: 56 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,56 @@
1+
from datetime import datetime
2+
from .schemas.text_schema import get_text_schema
3+
4+
5+
def convert_dates(instance):
    """
    Parse the ``createdAt`` and ``updatedAt`` timestamp strings of an
    annotation instance into ``datetime`` objects, in place.

    Args:
        instance (dict): One annotation instance. When present, the date
            fields are strings like ``"2023-01-31T12:00:00.000Z"``.

    Returns:
        dict: The same instance object, with any present string date
        fields replaced by ``datetime`` values.
    """
    # SuperAnnotate timestamps look like 2023-01-31T12:00:00.000Z.
    date_format = "%Y-%m-%dT%H:%M:%S.%fZ"
    for key in ("createdAt", "updatedAt"):
        value = instance.get(key)
        # Some instance types (e.g. tags — whose schema has no date
        # columns) may omit these fields or carry None; skip those
        # instead of raising KeyError/TypeError as the original did.
        if isinstance(value, str):
            instance[key] = datetime.strptime(value, date_format)
    return instance
22+
23+
24+
def get_text_dataframe(annotations, spark):
    """
    Transform a list of SuperAnnotate annotations from a text project
    into a Spark dataframe, one row per annotated item.

    Args:
        annotations (list[dict]): The annotations in the SuperAnnotate
            format (each item has "metadata" and "instances" keys).
        spark (SparkSession): The active Spark session.

    Returns:
        pyspark.sql.DataFrame: A dataframe following ``get_text_schema()``,
        with instances split into "entities" and "tags" columns.
    """
    rows = []
    for item in annotations:
        metadata = item["metadata"]
        instances = item["instances"]
        flattened_item = {
            "name": metadata["name"],
            "url": metadata["url"],
            "contentLength": metadata["contentLength"],
            # BUG FIX: key was misspelled "projecId", which silently left
            # the schema's "projectId" column empty in every row.
            "projectId": metadata["projectId"],
            "status": metadata["status"],
            "annotatorEmail": metadata["annotatorEmail"],
            "qaEmail": metadata["qaEmail"],
            "entities": [convert_dates(instance) for instance in instances
                         if instance["type"] == "entity"],
            "tags": [convert_dates(instance) for instance in instances
                     if instance["type"] == "tag"],
        }
        rows.append(flattened_item)
    schema = get_text_schema()
    spark_df = spark.createDataFrame(rows, schema=schema)
    return spark_df

0 commit comments

Comments
 (0)