# Import a labeled dataset (text)

## Initialize the Hugging Face dataset, models, and pipeline

```python
datasets = load_dataset("Babelscape/wikineural")

tokenizer = AutoTokenizer.from_pretrained("dslim/bert-large-NER")
model = AutoModelForTokenClassification.from_pretrained("dslim/bert-large-NER")

## Use the GPU if one is available (the line below is IPython/Colab shell magic)
gpu_info = !nvidia-smi
gpu_info = '\n'.join(gpu_info)
if gpu_info.find('failed') >= 0:
    print('Not connected to a GPU')
    nlp = pipeline("ner", model=model, tokenizer=tokenizer)
    embedding_model = SentenceTransformer('sentence-transformers/bert-base-nli-mean-tokens')
else:
    print(gpu_info)
    nlp = pipeline("ner", model=model, tokenizer=tokenizer, device=0)
    embedding_model = SentenceTransformer('sentence-transformers/bert-base-nli-mean-tokens', device=0)
```

## Set up the client and configure parameters

```python
## Enter your API key here
LB_API_KEY = "YOUR_API_KEY"
client = labelbox.Client(LB_API_KEY)

## Batch size for data row creation and annotation bulk import. 500-1000 is a recommended size.
BATCH_SIZE = 500

## Maximum number of data rows to import. The WikiNEuRal dataset has ~1.1M data rows.
MAX_DATA_ROW_LIMIT = 2000
```
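The snippets in this tutorial reference classes and helpers that are imported at the top of the notebook. A minimal import block is sketched below as an assumption: the `labelbox` paths match the SDK generation these snippets use (e.g., `LabelList`, `NDJsonConverter`, `setup_editor`) and may differ in newer releases, so adjust them to your installed version.

```python
## NOTE: the labelbox import paths below are an assumption for the 3.x-era SDK;
## verify them against the version you have installed.
import os
import time
import random
import binascii
from uuid import uuid4

import tqdm
import labelbox
from labelbox import DataRow, LabelImport, MediaType, OntologyBuilder, Tool
from labelbox.schema.data_row_metadata import DataRowMetadataField
from labelbox.data.annotation_types import (
    Label, LabelList, ObjectAnnotation, TextData, TextEntity,
)
from labelbox.data.serialization import NDJsonConverter

from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForTokenClassification, pipeline
from sentence_transformers import SentenceTransformer
```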
## Functions

Two helpers do most of the work: `generate_predictions()` runs the NER pipeline over a data row and converts high-confidence spans into Labelbox text entity annotations, and `create_data_rows_payload()` builds the data row dict with language and embedding metadata.

```python
def create_ner_objects(class_name, st, en):
    named_entity = TextEntity(start=st, end=en)
    named_entity_annotation = ObjectAnnotation(value=named_entity, name=class_name)
    return named_entity_annotation


def generate_predictions(datarow):
    external_id = datarow["external_id"]
    dataset_name = external_id.split("_")[0] + "_" + external_id.split("_")[1]
    datarow_index = int(external_id.split("_")[2].split(".")[0])
    uid = datarow['id']

    text_data_row = datasets[dataset_name][datarow_index]
    tokens = text_data_row["tokens"]
    tokenized_input = tokenizer(tokens, is_split_into_words=True)
    sentence = tokenizer.decode(tokenized_input["input_ids"], skip_special_tokens=True)

    annotations = []
    try:
        ## Generate predictions
        predictions = nlp(sentence)

        ## Process predictions and compute text entities
        for item in predictions:
            score = item['score']
            if score > 0.99:
                entity = item['entity']
                start = item['start']
                end = item['end']
                index = predictions.index(item)
                if entity == "B-PER":
                    for next_item in predictions[index+1:]:
                        if next_item['entity'] == "I-PER":
                            end = next_item['end']
                        else:
                            break
                    annotations.append(create_ner_objects("PER", start, end-1))
                if entity == "B-ORG":
                    for next_item in predictions[index+1:]:
                        if next_item['entity'] == "I-ORG":
                            end = next_item['end']
                        else:
                            break
                    annotations.append(create_ner_objects("ORG", start, end-1))
                if entity == "B-LOC":
                    for next_item in predictions[index+1:]:
                        if next_item['entity'] == "I-LOC":
                            end = next_item['end']
                        else:
                            break
                    annotations.append(create_ner_objects("LOC", start, end-1))
                if entity == "B-MISC":
                    for next_item in predictions[index+1:]:
                        if next_item['entity'] == "I-MISC":
                            end = next_item['end']
                        else:
                            break
                    annotations.append(create_ner_objects("MISC", start, end-1))
    except Exception as e:
        print(e)

    text_data = TextData(uid=uid)
    return text_data, annotations


def create_data_rows_payload(payload):
    data_row_content = None
    label_content = None
    try:
        h, text_data_row, lang = payload
        file_name = lang + "_" + str(h) + '.txt'

        tokens = text_data_row["tokens"]
        tokenized_input = tokenizer(tokens, is_split_into_words=True)
        sentence = tokenizer.decode(tokenized_input["input_ids"], skip_special_tokens=True)

        embeddings = embedding_model.encode(sentence)
        embeddings_metadata = DataRowMetadataField(
            schema_id=embedding_field.uid,
            ## Labelbox currently only supports custom embeddings of max length 128
            value=embeddings[:128].tolist(),
        )
        language_metadata = DataRowMetadataField(
            schema_id=language_field.uid,
            value=lang,
        )
        metadata_payload = [language_metadata, embeddings_metadata]

        data_row_content = {DataRow.row_data: "gs://labelbox-datasets/wiki_neural_text_ner/" + file_name,
                            DataRow.external_id: file_name,
                            DataRow.metadata_fields: metadata_payload}
    except Exception as e:
        print(e)
    return data_row_content, label_content
```

## Create a dataset

```python
dataset = client.create_dataset(name="WikiNEuRal Text NER")
```

## Set up the ontology

> 📘 **Create the custom metadata field before proceeding**
>
> Name = `language`, Kind = `string`. Learn more: https://docs.labelbox.com/docs/datarow-metadata#custom-fields

```python
metadata_ontology = client.get_data_row_metadata_ontology()

## Requires a custom metadata schema called "language" of string kind: https://docs.labelbox.com/docs/datarow-metadata#custom-fields
language_field = metadata_ontology.custom_by_name["language"]
embedding_field = metadata_ontology.reserved_by_name["embedding"]
```
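The two lookups above raise a bare `KeyError` if the custom `language` field has not been created yet. An optional guard, illustrative only and reusing the same `custom_by_name` mapping, makes the failure easier to diagnose:

```python
## Optional guard (illustrative): surface a clearer error if the custom
## "language" metadata field is missing from the workspace.
if "language" not in metadata_ontology.custom_by_name:
    raise RuntimeError(
        "Custom metadata field 'language' (kind: string) not found. "
        "Create it in the Labelbox UI before running the import."
    )
```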
```python
ontology = OntologyBuilder()

PER = Tool(tool=Tool.Type.NER, name="PER")
ontology.add_tool(PER)
ORG = Tool(tool=Tool.Type.NER, name="ORG")
ontology.add_tool(ORG)
LOC = Tool(tool=Tool.Type.NER, name="LOC")
ontology.add_tool(LOC)
MISC = Tool(tool=Tool.Type.NER, name="MISC")
ontology.add_tool(MISC)

ontology = client.create_ontology("WikiNEuRal Text NER", ontology.asdict())
```

## Set up a labeling project

```python
project = client.create_project(name="WikiNEuRal Text NER", media_type=MediaType.Text)
project.setup_editor(ontology)
ontology_from_project = OntologyBuilder.from_project(project)
project.update(queue_mode=project.QueueMode.Batch)
```

## Process and create the data row payload in batches

```python
tuples = []
for item in datasets:
    # if item == "train_en":
    for h, text_data_row in enumerate(datasets[item]):
        tuples.append((h, text_data_row, item))

if MAX_DATA_ROW_LIMIT is not None:
    tuples = random.sample(tuples, MAX_DATA_ROW_LIMIT)
```

```python
chunked_tuples = list()
for i in range(0, len(tuples), BATCH_SIZE):
    chunked_tuples.append(tuples[i:i+BATCH_SIZE])
```

## Main iterator loop to import data

Each iteration creates one chunk of data rows, submits them to the project as a batch, generates NER predictions for them, and uploads the results as ground truth labels.

```python
for chunk in chunked_tuples:
    start_time = time.time()
    current_index = chunked_tuples.index(chunk)
    print("Executing iteration {} of {}".format(current_index, len(chunked_tuples)))

    ## Generate the data row payload
    data_rows = []
    for item in tqdm.tqdm(chunk):
        datarow, label = create_data_rows_payload(item)
        data_rows.append(datarow)

    ## Create data rows in Labelbox
    task = dataset.create_data_rows(data_rows)
    task.wait_till_done()
    print(task)

    ## Submit a batch of the recently created data rows
    batch_datarows = []
    for item in task.result:
        batch_datarows.append(item['id'])

    batch = project.create_batch(
        str(current_index) + "_" + str(binascii.b2a_hex(os.urandom(5))),  # name of the batch
        batch_datarows,  # list of data rows
        1  # priority between 1-5
    )

    ## Generate model predictions
    ground_truth_list = LabelList()
    results = []
    for item in tqdm.tqdm(task.result):
        result = generate_predictions(item)
        ground_truth_list.append(Label(
            data=result[0],
            annotations=result[1]
        ))

    ## Convert model predictions to NDJSON format
    ground_truth_list.assign_feature_schema_ids(OntologyBuilder.from_project(project))
    ground_truth_ndjson = list(NDJsonConverter.serialize(ground_truth_list))

    ## Upload model predictions as ground truth
    upload_task = LabelImport.create_from_objects(client, project.uid, f"upload-job-{uuid4()}", ground_truth_ndjson)
    upload_task.wait_until_done()
    print(upload_task.errors)

    print(str(time.time() - start_time) + " seconds")
```
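Once the loop finishes, a quick count can confirm that the import matches `MAX_DATA_ROW_LIMIT`. The sketch below is optional and assumes the `dataset.data_rows()` and `project.batches()` iterators from the same SDK generation as the rest of this tutorial; adjust to your installed version.

```python
## Optional wrap-up (illustrative; method availability depends on your SDK version):
## count what was imported so the totals can be compared against MAX_DATA_ROW_LIMIT.
n_data_rows = sum(1 for _ in dataset.data_rows())
n_batches = sum(1 for _ in project.batches())
print(f"Imported {n_data_rows} data rows into '{dataset.name}' across {n_batches} batches.")
```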