Source code for towhee.models.clip4clip.utils

import regex as re
import numpy as np
from typing import List
from towhee.models.clip.simple_tokenizer import SimpleTokenizer, whitespace_clean, basic_clean


def tokenize(text: str) -> List:
    """
    Use SimpleTokenizer to tokenize text.

    Args:
        text (`str`):
            Text to tokenize.

    Returns:
        List of BPE string tokens.
    """
    tokenizer = SimpleTokenizer()
    tokens = []
    text = whitespace_clean(basic_clean(text)).lower()
    for token in re.findall(tokenizer.pat, text):
        token = "".join(tokenizer.byte_encoder[b] for b in token.encode("utf-8"))
        tokens.extend(bpe_token for bpe_token in tokenizer.bpe(token).split(" "))
    return tokens
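
# Illustrative sketch (not part of the original module): tokenize() returns BPE
# string pieces rather than integer IDs, so a call such as
#     tokenize("two dogs playing")
# yields lower-cased word pieces carrying "</w>" end-of-word markers; the exact
# split depends on the CLIP BPE merges loaded by SimpleTokenizer.
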
def convert_tokens_to_id(tokenizer: SimpleTokenizer, words: str, max_words: int = 32) -> np.ndarray:
    """
    Convert tokens to token IDs.

    Args:
        tokenizer (`SimpleTokenizer`):
            SimpleTokenizer instance.
        words (`str`):
            Raw text.
        max_words (`int`):
            Max token length; shorter sequences are padded with ID 0.

    Returns:
        Ndarray of shape (1, max_words) containing the token IDs.
    """
    special_token = {"CLS_TOKEN": "<|startoftext|>", "SEP_TOKEN": "<|endoftext|>",
                     "MASK_TOKEN": "[MASK]", "UNK_TOKEN": "[UNK]", "PAD_TOKEN": "[PAD]"}
    pairs_text = np.zeros((1, max_words), dtype=np.int64)  # np.long is removed in NumPy >= 1.24
    words = tokenize(words)
    words = [special_token["CLS_TOKEN"]] + words
    total_length_with_cls = max_words - 1
    if len(words) > total_length_with_cls:
        words = words[:total_length_with_cls]
    words = words + [special_token["SEP_TOKEN"]]
    input_ids = [tokenizer.encoder[bpe_token] for bpe_token in words]
    while len(input_ids) < max_words:
        input_ids.append(0)
    assert len(input_ids) == max_words
    pairs_text[0] = np.array(input_ids)
    return pairs_text
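
# Minimal usage sketch (an assumption for illustration, not part of the Towhee API):
# it shows how the two helpers fit together -- tokenize() produces BPE tokens and
# convert_tokens_to_id() maps them to a fixed-length, zero-padded ID array.
if __name__ == "__main__":
    demo_tokenizer = SimpleTokenizer()
    ids = convert_tokens_to_id(demo_tokenizer, "a person is riding a bike", max_words=32)
    print(ids.shape)   # (1, 32)
    print(ids[0][:8])  # startoftext ID, token IDs, endoftext ID, then zero padding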