update tokenizers

This commit is contained in:
Nader Arbabian
2025-06-10 17:01:28 -07:00
committed by Nader Arbabian
parent d99adcfb36
commit 4bac805093
4 changed files with 15 additions and 15 deletions
+9 -9
View File
@@ -1,9 +1,9 @@
aiohttp==3.11.16 aiohttp~=3.11
anyio==4.4.0 anyio~=4.4
lib==4.0.0 lib~=4.0
nltk==3.9.1 nltk~=3.9
psutil==6.0.0 psutil~=6.0
pycryptodome==3.20.0 pycryptodome~=3.20
Requests==2.32.4 Requests~=2.32
transformers==4.50.0 transformers~=4.52
utils==1.0.2 utils~=1.0
+2 -2
View File
@@ -56,7 +56,7 @@ import dataclasses
import random import random
from typing import Dict, Any from typing import Dict, Any
from transformers import AutoTokenizer # used to count tokens in a prompt from transformers import OpenAIGPTTokenizer # used to count tokens in a prompt
import nltk # used to download a list of all words to generate a random prompt and benchmark the LLM model import nltk # used to download a list of all words to generate a random prompt and benchmark the LLM model
from lib.data_types import ApiPayload from lib.data_types import ApiPayload
@@ -65,7 +65,7 @@ nltk.download("words")
WORD_LIST = nltk.corpus.words.words() WORD_LIST = nltk.corpus.words.words()
# you can use any tokenizer that fits your LLM. `openai-gpt` is free to use and is a good fit for most LLMs # you can use any tokenizer that fits your LLM. `openai-gpt` is free to use and is a good fit for most LLMs
tokenizer = AutoTokenizer.from_pretrained("openai-community/openai-gpt") tokenizer = OpenAIGPTTokenizer.from_pretrained("openai-gpt")
@dataclasses.dataclass @dataclasses.dataclass
class InputData(ApiPayload): class InputData(ApiPayload):
+2 -2
View File
@@ -3,7 +3,7 @@ import random
import inspect import inspect
from typing import Dict, Any from typing import Dict, Any
from transformers import AutoTokenizer from transformers import OpenAIGPTTokenizer
import nltk import nltk
from lib.data_types import ApiPayload, JsonDataException from lib.data_types import ApiPayload, JsonDataException
@@ -12,7 +12,7 @@ nltk.download("words")
WORD_LIST = nltk.corpus.words.words() WORD_LIST = nltk.corpus.words.words()
# used to count tokens and workload for LLM # used to count tokens and workload for LLM
tokenizer = AutoTokenizer.from_pretrained("openai-community/openai-gpt") tokenizer = OpenAIGPTTokenizer.from_pretrained("openai-gpt")
@dataclasses.dataclass @dataclasses.dataclass
+2 -2
View File
@@ -3,7 +3,7 @@ import random
import inspect import inspect
from typing import Dict, Any from typing import Dict, Any
from transformers import AutoTokenizer from transformers import OpenAIGPTTokenizer
import nltk import nltk
from lib.data_types import ApiPayload, JsonDataException from lib.data_types import ApiPayload, JsonDataException
@@ -11,7 +11,7 @@ from lib.data_types import ApiPayload, JsonDataException
nltk.download("words") nltk.download("words")
WORD_LIST = nltk.corpus.words.words() WORD_LIST = nltk.corpus.words.words()
tokenizer = AutoTokenizer.from_pretrained("openai-community/openai-gpt") tokenizer = OpenAIGPTTokenizer.from_pretrained("openai-gpt")
@dataclasses.dataclass @dataclasses.dataclass