first commit

This commit is contained in:
charlene tau express 2025-03-11 17:49:59 +08:00
commit 61749d94ed
6 changed files with 185 additions and 0 deletions

2
.env Normal file
View File

@@ -0,0 +1,2 @@
TOKENIZER_FOLDER=/home/charlenewsl/model-files/tokenizer
MODEL_FOLDER=/home/charlenewsl/model-files

5
.gitignore vendored Normal file
View File

@@ -0,0 +1,5 @@
myvenv/
__pycache__/
*.pyc
*.pyo
*.pyd

0
Dockerfile Normal file
View File

46
requirements.txt Normal file
View File

@@ -0,0 +1,46 @@
blinker==1.9.0
certifi==2025.1.31
charset-normalizer==3.4.1
click==8.1.8
filelock==3.17.0
Flask==3.1.0
fsspec==2025.3.0
huggingface-hub==0.29.2
idna==3.10
itsdangerous==2.2.0
Jinja2==3.1.6
joblib==1.4.2
MarkupSafe==3.0.2
mpmath==1.3.0
networkx==3.4.2
nltk==3.9.1
numpy==2.2.3
nvidia-cublas-cu12==12.4.5.8
nvidia-cuda-cupti-cu12==12.4.127
nvidia-cuda-nvrtc-cu12==12.4.127
nvidia-cuda-runtime-cu12==12.4.127
nvidia-cudnn-cu12==9.1.0.70
nvidia-cufft-cu12==11.2.1.3
nvidia-curand-cu12==10.3.5.147
nvidia-cusolver-cu12==11.6.1.9
nvidia-cusparse-cu12==12.3.1.170
nvidia-cusparselt-cu12==0.6.2
nvidia-nccl-cu12==2.21.5
nvidia-nvjitlink-cu12==12.4.127
nvidia-nvtx-cu12==12.4.127
packaging==24.2
python-dotenv==1.0.1
PyYAML==6.0.2
regex==2024.11.6
requests==2.32.3
safetensors==0.5.3
setuptools==76.0.0
sympy==1.13.1
tokenizers==0.21.0
torch==2.6.0
tqdm==4.67.1
transformers==4.49.0
triton==3.2.0
typing_extensions==4.12.2
urllib3==2.3.0
Werkzeug==3.1.3

20
src/app.py Normal file
View File

@@ -0,0 +1,20 @@
from flask import Flask, jsonify
from dotenv import load_dotenv

# Pull TOKENIZER_FOLDER / MODEL_FOLDER (and any other settings) from .env
# before anything reads the environment.
load_dotenv()

app = Flask(__name__)


@app.route("/")
def hello():
    """Health-check endpoint: return a static JSON greeting."""
    return jsonify({"message": "Hello, World!"})


# NOTE: the original file registered a second view function also named
# `hello` on "/" — Flask rejects duplicate endpoint names at import time
# ("View function mapping is overwriting an existing endpoint function"),
# so the duplicate has been removed. The stray debug print(__name__) is
# gone as well.

if __name__ == "__main__":
    # debug=True is for local development only; do not enable in production.
    app.run(debug=True, port=8000)

View File

@@ -0,0 +1,112 @@
import re
import nltk
import numpy as np
import torch
import os
from transformers import AutoTokenizer, AutoModel
# from app.settings import MODEL_FOLDER, TOKENIZER_FOLDER
from core.service import Component
TOKENIZER_FOLDER = os.getenv("TOKENIZER_FOLDER")
MODEL_FOLDER = os.getenv("MODEL_FOLDER")
class Embedding(Component):
    """Sentence-embedding component backed by a local HuggingFace model.

    Loads a tokenizer/model pair from the folders named by the
    TOKENIZER_FOLDER / MODEL_FOLDER environment variables and turns
    free-text clauses into mean-pooled, L2-normalized sentence vectors.
    """

    def __init__(self):
        super().__init__()
        # Populated by load(); None until then.
        self.tokenizer = None
        self.model = None

    def load(self):
        """Load the tokenizer and model from the configured local folders.

        Reads pre-downloaded files from disk (the original author used
        'sentence-transformers/msmarco-distilbert-base-v2' from the Hub).
        """
        self.tokenizer = AutoTokenizer.from_pretrained(TOKENIZER_FOLDER)
        self.model = AutoModel.from_pretrained(MODEL_FOLDER)

    def process(self, input_clause, is_query=False):
        """Component entry point: embed one clause (see embedding_a_clause)."""
        return self.embedding_a_clause(input_clause, is_query)

    def embedding_a_clause(self, input_clause, is_query=False):
        """Embed a clause.

        Steps:
          1. strip enumeration markers such as "(a)." / "(iv)."
          2. split long text into shorter segments
          3. embed each segment; down-weight and L2-normalize the vectors

        :param input_clause: raw clause text (may span multiple lines)
        :param is_query: queries are embedded (near-)whole; documents are
            chunked at roughly 200 characters
        :return: tensor of one vector per segment
        """
        segment_threshold = 100.0
        max_segment_len = 10000 if is_query else 200

        # `text` instead of the original `str`, which shadowed the builtin.
        text = input_clause.strip()

        short_strs = []
        for line in text.split('\n'):
            # Drop list markers like "(a)." or "(iv)." before measuring length.
            line = re.sub(r'\([a-z]\)\.?', '', line)
            line = re.sub(r'\([ivx]+\)\.?', '', line)
            if len(line) >= max_segment_len:
                short_strs.extend(nltk.sent_tokenize(line))
            else:
                short_strs.append(line)

        # Greedily merge sentences until a segment reaches max_segment_len.
        clause_segments = []
        temp_str = ""
        for short_str in short_strs:
            temp_str += ' ' + short_str
            if len(temp_str) >= max_segment_len:
                clause_segments.append(temp_str.strip())
                temp_str = ""

        if len(temp_str) > 50:
            clause_segments.append(temp_str.strip())
        elif clause_segments:
            # Tail is too short to stand alone: glue it onto the last segment.
            clause_segments[-1] = clause_segments[-1] + ' ' + temp_str
        else:
            clause_segments.append(temp_str)

        clause_vectors = self.embedding_sentences(clause_segments)
        for i in range(len(clause_segments)):
            if len(clause_segments[i]) < segment_threshold:
                # Down-weight vectors coming from very short segments.
                clause_vectors[i] = clause_vectors[i] * (0.6 + 0.4 * len(clause_segments[i]) / segment_threshold)
            # L2-normalize so dot products behave as cosine similarity.
            clause_vectors[i] /= np.linalg.norm(clause_vectors[i])
        return clause_vectors

    def embedding_sentences(self, input_sentences):
        """Embed a list of short strings with mean pooling.

        :param input_sentences: list of strings; each may contain more than
            one sentence but should be short (roughly < 20 words), otherwise
            too much content gets mixed into a single vector and similarity
            quality degrades.
        :return: torch.Tensor, one row per input string (768 entries per the
            original author's note — depends on the loaded model).
        """
        encoded_input = self.tokenizer(input_sentences, padding=True, truncation=True, return_tensors='pt')
        # Inference only: no gradients needed.
        with torch.no_grad():
            model_output = self.model(**encoded_input)
        # Mean pooling (the original comment said "max pooling"; the code
        # has always performed mean pooling — comment fixed to match).
        sentence_embeddings = self.mean_pooling(model_output, encoded_input['attention_mask'])
        return sentence_embeddings

    def mean_pooling(self, model_output, attention_mask):
        """Average token embeddings, excluding padding positions.

        :param model_output: model forward output; element [0] is the
            token-level hidden states (batch, seq, hidden)
        :param attention_mask: (batch, seq) mask — 1 for real tokens, 0 for PAD
        :return: (batch, hidden) mean-pooled embeddings
        """
        token_embeddings = model_output[0]  # all token embeddings
        input_mask_expanded = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
        # clamp guards against division by zero for fully-masked rows.
        return torch.sum(token_embeddings * input_mask_expanded, 1) / torch.clamp(input_mask_expanded.sum(1), min=1e-9)