first commit
This commit is contained in:
commit
61749d94ed
2
.env
Normal file
2
.env
Normal file
@ -0,0 +1,2 @@
|
||||
TOKENIZER_FOLDER=/home/charlenewsl/model-files/tokenizer
|
||||
MODEL_FOLDER=/home/charlenewsl/model-files
|
||||
5
.gitignore
vendored
Normal file
5
.gitignore
vendored
Normal file
@ -0,0 +1,5 @@
|
||||
myvenv/
|
||||
__pycache__/
|
||||
*.pyc
|
||||
*.pyo
|
||||
*.pyd
|
||||
0
Dockerfile
Normal file
0
Dockerfile
Normal file
46
requirements.txt
Normal file
46
requirements.txt
Normal file
@ -0,0 +1,46 @@
|
||||
blinker==1.9.0
|
||||
certifi==2025.1.31
|
||||
charset-normalizer==3.4.1
|
||||
click==8.1.8
|
||||
filelock==3.17.0
|
||||
Flask==3.1.0
|
||||
fsspec==2025.3.0
|
||||
huggingface-hub==0.29.2
|
||||
idna==3.10
|
||||
itsdangerous==2.2.0
|
||||
Jinja2==3.1.6
|
||||
joblib==1.4.2
|
||||
MarkupSafe==3.0.2
|
||||
mpmath==1.3.0
|
||||
networkx==3.4.2
|
||||
nltk==3.9.1
|
||||
numpy==2.2.3
|
||||
nvidia-cublas-cu12==12.4.5.8
|
||||
nvidia-cuda-cupti-cu12==12.4.127
|
||||
nvidia-cuda-nvrtc-cu12==12.4.127
|
||||
nvidia-cuda-runtime-cu12==12.4.127
|
||||
nvidia-cudnn-cu12==9.1.0.70
|
||||
nvidia-cufft-cu12==11.2.1.3
|
||||
nvidia-curand-cu12==10.3.5.147
|
||||
nvidia-cusolver-cu12==11.6.1.9
|
||||
nvidia-cusparse-cu12==12.3.1.170
|
||||
nvidia-cusparselt-cu12==0.6.2
|
||||
nvidia-nccl-cu12==2.21.5
|
||||
nvidia-nvjitlink-cu12==12.4.127
|
||||
nvidia-nvtx-cu12==12.4.127
|
||||
packaging==24.2
|
||||
python-dotenv==1.0.1
|
||||
PyYAML==6.0.2
|
||||
regex==2024.11.6
|
||||
requests==2.32.3
|
||||
safetensors==0.5.3
|
||||
setuptools==76.0.0
|
||||
sympy==1.13.1
|
||||
tokenizers==0.21.0
|
||||
torch==2.6.0
|
||||
tqdm==4.67.1
|
||||
transformers==4.49.0
|
||||
triton==3.2.0
|
||||
typing_extensions==4.12.2
|
||||
urllib3==2.3.0
|
||||
Werkzeug==3.1.3
|
||||
20
src/app.py
Normal file
20
src/app.py
Normal file
@ -0,0 +1,20 @@
|
||||
from flask import Flask, jsonify
from dotenv import load_dotenv

# Read .env (TOKENIZER_FOLDER / MODEL_FOLDER, etc.) before anything else
# consumes the environment.
load_dotenv()

app = Flask(__name__)


# BUG FIX: the original registered `hello` twice on "/" (once via
# @app.route, once via @app.get). Flask refuses to overwrite an existing
# endpoint and raises an AssertionError at import time, so the app could
# never start. A single registration is kept; the stray debug
# `print(__name__)` is removed as well.
@app.route("/")
def hello():
    """Health-check endpoint: return a fixed JSON greeting."""
    return jsonify({"message": "Hello, World!"})


if __name__ == "__main__":
    # Development server only; use a real WSGI server in production.
    app.run(debug=True, port=8000)
|
||||
112
src/services/embedding_service.py
Normal file
112
src/services/embedding_service.py
Normal file
@ -0,0 +1,112 @@
|
||||
import re
|
||||
|
||||
import nltk
|
||||
import numpy as np
|
||||
import torch
|
||||
import os
|
||||
from transformers import AutoTokenizer, AutoModel
|
||||
|
||||
# from app.settings import MODEL_FOLDER, TOKENIZER_FOLDER
|
||||
from core.service import Component
|
||||
# Path to the locally saved tokenizer files; supplied via the environment
# (loaded from .env by the Flask entry point) — None if unset.
TOKENIZER_FOLDER = os.getenv("TOKENIZER_FOLDER")
|
||||
# Path to the locally saved transformer model weights; supplied via the
# environment (loaded from .env by the Flask entry point) — None if unset.
MODEL_FOLDER = os.getenv("MODEL_FOLDER")
|
||||
class Embedding(Component):
    """Sentence-embedding component backed by a local HuggingFace model.

    Loads a tokenizer/model pair from the folders named by the
    TOKENIZER_FOLDER / MODEL_FOLDER environment variables and converts
    input clauses into mean-pooled sentence vectors, one per segment.
    """

    def __init__(self):
        super().__init__()
        # Populated by load(); left as None so that calling process()
        # before load() fails loudly instead of silently.
        self.tokenizer = None
        self.model = None

    def load(self):
        """Load the tokenizer and model from the configured local folders.

        The model was originally fetched from the HuggingFace Hub
        ('sentence-transformers/msmarco-distilbert-base-v2'); it is now
        read from local folders so the service can run offline.
        """
        self.tokenizer = AutoTokenizer.from_pretrained(TOKENIZER_FOLDER)
        self.model = AutoModel.from_pretrained(MODEL_FOLDER)

    def process(self, input_clause, is_query=False):
        """Component entry point; see :meth:`embedding_a_clause`."""
        return self.embedding_a_clause(input_clause, is_query)

    def embedding_a_clause(self, input_clause, is_query=False):
        """Embed a clause and return one vector per text segment.

        Steps:
          1. strip enumeration markers such as ``(a)`` / ``(iv)`` from
             each line,
          2. sentence-split lines that are too long, then greedily regroup
             consecutive pieces into segments of roughly
             ``max_segment_len`` characters,
          3. embed every segment; very short segments are down-weighted.

        :param input_clause: raw clause text (may span multiple lines)
        :param is_query: queries stay as one long segment
            (``max_segment_len`` = 10000) instead of 200-char chunks
        :return: torch.Tensor of shape (num_segments, hidden_size)
        """
        # Segments shorter than this count as "short" and are scaled below.
        segment_threshold = 100.0
        max_segment_len = 10000 if is_query else 200

        # NOTE: the original assigned this to a variable named `str`,
        # shadowing the builtin; renamed to `text`.
        text = input_clause.strip()

        # Per line: drop list markers, then sentence-split anything that
        # is still longer than one segment.
        short_strs = []
        for line in text.split('\n'):
            line = re.sub(r'\([a-z]\)\.?', '', line)
            line = re.sub(r'\([ivx]+\)\.?', '', line)
            if len(line) >= max_segment_len:
                short_strs.extend(nltk.sent_tokenize(line))
            else:
                short_strs.append(line)

        # Greedily pack consecutive pieces into segments of at least
        # max_segment_len characters.
        clause_segments = []
        temp_str = ""
        for short_str in short_strs:
            temp_str += ' ' + short_str
            if len(temp_str) >= max_segment_len:
                clause_segments.append(temp_str.strip())
                temp_str = ""
        # Flush the trailing remainder: keep it as its own segment when
        # substantial, otherwise merge it back into the last segment.
        if len(temp_str) > 50:
            clause_segments.append(temp_str.strip())
        else:
            if len(clause_segments) > 0:
                # too short, append it back to the last sentence
                clause_segments[-1] = clause_segments[-1] + ' ' + temp_str
            else:
                clause_segments.append(temp_str)

        clause_vectors = self.embedding_sentences(clause_segments)
        for i in range(len(clause_segments)):
            if len(clause_segments[i]) < segment_threshold:
                # Down-weight short segments linearly (0.6 at length 0 up
                # to 1.0 at the threshold), then re-normalize.
                clause_vectors[i] = clause_vectors[i] * (0.6 + 0.4 * len(clause_segments[i]) / segment_threshold)
                # NOTE(review): normalizing right after scaling cancels
                # the scale factor (norm(c*v) == c*norm(v)), so the
                # down-weighting has no net effect — confirm the intended
                # order with the author before changing the math.
                clause_vectors[i] /= np.linalg.norm(clause_vectors[i])
        return clause_vectors

    def embedding_sentences(self, input_sentences):
        """Embed a batch of strings into one vector each.

        :param input_sentences: list of strings; each may contain more
            than one sentence, but should be short (roughly < 20 words),
            otherwise too much content is mixed into a single vector and
            similarity quality degrades.
        :return: torch.Tensor of shape (len(input_sentences), hidden_size)
        """
        # Tokenize the whole batch at once (padded/truncated to a
        # common length).
        encoded_input = self.tokenizer(input_sentences, padding=True, truncation=True, return_tensors='pt')

        # Inference only — no gradients needed.
        with torch.no_grad():
            model_output = self.model(**encoded_input)

        # Mean pooling over tokens (the original comment said "max
        # pooling", which was incorrect); PAD positions are masked out.
        sentence_embeddings = self.mean_pooling(model_output, encoded_input['attention_mask'])

        return sentence_embeddings

    def mean_pooling(self, model_output, attention_mask):
        """Average token embeddings per sentence, ignoring PAD tokens.

        :param model_output: model forward output; element 0 holds the
            per-token embeddings, shape (batch, tokens, hidden_size)
        :param attention_mask: 1 for real tokens, 0 for padding
        :return: torch.Tensor of shape (batch, hidden_size)
        """
        token_embeddings = model_output[0]  # per-token embeddings
        # Broadcast the mask over the hidden dimension.
        input_mask_expanded = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
        # clamp guards against division by zero for an all-PAD row.
        return torch.sum(token_embeddings * input_mask_expanded, 1) / torch.clamp(input_mask_expanded.sum(1), min=1e-9)
|
||||
Loading…
Reference in New Issue
Block a user