first commit
This commit is contained in:
commit
61749d94ed
2
.env
Normal file
2
.env
Normal file
@@ -0,0 +1,2 @@
|
|||||||
|
TOKENIZER_FOLDER=/home/charlenewsl/model-files/tokenizer
|
||||||
|
MODEL_FOLDER=/home/charlenewsl/model-files
|
||||||
5
.gitignore
vendored
Normal file
5
.gitignore
vendored
Normal file
@@ -0,0 +1,5 @@
|
|||||||
|
myvenv/
|
||||||
|
__pycache__/
|
||||||
|
*.pyc
|
||||||
|
*.pyo
|
||||||
|
*.pyd
|
||||||
0
Dockerfile
Normal file
0
Dockerfile
Normal file
46
requirements.txt
Normal file
46
requirements.txt
Normal file
@@ -0,0 +1,46 @@
|
|||||||
|
blinker==1.9.0
|
||||||
|
certifi==2025.1.31
|
||||||
|
charset-normalizer==3.4.1
|
||||||
|
click==8.1.8
|
||||||
|
filelock==3.17.0
|
||||||
|
Flask==3.1.0
|
||||||
|
fsspec==2025.3.0
|
||||||
|
huggingface-hub==0.29.2
|
||||||
|
idna==3.10
|
||||||
|
itsdangerous==2.2.0
|
||||||
|
Jinja2==3.1.6
|
||||||
|
joblib==1.4.2
|
||||||
|
MarkupSafe==3.0.2
|
||||||
|
mpmath==1.3.0
|
||||||
|
networkx==3.4.2
|
||||||
|
nltk==3.9.1
|
||||||
|
numpy==2.2.3
|
||||||
|
nvidia-cublas-cu12==12.4.5.8
|
||||||
|
nvidia-cuda-cupti-cu12==12.4.127
|
||||||
|
nvidia-cuda-nvrtc-cu12==12.4.127
|
||||||
|
nvidia-cuda-runtime-cu12==12.4.127
|
||||||
|
nvidia-cudnn-cu12==9.1.0.70
|
||||||
|
nvidia-cufft-cu12==11.2.1.3
|
||||||
|
nvidia-curand-cu12==10.3.5.147
|
||||||
|
nvidia-cusolver-cu12==11.6.1.9
|
||||||
|
nvidia-cusparse-cu12==12.3.1.170
|
||||||
|
nvidia-cusparselt-cu12==0.6.2
|
||||||
|
nvidia-nccl-cu12==2.21.5
|
||||||
|
nvidia-nvjitlink-cu12==12.4.127
|
||||||
|
nvidia-nvtx-cu12==12.4.127
|
||||||
|
packaging==24.2
|
||||||
|
python-dotenv==1.0.1
|
||||||
|
PyYAML==6.0.2
|
||||||
|
regex==2024.11.6
|
||||||
|
requests==2.32.3
|
||||||
|
safetensors==0.5.3
|
||||||
|
setuptools==76.0.0
|
||||||
|
sympy==1.13.1
|
||||||
|
tokenizers==0.21.0
|
||||||
|
torch==2.6.0
|
||||||
|
tqdm==4.67.1
|
||||||
|
transformers==4.49.0
|
||||||
|
triton==3.2.0
|
||||||
|
typing_extensions==4.12.2
|
||||||
|
urllib3==2.3.0
|
||||||
|
Werkzeug==3.1.3
|
||||||
20
src/app.py
Normal file
20
src/app.py
Normal file
@@ -0,0 +1,20 @@
|
|||||||
|
"""Minimal Flask entry point.

Loads environment variables from a local .env file and exposes a single
JSON "hello world" endpoint on the root URL.
"""

from flask import Flask, jsonify
from dotenv import load_dotenv

# Populate os.environ from .env before anything reads configuration
# (e.g. services resolving MODEL_FOLDER / TOKENIZER_FOLDER at import time).
load_dotenv()

app = Flask(__name__)


# BUG FIX: the original registered `hello` twice for "/" (once via
# @app.route and once via @app.get). Flask raises
# "View function mapping is overwriting an existing endpoint function:
# hello" at import time for a duplicate endpoint name. Keep one route.
@app.route("/")
def hello():
    """Return a JSON greeting for the root URL."""
    return jsonify({"message":"Hello, World!"})


if __name__ == "__main__":
    # Development server only; use a real WSGI server in production.
    app.run(debug=True, port=8000)
|
||||||
112
src/services/embedding_service.py
Normal file
112
src/services/embedding_service.py
Normal file
@@ -0,0 +1,112 @@
|
|||||||
|
import re
|
||||||
|
|
||||||
|
import nltk
|
||||||
|
import numpy as np
|
||||||
|
import torch
|
||||||
|
import os
|
||||||
|
from transformers import AutoTokenizer, AutoModel
|
||||||
|
|
||||||
|
# from app.settings import MODEL_FOLDER, TOKENIZER_FOLDER
|
||||||
|
from core.service import Component
|
||||||
|
TOKENIZER_FOLDER = os.getenv("TOKENIZER_FOLDER")
|
||||||
|
MODEL_FOLDER = os.getenv("MODEL_FOLDER")
|
||||||
|
class Embedding(Component):
    """Sentence-embedding component backed by a HuggingFace transformer.

    Tokenizer and model are loaded from the local folders named by the
    TOKENIZER_FOLDER and MODEL_FOLDER environment variables (resolved at
    module import time).
    """

    def __init__(self):
        super().__init__()
        # Populated by load(); kept as None so using the component before
        # load() fails fast with an obvious error.
        self.tokenizer = None
        self.model = None

    def load(self):
        """Load tokenizer and model from the configured local folders.

        Equivalent to fetching e.g.
        'sentence-transformers/msmarco-distilbert-base-v2' from the Hub,
        but uses pre-downloaded files instead.
        """
        self.tokenizer = AutoTokenizer.from_pretrained(TOKENIZER_FOLDER)
        self.model = AutoModel.from_pretrained(MODEL_FOLDER)

    def process(self, input_clause, is_query=False):
        """Component entry point: embed one clause (see embedding_a_clause)."""
        return self.embedding_a_clause(input_clause, is_query)

    def embedding_a_clause(self, input_clause, is_query=False):
        """Embed a clause.

        1. remove enumeration markers such as "(a)." and "(iv)."
        2. split long clauses into shorter segments
        3. embed each segment, normalize it, and down-weight very short ones

        :param input_clause: raw clause text; may span multiple lines
        :param is_query: queries use a very large segment length so they
            stay (almost) unsplit
        :return: tensor of one embedding vector per segment
        """
        # Below this character count a segment's vector is down-weighted.
        segment_threshold = 100.0
        # Documents are packed into ~200-char segments; queries stay whole.
        max_segment_len = 10000 if is_query else 200

        # Renamed from `str` in the original, which shadowed the builtin.
        text = input_clause.strip()

        short_strs = []
        for line in text.split('\n'):
            # Strip list markers like "(a)." and roman numerals like "(iv)."
            line = re.sub(r'\([a-z]\)\.?', '', line)
            line = re.sub(r'\([ivx]+\)\.?', '', line)
            if len(line) >= max_segment_len:
                # Long line: fall back to sentence tokenization.
                short_strs.extend(nltk.sent_tokenize(line))
            else:
                short_strs.append(line)

        # Greedily pack consecutive pieces into segments of roughly
        # max_segment_len characters.
        clause_segments = []
        temp_str = ""
        for short_str in short_strs:
            temp_str += ' ' + short_str
            if len(temp_str) >= max_segment_len:
                clause_segments.append(temp_str.strip())
                temp_str = ""
        if len(temp_str) > 50:
            clause_segments.append(temp_str.strip())
        elif clause_segments:
            # Leftover too short to stand alone: merge into the last segment.
            clause_segments[-1] = clause_segments[-1] + ' ' + temp_str
        else:
            # Whole input was short: it becomes the single segment.
            clause_segments.append(temp_str)

        clause_vectors = self.embedding_sentences(clause_segments)
        for i in range(len(clause_segments)):
            # BUG FIX: the original multiplied by the short-segment weight
            # and then divided by the norm, which cancels the weight
            # (normalizing removes any scalar factor). Normalize first,
            # then down-weight, so short segments end up with norm < 1.
            # NOTE(review): confirm this matches the intended scoring.
            clause_vectors[i] /= np.linalg.norm(clause_vectors[i])
            if len(clause_segments[i]) < segment_threshold:
                clause_vectors[i] = clause_vectors[i] * (
                    0.6 + 0.4 * len(clause_segments[i]) / segment_threshold
                )
        return clause_vectors

    def embedding_sentences(self, input_sentences):
        """Embed a batch of short strings.

        :param input_sentences: list of strings; each may contain more than
            one sentence but should not be too long (say < 20 words),
            otherwise similarity quality degrades because too much content
            is mixed into a single vector.
        :return: torch.Tensor, one pooled vector per input string
            (hidden size of the loaded model, e.g. 768).
        """
        # Tokenize the whole batch with padding/truncation to a common length.
        encoded_input = self.tokenizer(input_sentences, padding=True, truncation=True, return_tensors='pt')

        # Inference only: no gradients needed.
        with torch.no_grad():
            model_output = self.model(**encoded_input)

        # Perform pooling. In this case, mean pooling (the original comment
        # said "max pooling", but mean_pooling below averages tokens).
        sentence_embeddings = self.mean_pooling(model_output, encoded_input['attention_mask'])

        return sentence_embeddings

    def mean_pooling(self, model_output, attention_mask):
        """Average token embeddings, excluding padding tokens.

        :param model_output: model forward output; element 0 holds the
            per-token embeddings.
        :param attention_mask: mask with 1 for real tokens, 0 for padding.
        :return: mean-pooled sentence embeddings, one row per input.
        """
        # First element of model_output contains all token embeddings.
        token_embeddings = model_output[0]
        mask = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
        # clamp guards against division by zero for all-padding rows.
        return torch.sum(token_embeddings * mask, 1) / torch.clamp(mask.sum(1), min=1e-9)
|
||||||
Loading…
Reference in New Issue
Block a user