From 61749d94ed5c058a110779887d33e6a53e738fc4 Mon Sep 17 00:00:00 2001 From: charlene tau express Date: Tue, 11 Mar 2025 17:49:59 +0800 Subject: [PATCH] first commit --- .env | 2 + .gitignore | 5 ++ Dockerfile | 0 requirements.txt | 46 ++++++++++++ src/app.py | 20 ++++++ src/services/embedding_service.py | 112 ++++++++++++++++++++++++++++++ 6 files changed, 185 insertions(+) create mode 100644 .env create mode 100644 .gitignore create mode 100644 Dockerfile create mode 100644 requirements.txt create mode 100644 src/app.py create mode 100644 src/services/embedding_service.py diff --git a/.env b/.env new file mode 100644 index 0000000..b6e55fe --- /dev/null +++ b/.env @@ -0,0 +1,2 @@ +TOKENIZER_FOLDER=/home/charlenewsl/model-files/tokenizer +MODEL_FOLDER=/home/charlenewsl/model-files \ No newline at end of file diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..1b92115 --- /dev/null +++ b/.gitignore @@ -0,0 +1,5 @@ +myvenv/ +__pycache__/ +*.pyc +*.pyo +*.pyd \ No newline at end of file diff --git a/Dockerfile b/Dockerfile new file mode 100644 index 0000000..e69de29 diff --git a/requirements.txt b/requirements.txt new file mode 100644 index 0000000..98fd90f --- /dev/null +++ b/requirements.txt @@ -0,0 +1,46 @@ +blinker==1.9.0 +certifi==2025.1.31 +charset-normalizer==3.4.1 +click==8.1.8 +filelock==3.17.0 +Flask==3.1.0 +fsspec==2025.3.0 +huggingface-hub==0.29.2 +idna==3.10 +itsdangerous==2.2.0 +Jinja2==3.1.6 +joblib==1.4.2 +MarkupSafe==3.0.2 +mpmath==1.3.0 +networkx==3.4.2 +nltk==3.9.1 +numpy==2.2.3 +nvidia-cublas-cu12==12.4.5.8 +nvidia-cuda-cupti-cu12==12.4.127 +nvidia-cuda-nvrtc-cu12==12.4.127 +nvidia-cuda-runtime-cu12==12.4.127 +nvidia-cudnn-cu12==9.1.0.70 +nvidia-cufft-cu12==11.2.1.3 +nvidia-curand-cu12==10.3.5.147 +nvidia-cusolver-cu12==11.6.1.9 +nvidia-cusparse-cu12==12.3.1.170 +nvidia-cusparselt-cu12==0.6.2 +nvidia-nccl-cu12==2.21.5 +nvidia-nvjitlink-cu12==12.4.127 +nvidia-nvtx-cu12==12.4.127 +packaging==24.2 +python-dotenv==1.0.1 
+PyYAML==6.0.2
+regex==2024.11.6
+requests==2.32.3
+safetensors==0.5.3
+setuptools==76.0.0
+sympy==1.13.1
+tokenizers==0.21.0
+torch==2.6.0
+tqdm==4.67.1
+transformers==4.49.0
+triton==3.2.0
+typing_extensions==4.12.2
+urllib3==2.3.0
+Werkzeug==3.1.3
diff --git a/src/app.py b/src/app.py
new file mode 100644
index 0000000..7853068
--- /dev/null
+++ b/src/app.py
@@ -0,0 +1,14 @@
+from flask import Flask, jsonify
+from dotenv import load_dotenv
+
+# Load .env before creating the app so env-based config is visible at import time.
+load_dotenv()
+app = Flask(__name__)
+
+@app.route("/")
+def hello():
+    """Health-check endpoint returning a static JSON greeting."""
+    return jsonify({"message":"Hello, World!"})
+
+if __name__=="__main__":
+    app.run(debug=True, port=8000)
\ No newline at end of file
diff --git a/src/services/embedding_service.py b/src/services/embedding_service.py
new file mode 100644
index 0000000..6af4210
--- /dev/null
+++ b/src/services/embedding_service.py
@@ -0,0 +1,103 @@
+import os
+import re
+
+import nltk
+import numpy as np
+import torch
+from transformers import AutoTokenizer, AutoModel
+
+from core.service import Component
+
+# Local folders holding the tokenizer/model files; paths are supplied via .env.
+TOKENIZER_FOLDER = os.getenv("TOKENIZER_FOLDER")
+MODEL_FOLDER = os.getenv("MODEL_FOLDER")
+
+
+class Embedding(Component):
+    """Sentence-embedding component backed by a locally stored HuggingFace model."""
+
+    def __init__(self):
+        super().__init__()
+        self.tokenizer = None
+        self.model = None
+
+    def load(self):
+        # Load tokenizer and model from local folders (HuggingFace Hub format on disk).
+        self.tokenizer = AutoTokenizer.from_pretrained(TOKENIZER_FOLDER)
+        self.model = AutoModel.from_pretrained(MODEL_FOLDER)
+
+    def process(self, input_clause, is_query=False):
+        return self.embedding_a_clause(input_clause, is_query)
+
+    def embedding_a_clause(self, input_clause, is_query=False):
+        """
+        Embed a clause:
+        1. remove some symbols
+        2. split long clauses to shorter segments
+        3. for each short segment, embed it
+        """
+        # preprocessing
+        segment_threshold = 100.0
+        if is_query:
+            max_segment_len = 10000
+        else:
+            max_segment_len = 200
+        # renamed from `str` to avoid shadowing the builtin
+        text = input_clause.strip()
+        text_list = text.split('\n')
+        short_strs = []
+        for text_item in text_list:
+            # drop enumeration markers like "(a)." and "(iv)."
+            text_item = re.sub(r'\([a-z]\)\.?', '', text_item)
+            text_item = re.sub(r'\([ivx]+\)\.?', '', text_item)
+            if len(text_item) >= max_segment_len:
+                short_strs.extend(nltk.sent_tokenize(text_item))
+            else:
+                short_strs.append(text_item)
+        # re-pack short sentences into segments of at least max_segment_len chars
+        clause_segments = []
+        temp_str = ""
+        for short_str in short_strs:
+            temp_str += ' ' + short_str
+            if len(temp_str) >= max_segment_len:
+                clause_segments.append(temp_str.strip())
+                temp_str = ""
+        if len(temp_str) > 50:
+            clause_segments.append(temp_str.strip())
+        else:
+            if len(clause_segments) > 0:
+                # too short, append it back to the last sentence
+                clause_segments[-1] = clause_segments[-1] + ' ' + temp_str
+            else:
+                clause_segments.append(temp_str)
+
+        clause_vectors = self.embedding_sentences(clause_segments)
+        for i in range(len(clause_segments)):
+            if len(clause_segments[i]) < segment_threshold:
+                clause_vectors[i] = clause_vectors[i] * (0.6 + 0.4 * len(clause_segments[i]) / segment_threshold)
+            # NOTE(review): unit-normalizing here cancels the length-based scaling above — confirm intended
+            clause_vectors[i] /= np.linalg.norm(clause_vectors[i])
+        return clause_vectors
+
+    def embedding_sentences(self, input_sentences):
+        """
+        :param input_sentences: a list of strings. each string may contain more than one sentences
+            each string should not be too long, say < 20 words, otherwise, the performance/similarity
+            evaluation may be bad as too many contents need to be mixed and represented by only one
+            vector
+        :return: torch.Tensor of pooled embeddings, one row per input string
+        """
+        # Tokenize sentences
+        encoded_input = self.tokenizer(input_sentences, padding=True, truncation=True, return_tensors='pt')
+
+        # Compute token embeddings
+        with torch.no_grad():
+            model_output = self.model(**encoded_input)
+
+        # Perform pooling. In this case, mean pooling (not max, despite the original comment).
+        sentence_embeddings = self.mean_pooling(model_output, encoded_input['attention_mask'])
+
+        return sentence_embeddings
+
+    def mean_pooling(self, model_output, attention_mask):
+        # Mean pooling that masks out PAD tokens via the attention mask.
+        token_embeddings = model_output[0]  # First element of model_output contains all token embeddings
+        input_mask_expanded = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
+        return torch.sum(token_embeddings * input_mask_expanded, 1) / torch.clamp(input_mask_expanded.sum(1), min=1e-9)