first commit

This commit is contained in:
charlene tau express 2025-03-11 17:49:59 +08:00
commit 61749d94ed
6 changed files with 185 additions and 0 deletions

2
.env Normal file
View File

@@ -0,0 +1,2 @@
TOKENIZER_FOLDER=/home/charlenewsl/model-files/tokenizer
MODEL_FOLDER=/home/charlenewsl/model-files

5
.gitignore vendored Normal file
View File

@@ -0,0 +1,5 @@
myvenv/
__pycache__/
*.pyc
*.pyo
*.pyd

0
Dockerfile Normal file
View File

46
requirements.txt Normal file
View File

@@ -0,0 +1,46 @@
blinker==1.9.0
certifi==2025.1.31
charset-normalizer==3.4.1
click==8.1.8
filelock==3.17.0
Flask==3.1.0
fsspec==2025.3.0
huggingface-hub==0.29.2
idna==3.10
itsdangerous==2.2.0
Jinja2==3.1.6
joblib==1.4.2
MarkupSafe==3.0.2
mpmath==1.3.0
networkx==3.4.2
nltk==3.9.1
numpy==2.2.3
nvidia-cublas-cu12==12.4.5.8
nvidia-cuda-cupti-cu12==12.4.127
nvidia-cuda-nvrtc-cu12==12.4.127
nvidia-cuda-runtime-cu12==12.4.127
nvidia-cudnn-cu12==9.1.0.70
nvidia-cufft-cu12==11.2.1.3
nvidia-curand-cu12==10.3.5.147
nvidia-cusolver-cu12==11.6.1.9
nvidia-cusparse-cu12==12.3.1.170
nvidia-cusparselt-cu12==0.6.2
nvidia-nccl-cu12==2.21.5
nvidia-nvjitlink-cu12==12.4.127
nvidia-nvtx-cu12==12.4.127
packaging==24.2
python-dotenv==1.0.1
PyYAML==6.0.2
regex==2024.11.6
requests==2.32.3
safetensors==0.5.3
setuptools==76.0.0
sympy==1.13.1
tokenizers==0.21.0
torch==2.6.0
tqdm==4.67.1
transformers==4.49.0
triton==3.2.0
typing_extensions==4.12.2
urllib3==2.3.0
Werkzeug==3.1.3

20
src/app.py Normal file
View File

@@ -0,0 +1,20 @@
from flask import Flask, jsonify
from dotenv import load_dotenv

# Pull TOKENIZER_FOLDER / MODEL_FOLDER (and any other settings) from .env
# before anything reads the environment.
load_dotenv()

app = Flask(__name__)


@app.route("/")
def hello():
    """Health-check endpoint: return a static JSON greeting."""
    return jsonify({"message": "Hello, World!"})


# NOTE: the original file registered a second view function also named
# `hello` on "/" — Flask rejects duplicate endpoint names at import time
# ("View function mapping is overwriting an existing endpoint function"),
# so the duplicate has been removed. The stray debug print(__name__) is
# gone as well.

if __name__ == "__main__":
    # debug=True is for local development only; do not enable in production.
    app.run(debug=True, port=8000)

View File

@@ -0,0 +1,112 @@
import re
import nltk
import numpy as np
import torch
import os
from transformers import AutoTokenizer, AutoModel
# from app.settings import MODEL_FOLDER, TOKENIZER_FOLDER
from core.service import Component
TOKENIZER_FOLDER = os.getenv("TOKENIZER_FOLDER")
MODEL_FOLDER = os.getenv("MODEL_FOLDER")
class Embedding(Component):
    """Sentence-embedding component backed by a local HuggingFace model.

    Loads a tokenizer/model pair from the folders named by the
    TOKENIZER_FOLDER / MODEL_FOLDER environment variables and turns
    free-text clauses into mean-pooled, L2-normalized sentence vectors.
    """

    def __init__(self):
        super().__init__()
        # Populated by load(); None until then.
        self.tokenizer = None
        self.model = None

    def load(self):
        """Load the tokenizer and model from the configured local folders.

        Reads pre-downloaded files from disk (the original author used
        'sentence-transformers/msmarco-distilbert-base-v2' from the Hub).
        """
        self.tokenizer = AutoTokenizer.from_pretrained(TOKENIZER_FOLDER)
        self.model = AutoModel.from_pretrained(MODEL_FOLDER)

    def process(self, input_clause, is_query=False):
        """Component entry point: embed one clause (see embedding_a_clause)."""
        return self.embedding_a_clause(input_clause, is_query)

    def embedding_a_clause(self, input_clause, is_query=False):
        """Embed a clause.

        Steps:
          1. strip enumeration markers such as "(a)." / "(iv)."
          2. split long text into shorter segments
          3. embed each segment; down-weight and L2-normalize the vectors

        :param input_clause: raw clause text (may span multiple lines)
        :param is_query: queries are embedded (near-)whole; documents are
            chunked at roughly 200 characters
        :return: tensor of one vector per segment
        """
        segment_threshold = 100.0
        max_segment_len = 10000 if is_query else 200

        # `text` instead of the original `str`, which shadowed the builtin.
        text = input_clause.strip()

        short_strs = []
        for line in text.split('\n'):
            # Drop list markers like "(a)." or "(iv)." before measuring length.
            line = re.sub(r'\([a-z]\)\.?', '', line)
            line = re.sub(r'\([ivx]+\)\.?', '', line)
            if len(line) >= max_segment_len:
                short_strs.extend(nltk.sent_tokenize(line))
            else:
                short_strs.append(line)

        # Greedily merge sentences until a segment reaches max_segment_len.
        clause_segments = []
        temp_str = ""
        for short_str in short_strs:
            temp_str += ' ' + short_str
            if len(temp_str) >= max_segment_len:
                clause_segments.append(temp_str.strip())
                temp_str = ""

        if len(temp_str) > 50:
            clause_segments.append(temp_str.strip())
        elif clause_segments:
            # Tail is too short to stand alone: glue it onto the last segment.
            clause_segments[-1] = clause_segments[-1] + ' ' + temp_str
        else:
            clause_segments.append(temp_str)

        clause_vectors = self.embedding_sentences(clause_segments)
        for i in range(len(clause_segments)):
            if len(clause_segments[i]) < segment_threshold:
                # Down-weight vectors coming from very short segments.
                clause_vectors[i] = clause_vectors[i] * (0.6 + 0.4 * len(clause_segments[i]) / segment_threshold)
            # L2-normalize so dot products behave as cosine similarity.
            clause_vectors[i] /= np.linalg.norm(clause_vectors[i])
        return clause_vectors

    def embedding_sentences(self, input_sentences):
        """Embed a list of short strings with mean pooling.

        :param input_sentences: list of strings; each may contain more than
            one sentence but should be short (roughly < 20 words), otherwise
            too much content gets mixed into a single vector and similarity
            quality degrades.
        :return: torch.Tensor, one row per input string (768 entries per the
            original author's note — depends on the loaded model).
        """
        encoded_input = self.tokenizer(input_sentences, padding=True, truncation=True, return_tensors='pt')
        # Inference only: no gradients needed.
        with torch.no_grad():
            model_output = self.model(**encoded_input)
        # Mean pooling (the original comment said "max pooling"; the code
        # has always performed mean pooling — comment fixed to match).
        sentence_embeddings = self.mean_pooling(model_output, encoded_input['attention_mask'])
        return sentence_embeddings

    def mean_pooling(self, model_output, attention_mask):
        """Average token embeddings, excluding padding positions.

        :param model_output: model forward output; element [0] is the
            token-level hidden states (batch, seq, hidden)
        :param attention_mask: (batch, seq) mask — 1 for real tokens, 0 for PAD
        :return: (batch, hidden) mean-pooled embeddings
        """
        token_embeddings = model_output[0]  # all token embeddings
        input_mask_expanded = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
        # clamp guards against division by zero for fully-masked rows.
        return torch.sum(token_embeddings * input_mask_expanded, 1) / torch.clamp(input_mask_expanded.sum(1), min=1e-9)