Add files via upload
lopesoll committed Aug 9, 2023
1 parent c3f73c7 commit 28881fc565e1bfb85a2f9b053b683a65b447d777
Showing 15 changed files with 15 additions and 0 deletions.

# Install Required libraries

```python
!pip install sacremoses peft
!pip install -U transformers
```

# Import required libraries

```python
from transformers import AutoModelForSequenceClassification, pipeline, AutoModel, AutoTokenizer
import torch
import pandas as pd
import numpy as np
```

# Import Data

```python
df = pd.read_csv('---')
```

```python
# Check for consensus and mark rows for removal
rows_to_drop = []
for index, row in df.iterrows():
    if row['Consensus'] == '-':  # Drop unknown and unmarked rows
        rows_to_drop.append(index)

# Drop the marked rows
df.drop(rows_to_drop, inplace=True)

# Print the updated DataFrame
print(df)
```

# Small data analysis

```python
model2 = AutoModel.from_pretrained("microsoft/biogpt")
```

# Load Model and tokenizer (BioGPT)

```python
model2
```

Output (the base BioGPT architecture):

```
BioGptModel(
  (embed_tokens): Embedding(42384, 1024, padding_idx=1)
  (embed_positions): BioGptLearnedPositionalEmbedding(1026, 1024)
  (layers): ModuleList(
    (0-23): 24 x BioGptDecoderLayer(
      (self_attn): BioGptAttention(
        (k_proj): Linear(in_features=1024, out_features=1024, bias=True)
        (v_proj): Linear(in_features=1024, out_features=1024, bias=True)
        (q_proj): Linear(in_features=1024, out_features=1024, bias=True)
        (out_proj): Linear(in_features=1024, out_features=1024, bias=True)
      )
      (activation_fn): GELUActivation()
      (self_attn_layer_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
      (fc1): Linear(in_features=1024, out_features=4096, bias=True)
      (fc2): Linear(in_features=4096, out_features=1024, bias=True)
      (final_layer_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
    )
  )
  (layer_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
)
```

```python
from peft import (
    LoraConfig,
    PeftType,
    PromptEncoderConfig,
    PeftConfig,
    PeftModel,
    PeftModelForFeatureExtraction,
    PeftModelForSequenceClassification
)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

peft_model_id = "Lukee4/biogpt-2019_2labels"
config = PeftConfig.from_pretrained(peft_model_id)
tokenizer = AutoTokenizer.from_pretrained("microsoft/biogpt")
config.auto_mapping = {'base_model_class': 'BioGptModel',
                       'parent_library': 'transformers.models.biogpt.modeling_biogpt'}

model = AutoModel.from_pretrained("microsoft/biogpt", output_hidden_states=True)
# AutoModelForSequenceClassification

# Load the Lora model
inference_model = PeftModel.from_pretrained(model, peft_model_id, output_hidden_states=True)
# PeftModelForSequenceClassification
```
```python
config
```

Output:

```
PeftConfig(peft_type='LORA', auto_mapping={'base_model_class': 'PeftModel', 'parent_library': 'peft.peft_model'}, base_model_name_or_path=None, revision=None, task_type=None, inference_mode=True)
```

```python
model
```

Output: the same base model object now carries rank-8 LoRA adapters on the `k_proj`, `v_proj` and `q_proj` projections of every decoder layer, while `out_proj` is unchanged. Only `k_proj` of the repeated block is reproduced below; `v_proj` and `q_proj` have the identical LoRA structure.

```
BioGptModel(
  (embed_tokens): Embedding(42384, 1024, padding_idx=1)
  (embed_positions): BioGptLearnedPositionalEmbedding(1026, 1024)
  (layers): ModuleList(
    (0-23): 24 x BioGptDecoderLayer(
      (self_attn): BioGptAttention(
        (k_proj): Linear(
          in_features=1024, out_features=1024, bias=True
          (lora_dropout): ModuleDict(
            (default): Dropout(p=0.1, inplace=False)
          )
          (lora_A): ModuleDict(
            (default): Linear(in_features=1024, out_features=8, bias=False)
          )
          (lora_B): ModuleDict(
            (default): Linear(in_features=8, out_features=1024, bias=False)
          )
          (lora_embedding_A): ParameterDict()
          (lora_embedding_B): ParameterDict()
        )
        (v_proj): Linear(...)  # same LoRA structure as k_proj
        (q_proj): Linear(...)  # same LoRA structure as k_proj
        (out_proj): Linear(in_features=1024, out_features=1024, bias=True)
      )
      (activation_fn): GELUActivation()
      (self_attn_layer_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
      (fc1): Linear(in_features=1024, out_features=4096, bias=True)
      (fc2): Linear(in_features=4096, out_features=1024, bias=True)
      (final_layer_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
    )
  )
  (layer_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
)
```
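The `lora_A`/`lora_B` pairs printed above are the two halves of a low-rank update: for a frozen projection weight `W`, LoRA computes `W x + (alpha / r) * B (A x)` with rank `r = 8` here. The snippet below is a minimal numerical sketch of that computation only; the scaling value `alpha` is an assumed placeholder, since the real value lives in the adapter's config rather than in the printed module tree.

```python
import torch

d, r = 1024, 8         # hidden size and LoRA rank, matching the printout above
alpha = 16             # assumed scaling factor; the real value is in the adapter config
W = torch.randn(d, d)  # frozen base projection weight (e.g. q_proj)
A = torch.randn(r, d)  # lora_A: projects 1024 -> 8
B = torch.zeros(d, r)  # lora_B: projects 8 -> 1024 (initialised to zero in LoRA)
x = torch.randn(d)

# LoRA-patched forward pass: base projection plus a scaled low-rank update
y = W @ x + (alpha / r) * (B @ (A @ x))
print(y.shape)         # torch.Size([1024])
```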
```python
inference_model
```

Output: the `PeftModel` wrapper around the LoRA-patched base model. The inner `BioGptModel` has exactly the structure shown above, so it is abbreviated here.

```
PeftModel(
  (base_model): LoraModel(
    (model): BioGptModel(
      ...  # same LoRA-patched BioGptModel as printed above
    )
  )
)
```

## Define the pipeline

```python
# Create the pipeline
p = pipeline(
    task="feature-extraction",
    tokenizer=tokenizer,
    model=inference_model,
    framework="pt",
    device=-1,  # use CUDA with 0
)
```
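Before the extraction step in the next section, it helps to see what the feature-extraction pipeline returns for one input. The sketch below is an added illustration rather than a cell from the notebook; the sample string is made up, and the expected shape assumes the pipeline's default behaviour of returning one nested list per input holding the last hidden state.

```python
import numpy as np

# Hypothetical input; any short free-text complaint behaves the same way
sample = p("chest pain and shortness of breath")
arr = np.array(sample)
print(arr.shape)  # expected: (1, number_of_tokens, 1024)
```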
### Get the embeddings of the last token of the last hidden state

```python
# Custom function to extract the embedding of the last token
def extract_last_token(last_hidden_states):
    last_hidden_states = np.array(last_hidden_states)
    return last_hidden_states[:, -1, :]

# Process the data using the pipeline
results = p([row["Chief Complaint"] for _, row in df.iterrows()])

# Extract the last token of the last hidden state
embeddings = [extract_last_token(hidden_state) for hidden_state in results]
```

```python
# Assign the extracted embeddings to a new column in the DataFrame
df["embeddings_biogpt_tuned"] = embeddings

# Print the resulting DataFrame
print(df)
```

```python
df.to_json('/content/drive/MyDrive/Colab Notebooks/dissertation/data/2019_withFT.json', orient='records')
```
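When this file is read back later, `to_json` will have stored each `(1, 1024)` embedding as a nested Python list rather than a numpy array. A minimal round-trip sketch (added here for illustration, using the same path as above) is:

```python
import numpy as np
import pandas as pd

path = '/content/drive/MyDrive/Colab Notebooks/dissertation/data/2019_withFT.json'
df2 = pd.read_json(path, orient='records')

# Convert the serialised nested lists back into numpy arrays
df2["embeddings_biogpt_tuned"] = df2["embeddings_biogpt_tuned"].apply(np.array)
print(df2["embeddings_biogpt_tuned"].iloc[0].shape)  # expected: (1, 1024)
```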
