There are no files selected for viewing
Large diffs are not rendered by default.
Large diffs are not rendered by default.
Large diffs are not rendered by default.
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1 @@ | ||
{"cells":[{"cell_type":"markdown","metadata":{"id":"oNme5YN3mwRS"},"source":["# Install Required libraries"]},{"cell_type":"code","execution_count":null,"metadata":{"id":"OmxFfI92lslA"},"outputs":[],"source":["# %pip (rather than !pip) installs into the environment of the running kernel\n","%pip install sacremoses peft\n","%pip install -U transformers"]},{"cell_type":"markdown","metadata":{"id":"nxKi5cK6nU5Z"},"source":["# Import required libraries"]},{"cell_type":"code","execution_count":null,"metadata":{"id":"9Q4U4TWWnUNa"},"outputs":[],"source":["from transformers import AutoModelForSequenceClassification, pipeline, AutoModel, AutoTokenizer\n","import torch\n","import pandas as pd\n","import numpy as np\n"]},{"cell_type":"markdown","metadata":{"id":"nK9JropUm3yK"},"source":["# Import Data"]},{"cell_type":"code","execution_count":null,"metadata":{"id":"pX9KCq5pSseO"},"outputs":[],"source":["df = pd.read_csv('---')"]},{"cell_type":"code","execution_count":null,"metadata":{"id":"lBfzey48Uufu"},"outputs":[],"source":["# Keep only rows that carry a consensus label; '-' marks unknown/unmarked rows.\n","# A boolean mask replaces the row-by-row iterrows() scan and keeps the original\n","# index labels; .copy() gives later cells an independent frame to add columns to.\n","df = df.loc[df['Consensus'] != '-'].copy()\n","\n","# Report the size and preview a few rows instead of printing the whole frame\n","print(df.shape)\n","df.head()"]},{"cell_type":"markdown","metadata":{"id":"PEoE_tyrm7qS"},"source":["# Small data analysis"]},{"cell_type":"code","source":["model2 = AutoModel.from_pretrained(\"microsoft/biogpt\")\n"],"metadata":{"id":"qcUMnBKvpBkD"},"execution_count":null,"outputs":[]},{"cell_type":"markdown","metadata":{"id":"7h7jfsy9njIx"},"source":["# Load Model and tokenizer 
(BioGPT)"]},{"cell_type":"code","source":["model2"],"metadata":{"colab":{"base_uri":"https://localhost:8080/"},"id":"qVU6_dzqpDIj","executionInfo":{"status":"ok","timestamp":1691338693871,"user_tz":-60,"elapsed":249,"user":{"displayName":"Unknown","userId":"16317712665857714848"}},"outputId":"2c33317d-025b-405a-f1a8-823854a44736"},"execution_count":null,"outputs":[{"output_type":"execute_result","data":{"text/plain":["BioGptModel(\n"," (embed_tokens): Embedding(42384, 1024, padding_idx=1)\n"," (embed_positions): BioGptLearnedPositionalEmbedding(1026, 1024)\n"," (layers): ModuleList(\n"," (0-23): 24 x BioGptDecoderLayer(\n"," (self_attn): BioGptAttention(\n"," (k_proj): Linear(in_features=1024, out_features=1024, bias=True)\n"," (v_proj): Linear(in_features=1024, out_features=1024, bias=True)\n"," (q_proj): Linear(in_features=1024, out_features=1024, bias=True)\n"," (out_proj): Linear(in_features=1024, out_features=1024, bias=True)\n"," )\n"," (activation_fn): GELUActivation()\n"," (self_attn_layer_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)\n"," (fc1): Linear(in_features=1024, out_features=4096, bias=True)\n"," (fc2): Linear(in_features=4096, out_features=1024, bias=True)\n"," (final_layer_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)\n"," )\n"," )\n"," (layer_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)\n",")"]},"metadata":{},"execution_count":7}]},{"cell_type":"code","execution_count":null,"metadata":{"id":"ACN8S3y_iS2r"},"outputs":[],"source":["from peft import (\n"," LoraConfig,\n"," PeftType,\n"," PromptEncoderConfig,\n"," PeftConfig,\n"," PeftModel,\n"," PeftModelForFeatureExtraction,\n"," PeftModelForSequenceClassification\n",")\n","\n","device = torch.device(\"cuda\" if torch.cuda.is_available() else \"cpu\")\n","\n","\n","peft_model_id = \"Lukee4/biogpt-2019_2labels\"\n","config = PeftConfig.from_pretrained(peft_model_id)\n","tokenizer = 
AutoTokenizer.from_pretrained(\"microsoft/biogpt\")\n","config.auto_mapping= {'base_model_class': 'BioGptModel',\n"," 'parent_library': 'transformers.models.biogpt.modeling_biogpt'}\n","\n","model = AutoModel.from_pretrained(\"microsoft/biogpt\", output_hidden_states=True)\n","#AutoModelForSequenceClassification\n","\n","# Load the Lora model\n","inference_model = PeftModel.from_pretrained(model, peft_model_id, output_hidden_states=True)\n","#PeftModelForSequenceClassification\n","\n","# Switch to evaluation mode explicitly so the LoRA dropout layers are inactive\n","# and repeated runs produce the same embeddings\n","inference_model = inference_model.eval()"]},{"cell_type":"code","execution_count":null,"metadata":{"colab":{"base_uri":"https://localhost:8080/"},"executionInfo":{"elapsed":5,"status":"ok","timestamp":1691332268554,"user":{"displayName":"Unknown","userId":"16317712665857714848"},"user_tz":-60},"id":"aZnqbrFp7NSI","outputId":"5d7429da-0944-4107-d3bc-3f3210f0960b"},"outputs":[{"output_type":"execute_result","data":{"text/plain":["PeftConfig(peft_type='LORA', auto_mapping={'base_model_class': 'PeftModel', 'parent_library': 'peft.peft_model'}, base_model_name_or_path=None, revision=None, task_type=None, inference_mode=True)"]},"metadata":{},"execution_count":20}],"source":["config"]},{"cell_type":"code","source":["model"],"metadata":{"colab":{"base_uri":"https://localhost:8080/"},"id":"ZAlYRMT2omiS","executionInfo":{"status":"ok","timestamp":1691338656263,"user_tz":-60,"elapsed":209,"user":{"displayName":"Unknown","userId":"16317712665857714848"}},"outputId":"fcc3b8b4-0f28-449b-9ebb-619f5d1be3ac"},"execution_count":null,"outputs":[{"output_type":"execute_result","data":{"text/plain":["BioGptModel(\n"," (embed_tokens): Embedding(42384, 1024, padding_idx=1)\n"," (embed_positions): BioGptLearnedPositionalEmbedding(1026, 1024)\n"," (layers): ModuleList(\n"," (0-23): 24 x BioGptDecoderLayer(\n"," (self_attn): BioGptAttention(\n"," (k_proj): Linear(\n"," in_features=1024, out_features=1024, bias=True\n"," (lora_dropout): ModuleDict(\n"," (default): Dropout(p=0.1, inplace=False)\n"," )\n"," (lora_A): ModuleDict(\n"," (default): 
Linear(in_features=1024, out_features=8, bias=False)\n"," )\n"," (lora_B): ModuleDict(\n"," (default): Linear(in_features=8, out_features=1024, bias=False)\n"," )\n"," (lora_embedding_A): ParameterDict()\n"," (lora_embedding_B): ParameterDict()\n"," )\n"," (v_proj): Linear(\n"," in_features=1024, out_features=1024, bias=True\n"," (lora_dropout): ModuleDict(\n"," (default): Dropout(p=0.1, inplace=False)\n"," )\n"," (lora_A): ModuleDict(\n"," (default): Linear(in_features=1024, out_features=8, bias=False)\n"," )\n"," (lora_B): ModuleDict(\n"," (default): Linear(in_features=8, out_features=1024, bias=False)\n"," )\n"," (lora_embedding_A): ParameterDict()\n"," (lora_embedding_B): ParameterDict()\n"," )\n"," (q_proj): Linear(\n"," in_features=1024, out_features=1024, bias=True\n"," (lora_dropout): ModuleDict(\n"," (default): Dropout(p=0.1, inplace=False)\n"," )\n"," (lora_A): ModuleDict(\n"," (default): Linear(in_features=1024, out_features=8, bias=False)\n"," )\n"," (lora_B): ModuleDict(\n"," (default): Linear(in_features=8, out_features=1024, bias=False)\n"," )\n"," (lora_embedding_A): ParameterDict()\n"," (lora_embedding_B): ParameterDict()\n"," )\n"," (out_proj): Linear(in_features=1024, out_features=1024, bias=True)\n"," )\n"," (activation_fn): GELUActivation()\n"," (self_attn_layer_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)\n"," (fc1): Linear(in_features=1024, out_features=4096, bias=True)\n"," (fc2): Linear(in_features=4096, out_features=1024, bias=True)\n"," (final_layer_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)\n"," )\n"," )\n"," (layer_norm): LayerNorm((1024,), eps=1e-05, 
elementwise_affine=True)\n",")"]},"metadata":{},"execution_count":5}]},{"cell_type":"code","execution_count":null,"metadata":{"id":"k5Tt5daq7pQv","colab":{"base_uri":"https://localhost:8080/"},"executionInfo":{"status":"ok","timestamp":1691332656229,"user_tz":-60,"elapsed":209,"user":{"displayName":"Unknown","userId":"16317712665857714848"}},"outputId":"d0543a0d-ded5-4daf-a5f5-1d57d55f10a7"},"outputs":[{"output_type":"execute_result","data":{"text/plain":["PeftModel(\n"," (base_model): LoraModel(\n"," (model): BioGptModel(\n"," (embed_tokens): Embedding(42384, 1024, padding_idx=1)\n"," (embed_positions): BioGptLearnedPositionalEmbedding(1026, 1024)\n"," (layers): ModuleList(\n"," (0-23): 24 x BioGptDecoderLayer(\n"," (self_attn): BioGptAttention(\n"," (k_proj): Linear(\n"," in_features=1024, out_features=1024, bias=True\n"," (lora_dropout): ModuleDict(\n"," (default): Dropout(p=0.1, inplace=False)\n"," )\n"," (lora_A): ModuleDict(\n"," (default): Linear(in_features=1024, out_features=8, bias=False)\n"," )\n"," (lora_B): ModuleDict(\n"," (default): Linear(in_features=8, out_features=1024, bias=False)\n"," )\n"," (lora_embedding_A): ParameterDict()\n"," (lora_embedding_B): ParameterDict()\n"," )\n"," (v_proj): Linear(\n"," in_features=1024, out_features=1024, bias=True\n"," (lora_dropout): ModuleDict(\n"," (default): Dropout(p=0.1, inplace=False)\n"," )\n"," (lora_A): ModuleDict(\n"," (default): Linear(in_features=1024, out_features=8, bias=False)\n"," )\n"," (lora_B): ModuleDict(\n"," (default): Linear(in_features=8, out_features=1024, bias=False)\n"," )\n"," (lora_embedding_A): ParameterDict()\n"," (lora_embedding_B): ParameterDict()\n"," )\n"," (q_proj): Linear(\n"," in_features=1024, out_features=1024, bias=True\n"," (lora_dropout): ModuleDict(\n"," (default): Dropout(p=0.1, inplace=False)\n"," )\n"," (lora_A): ModuleDict(\n"," (default): Linear(in_features=1024, out_features=8, bias=False)\n"," )\n"," (lora_B): ModuleDict(\n"," (default): Linear(in_features=8, 
out_features=1024, bias=False)\n"," )\n"," (lora_embedding_A): ParameterDict()\n"," (lora_embedding_B): ParameterDict()\n"," )\n"," (out_proj): Linear(in_features=1024, out_features=1024, bias=True)\n"," )\n"," (activation_fn): GELUActivation()\n"," (self_attn_layer_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)\n"," (fc1): Linear(in_features=1024, out_features=4096, bias=True)\n"," (fc2): Linear(in_features=4096, out_features=1024, bias=True)\n"," (final_layer_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)\n"," )\n"," )\n"," (layer_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)\n"," )\n"," )\n",")"]},"metadata":{},"execution_count":23}],"source":["inference_model"]},{"cell_type":"markdown","metadata":{"id":"ppuc0YsrMwiA"},"source":["## Define the pipeline"]},{"cell_type":"code","execution_count":null,"metadata":{"id":"f9JuA7OTMuw-"},"outputs":[],"source":["# Create the pipeline\n","p = pipeline(\n"," task=\"feature-extraction\",\n"," tokenizer=tokenizer,\n"," model=inference_model,\n"," framework=\"pt\",\n"," device=-1, # use CUDA with 0\n",")"]},{"cell_type":"markdown","metadata":{"id":"dNTtyRnArThB"},"source":["### Get the embeddings of the last token of the last hidden state"]},{"cell_type":"code","execution_count":null,"metadata":{"id":"8TnbZZQ3qqjo"},"outputs":[],"source":["def extract_last_token(last_hidden_states):\n","    \"\"\"Return the final-position vector of every sequence in one pipeline output.\n","\n","    `last_hidden_states` is indexed as [:, -1, :], so it must be three-dimensional;\n","    presumably (batch, tokens, hidden) as produced by the feature-extraction\n","    pipeline - TODO confirm. The last entry along the second axis is returned\n","    and the first axis is kept.\n","    \"\"\"\n","    hidden = np.asarray(last_hidden_states)\n","    return hidden[:, -1, :]\n","\n","# Pull the text column directly instead of iterating over the rows\n","texts = df[\"Chief Complaint\"].tolist()\n","\n","# Process the data using the pipeline\n","results = p(texts)\n","\n","# Extract the last token of the last hidden state\n","embeddings = [extract_last_token(hidden_state) for hidden_state in results]\n","\n","# Every input row must have produced exactly one embedding\n","assert len(embeddings) == len(df)"]},
{"cell_type":"code","execution_count":null,"metadata":{"id":"ZzokAh_VU5c4"},"outputs":[],"source":["# Store one last-token embedding per row in the \"embeddings_biogpt_tuned\" column\n","df[\"embeddings_biogpt_tuned\"] = embeddings\n","\n","# Preview the result instead of printing the whole frame\n","df.head()"]},{"cell_type":"code","execution_count":null,"metadata":{"id":"8OxiKJdDNXI1"},"outputs":[],"source":["df.to_json('/content/drive/MyDrive/Colab Notebooks/dissertation/data/2019_withFT.json', orient='records')"]}],"metadata":{"colab":{"provenance":[],"mount_file_id":"1zefsqyE0xTg_AKcXqJD_MzHpW1g7toaK","authorship_tag":"ABX9TyP1hQT8wEHK4xNkS0wr6ob6"},"kernelspec":{"display_name":"Python 3","name":"python3"},"language_info":{"name":"python"}},"nbformat":4,"nbformat_minor":0} |
Large diffs are not rendered by default.
Large diffs are not rendered by default.
Large diffs are not rendered by default.
Large diffs are not rendered by default.
Large diffs are not rendered by default.
Large diffs are not rendered by default.
Large diffs are not rendered by default.
Large diffs are not rendered by default.
Large diffs are not rendered by default.
Large diffs are not rendered by default.
Large diffs are not rendered by default.