%%capture
# Install Dask or PyArrow for large dataset handling (optional)
%pip install dask pyarrow.

# Install TensorFlow and Keras for LSTM
%pip install tensorflow

# Install Hugging Face Transformers for FinBERT
%pip install transformers

# Install Torch for FinBERT model
%pip install torch

# Install NumPy and Pandas for data manipulation
%pip install numpy pandas

# Install Scikit-learn for data preprocessing and evaluation metrics
%pip install scikit-learn

# Install Matplotlib and Seaborn for data visualization
%pip install matplotlib seaborn

# Install Jupyter Notebook extensions (optional, for enhanced notebook features)
%pip install jupyter_contrib_nbextensions

# Install NLTK for text processing
%pip install nltk

# Install yfinance for stock data
%pip install yfinance

# Ensure TensorFlow is installed
%pip install tensorflow

# Install pandas_market_calendars for market calendars
%pip install pandas-market-calendars

# install wordcloud for word cloud visualization
%pip install wordcloud

# install scipy for scientific computing
%pip install scipy==1.10.1

# install gensim for word2vec
%pip install gensim

# install pandas_datareader for stock data
%pip install pandas_datareader

%pip install --upgrade typing_extensions

%%capture
# Import necessary libraries for the project
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, LSTM, Dropout, Embedding
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
nltk.download('all')
from wordcloud import WordCloud
import matplotlib.pyplot as plt
import seaborn as sns
import yfinance as yf
import matplotlib.pyplot as plt
import seaborn as sns
import pandas_market_calendars as mcal
import gensim
import os
import matplotlib.pyplot as plt
import seaborn as sns

IMPORT_FILES = True
TEST = False

import os
import pandas as pd

# Define the file path
file_path = "C:/Users/argam/Documents/GitHub/FinTrendLSTM/data/all-the-news-2-1.csv"

# Check if the file exists
if not os.path.exists(file_path):
    raise FileNotFoundError(f"File not found at {file_path}. Ensure the dataset is downloaded and placed correctly.")

# Load the dataset
news_df = pd.read_csv(file_path, low_memory=False)

# Build the date column by combining year, month, and day
news_df['date'] = pd.to_datetime(news_df[['year', 'month', 'day']])

# Convert to string format YYYY-MM-DD
news_df['date'] = news_df['date'].dt.strftime('%Y-%m-%d')

# Rename the column to 'Date'
news_df.rename(columns={'date': 'Date'}, inplace=True)

# Define the date range
start_date = "2016-01-01"
end_date = "2020-04-02"

# Filter the dataset for the date range
filtered_news = news_df.loc[(news_df['Date'] >= start_date) & (news_df['Date'] <= end_date)]

import os
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Unified plotting function
def plot_columns(data):
    num_cols = len(data.columns)
    rows = (num_cols // 4) + (num_cols % 4 > 0)  # Create enough rows for 5 columns per row
    fig, axes = plt.subplots(rows, 4, figsize=(20, 12))
    axes = axes.flatten()
    
    for i, column in enumerate(data.columns):
        unique = data[column].nunique()
        missing = data[column].isnull().mean() * 100
        
        ax = axes[i]
        if pd.api.types.is_numeric_dtype(data[column]):
            sns.histplot(data[column].dropna(), kde=False, bins=30, ax=ax)
            ax.set_title(f"{column} | Unique: {unique}, Missing: {missing:.2f}%, Min: {data[column].min()}, Max: {data[column].max()}")
        elif column in ["publication"]:
            plot = sns.countplot(y=data[column], order=data[column].value_counts().index, ax=ax)
            plot.bar_label(plot.containers[0])
            ax.set_title(f"{column} | Unique: {unique}, Missing: {missing:.2f}%")
        else:
            dupes, nulls = data.duplicated(subset=[column]).sum(), data[column].isnull().sum()
            plot = sns.barplot(x=["Duplications", "Missing"], y=[dupes, nulls], ax=ax)
            plot.bar_label(plot.containers[0])
            ax.set_title(f"{column} | Duplications & Missing")
        ax.set_ylabel(column)
        ax.set_xlabel("Count")
        
    for j in range(i + 1, len(axes)):
        fig.delaxes(axes[j])  # Remove unused subplots
    
    plt.tight_layout()
    plt.show()

# Execute plotting
plot_columns(filtered_news)

import gc

# Remove the duplicates by missing by policy provided above
before = len(filtered_news)

# MISSING VALUES 
# Remove the rows with missing values unless the column is an author or section
filtered_news = filtered_news.dropna(subset=['title', 'article', 'Date'])

# Remove the duplicate rows
filtered_news = filtered_news.drop_duplicates()

# Remove the duplicates from the filtered news dataset based on the title column and keep the first occurrence 
filtered_news = filtered_news.drop_duplicates(subset='title', keep='first')

# Now that we have found and removed all the duplicates, we can remove unnecessary columns, i.e., ones that will not be used in the analysis
filtered_news = filtered_news.drop(columns=['year', 'author', 'url', 'section', 'publication'])

after = len(filtered_news)

print(f"We have deleted {before - after} rows and we currently have {after} rows in our DataFrame")
_ = gc.collect()  # Assign to underscore to suppress output

We have deleted 276857 rows and we currently have 2412021 rows in our DataFrame

# Define financial keywords
keywords = [
        'AAPL', 'Apple', 'Apple Inc.', 'iPhone', 'MacBook', 'iPad', 'Apple stock',
        'AMZN', 'Amazon', 'Amazon Inc.', 'AWS', 'Prime', 'Amazon stock'
        'NFLX', 'Netflix', 'Netflix Originals', 'Streaming', 'Netflix stock'
        'stock', 'stocks', 'shares', 'equity', 'price movement', 'market trends',
        'bull market', 'bear market', 'market volatility', 'earnings report',
        'quarterly results', 'forecast', 'valuation', 'trading volume', 'short squeeze',
        'revenue', 'profits', 'losses', 'dividends', 'growth', 'inflation',
        'recession', 'GDP', 'interest rates', 'policy changes'
]

# Create a regex pattern
keyword_pattern = '|'.join(keywords)

# Filter articles based on keywords
filtered_news = filtered_news[
    filtered_news['title'].str.contains(keyword_pattern, case=False, na=False) |
    filtered_news['article'].str.contains(keyword_pattern, case=False, na=False)
]

# Further refine by ensuring both title and article contain financial terms
filtered_news = filtered_news[
    (filtered_news['title'].str.contains(keyword_pattern, case=False, na=False)) &
    (filtered_news['article'].str.contains(keyword_pattern, case=False, na=False))
]

# Save the financial news dataset
# filtered_news_file_path = "C:/Users/argam/Documents/GitHub/FinTrendLSTM/data/filtered_news.csv"
# filtered_news.to_csv(filtered_news_file_path, index=False)
data_head = filtered_news.head(10).copy()

# Truncate the strings in 'title' and 'article' columns
data_head['title'] = data_head['title'].apply(lambda x: x[:15] + '...' if len(x) > 15 else x)
data_head['article'] = data_head['article'].apply(lambda x: x[:15] + '...' if len(x) > 15 else x)

# Initialize a 'Ticker' column with a default value of 'General'
filtered_news['Ticker'] = 'General'

# Define keyword groups for categorization
keywords = {
    'AAPL': ['AAPL', 'Apple', 'Apple Inc.', 'iPhone', 'MacBook', 'iPad', 'Apple stock'],
    'AMZN': ['AMZN', 'Amazon', 'Amazon Inc.', 'AWS', 'Prime', 'Amazon stock'],
    'NFLX': ['NFLX', 'Netflix', 'Netflix Originals', 'Streaming', 'Netflix stock'],
    'General': [
        'stock', 'stocks', 'shares', 'equity', 'price movement', 'market trends',
        'bull market', 'bear market', 'market volatility', 'earnings report',
        'quarterly results', 'forecast', 'valuation', 'trading volume', 'short squeeze',
        'revenue', 'profits', 'losses', 'dividends', 'growth', 'inflation',
        'recession', 'GDP', 'interest rates', 'policy changes'
    ]
}

# Function to categorize articles based on prioritization
def categorize_article(row):
    # Combine 'title' and 'article' for comprehensive keyword matching
    title = row['title'].lower() if isinstance(row['title'], str) else ''
    article = row['article'].lower() if isinstance(row['article'], str) else ''

    # Check title first
    for group, group_keywords in keywords.items():
        if any(keyword.lower() in title for keyword in group_keywords):
            # Return the first matching group unless it is 'General'
            if group != 'General':
                return group

    # If no match in title, check article
    for group, group_keywords in keywords.items():
        if any(keyword.lower() in article for keyword in group_keywords):
            return group

    # Default to 'General' if no match
    return 'General'

# Apply the categorization function to each row
filtered_news['Ticker'] = filtered_news.apply(categorize_article, axis=1)

# Save the categorized news to a CSV file
# categorized_news_file_path = "C:/Users/argam/Documents/GitHub/FinTrendLSTM/data/categorized_news.csv"
# filtered_news.to_csv(categorized_news_file_path, index=False)

# plot the distribution of the news articles by group with labels
plt.figure(figsize=(12, 8))
plot = sns.countplot(x=filtered_news['Ticker'], order=filtered_news['Ticker'].value_counts().index)
plot.bar_label(plot.containers[0])
plt.title("Distribution of News Articles by Ticker")
plt.xlabel("Ticker")
plt.ylabel("Count")
plt.tight_layout()
plt.show()

import pandas as pd
import pandas_market_calendars as mcal
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime

# Define the NYSE calendar
nyse = mcal.get_calendar('NYSE')

# Define date range
start_date = "2016-01-01"
end_date = "2020-04-02"

# Convert 'Date' column to datetime format
filtered_news['Date'] = pd.to_datetime(filtered_news['Date'])

# Check if any rows in 'filtered_news' are outside the valid date range
print(f"Filtered news date range: {filtered_news['Date'].min()} to {filtered_news['Date'].max()}")

# Group by 'Date' and 'Ticker' to count articles per day per stock
news_counts = filtered_news.groupby(['Date', 'Ticker']).size().unstack(fill_value=0)

# Get valid trading days from the NYSE calendar
trading_days = pd.to_datetime(nyse.valid_days(start_date=start_date, end_date=end_date))

# Remove any timezone information from trading_days to match filtered_news dates
trading_days = trading_days.tz_localize(None)

# Reindex to ensure all trading days are present (fill missing with 0)
news_counts = news_counts.reindex(trading_days, fill_value=0)

# Remove "noisy" data: Set values greater than 150 to NaN (or 0)
news_counts[news_counts > 150] = 0

# Count the number of days with 0 articles for at least one ticker
days_with_no_articles = (news_counts == 0).any(axis=1).sum()

# Print the count of days with zero articles for at least one ticker
print(f"Number of days with zero articles for at least one ticker: {days_with_no_articles}")

# Get tickers
tickers = news_counts.columns.tolist()

# Set up subplot grid (2x2 for up to 4 tickers)
fig, axes = plt.subplots(nrows=2, ncols=2, figsize=(14, 10))
axes = axes.flatten()

# Iterate over tickers and plot
for i, ticker in enumerate(tickers[:4]):  # Ensure we only plot up to 4 tickers
    ax = axes[i]
    sns.histplot(news_counts[ticker], bins=20, kde=False, ax=ax, color='blue')
    
    # Calculate statistics
    avg = news_counts[ticker].mean()
    median = news_counts[ticker].median()
    min_articles = news_counts[ticker].min()
    max_articles = news_counts[ticker].max()
    days_below_avg = (news_counts[ticker] < avg).sum()
    days_below_median = (news_counts[ticker] < median).sum()
    
    # Plot vertical lines for average and median
    ax.axvline(avg, color='red', linestyle='--', label=f'Avg: {avg:.1f}')
    ax.axvline(median, color='green', linestyle='--', label=f'Median: {median:.1f}')
    
    # Labels and title
    ax.set_title(f"News Articles per Day - {ticker}")
    ax.set_xlabel("Number of Articles")
    ax.set_ylabel("Trading Days")
    ax.legend()
    
    # Print stats
    print(f"{ticker} - Days below Avg: {days_below_avg}, Days below Median: {days_below_median}, Min: {min_articles}, Max: {max_articles}")

# Remove unused subplots if there are fewer than 4 tickers
for j in range(i + 1, len(axes)):
    fig.delaxes(axes[j])

plt.tight_layout()
plt.show()

Filtered news date range: 2016-01-01 00:00:00 to 2020-04-01 00:00:00
Number of days with zero articles for at least one ticker: 26
AAPL - Days below Avg: 656, Days below Median: 506, Min: 0, Max: 143
AMZN - Days below Avg: 626, Days below Median: 521, Min: 0, Max: 138
General - Days below Avg: 559, Days below Median: 533, Min: 0, Max: 150
NFLX - Days below Avg: 657, Days below Median: 500, Min: 0, Max: 50

import gc
import pandas as pd

max_sample = 1 if TEST else 20

before = len(filtered_news)
    
# Count news articles per stock per day
news_counts = filtered_news.groupby(['Date', 'Ticker']).size().reset_index(name='count')

# Find the minimum news count on any day across all stocks
min_news_per_day = news_counts['count'].min()
min_news_date = news_counts[news_counts['count'] == min_news_per_day]['Date'].unique()

# Count how many stock-days have more than this minimum
above_min_count = (news_counts['count'] > min_news_per_day).sum()

# Apply filtering: randomly sample up to max_sample articles per date per ticker
filtered_news = (
    filtered_news.groupby(['Date', 'Ticker'], group_keys=False)
    .apply(lambda x: x.drop(columns=['Date', 'Ticker']).sample(n=min(len(x), max_sample), random_state=42))
    .reset_index())


# Free memory
gc.collect()

after = len(filtered_news)

print(f"We have deleted {before - after} rows and we currently have {after} rows in our DataFrame")

We have deleted 84998 rows and we currently have 80367 rows in our DataFrame

C:\Users\argam\AppData\Local\Temp\ipykernel_17448\4029723487.py:21: DeprecationWarning: DataFrameGroupBy.apply operated on the grouping columns. This behavior is deprecated, and in a future version of pandas the grouping columns will be excluded from the operation. Either pass `include_groups=False` to exclude the groupings or explicitly select the grouping columns after groupby to silence this warning.
  .apply(lambda x: x.drop(columns=['Date', 'Ticker']).sample(n=min(len(x), max_sample), random_state=42))

from nltk.tokenize import word_tokenize , RegexpTokenizer

# Normalize text
def normalize_text(text):
    return re.sub(r'[^a-z\s]', '', text.lower())

filtered_news['title'] = filtered_news['title'].apply(normalize_text)
filtered_news['article'] = filtered_news['article'].apply(normalize_text)

# Tokenize text
filtered_news['title'] = filtered_news['title'].apply(word_tokenize)
filtered_news['article'] = filtered_news['article'].apply(word_tokenize)

# Define custom stop words
financial_stopwords = {
    'aapl', 'apple', 'amzn', 'amazon', 'nflx', 'netflix', 'stock', 'shares',
    'market', 'growth', 'earnings', 'trading', 'revenue', 'profit', 'economy',
    'price', 'forecast', 'volatility', 'dividends', 'inflation', 'gdp',
    'interest', 'valuation'
}

general_stopwords = set(stopwords.words('english'))
all_stopwords = general_stopwords - financial_stopwords

# Remove stop words
def remove_stopwords(tokens):
    return [word for word in tokens if word not in all_stopwords]

filtered_news['article'] = filtered_news['article'].apply(remove_stopwords)

# Lemmatize text
lemmatizer = nltk.WordNetLemmatizer()

def lemmatize_tokens(tokens):
    return [lemmatizer.lemmatize(token) for token in tokens]

filtered_news['article'] = filtered_news['article'].apply(lemmatize_tokens)

# Generate word clouds
def generate_wordcloud(text, title, ax):
    wordcloud = WordCloud(width=800, height=400, background_color='white').generate(text)
    ax.imshow(wordcloud, interpolation='bilinear')
    ax.axis('off')
    ax.set_title(title)

all_titles = ' '.join([' '.join(tokens) for tokens in filtered_news['title']])
all_articles = ' '.join([' '.join(tokens) for tokens in filtered_news['article']])

fig, axs = plt.subplots(1, 2, figsize=(24, 12))

generate_wordcloud(all_titles, "Title Word Cloud", axs[0])
generate_wordcloud(all_articles, "Article Word Cloud", axs[1])

# Add space between the subplots
fig.subplots_adjust(wspace=0.5)

plt.tight_layout()
plt.show()

news_dataframes = {}

# Names of different preprocessing strategies
names = ['pos', 'bigrams', 'word_embeddings']

# Initialize separate copies for each preprocessing method
for name in names:
    news_dataframes[name] = filtered_news.copy()

# Now you can access each individual dataframe
pos_news = news_dataframes['pos']
bigrams_news = news_dataframes['bigrams']
word_embeddings_news = news_dataframes['word_embeddings']

# Define a function to perform POS tagging
def pos_tagging(tokens):
    return nltk.pos_tag(tokens)

# Apply POS tagging to the 'article' column
pos_news['article'] = pos_news['article'].apply(pos_tagging)
pos_news['title'] = pos_news['title'].apply(pos_tagging)

pos_output_path = "C:/Users/argam/Documents/GitHub/FinTrendLSTM/data/pos_filtered_news_befor_santiment.csv"
pos_news.to_csv(pos_output_path, index=False)
del pos_news
gc.collect()

0

# Handling Bi-grams and Tri-grams
# Define a function to generate n-grams
def generate_ngrams(tokens, n=2):
    return list(nltk.ngrams(tokens, n))

# Apply n-gram generation to the 'article' column
bigrams_news['article'] = bigrams_news['article'].apply(generate_ngrams)
bigrams_news['title'] = bigrams_news['title'].apply(generate_ngrams)

bigrams_output_path = "C:/Users/argam/Documents/GitHub/FinTrendLSTM/data/ngrams_filtered_news_befor_santiment.csv"
bigrams_news.to_csv(bigrams_output_path, index=False)
del bigrams_news
gc.collect()

0

import numpy as np
import pandas as pd
from tqdm.notebook import tqdm
import torch
from transformers import AutoTokenizer, AutoModel

input_path = "C:/Users/argam/Documents/GitHub/FinTrendLSTM/data/filtered_news_befor_finbert_embeddings.csv"
output_path = "C:/Users/argam/Documents/GitHub/FinTrendLSTM/data/filtered_news_with_finbert_embeddings.csv"
error_data = "C:/Users/argam/Documents/GitHub/FinTrendLSTM/data/filtered_news_error_finbert_embeddings.csv"

if IMPORT_FILES:
    word_embeddings_news = pd.read_csv(output_path, low_memory=False)
        
    # Drop NaN rows and remove unwanted columns
    word_embeddings_news = word_embeddings_news.dropna().loc[:, ~word_embeddings_news.columns.str.contains('^Unnamed')]

else:
    
    # ✅ Ensure progress bars display correctly in Jupyter
    tqdm.pandas()


    # ✅ Chunk size for processing
    chunk_size = 5000

    # **Preprocessing Function**
    def preprocess_text(text):
        if isinstance(text, list):  # If column contains lists of tokens
            text = " ".join(text)  # Convert list to string
        return str(text).strip() if pd.notna(text) else ""

    # **Function to generate FinBERT sentence embeddings with error handling**
    def generate_finbert_embeddings(text, index):
        if not isinstance(text, str) or text.strip() == "":
            return np.zeros(768)  # FinBERT outputs 768-dimensional embeddings

        try:
            inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True, max_length=512).to(device)
            with torch.no_grad():
                outputs = finbert_model(**inputs)
                embeddings = outputs.last_hidden_state[:, 0, :].cpu().numpy()  # Move to CPU before conversion

            return embeddings.squeeze()  # Ensure correct shape
        except Exception as e:
            return f"ERROR: {str(e)}"  # Return error message for logging

    # **Load FinBERT Model & Tokenizer**
    device = "cuda" if torch.cuda.is_available() else "cpu"
    print(f"✅ Using device: {device}")
    tokenizer = AutoTokenizer.from_pretrained("yiyanghkust/finbert-tone")
    finbert_model = AutoModel.from_pretrained("yiyanghkust/finbert-tone").to(device)
    finbert_model.eval()  # Set to evaluation mode

    # **Process data in chunks**
    error_rows = []  # List to store rows with errors
    error_details = []  # Store error messages

    for chunk_index, chunk in enumerate(pd.read_csv(input_path, chunksize=chunk_size)):
        print(f"✅ Processing chunk {chunk_index + 1}...")

        # Preprocess the text data (article and title)
        chunk['article'] = chunk['article'].apply(preprocess_text)
        chunk['title'] = chunk['title'].apply(preprocess_text)

        try:
            # Generate FinBERT Embeddings with progress bars
            chunk['article_embeddings'] = chunk.progress_apply(
                lambda row: generate_finbert_embeddings(row['article'], row.name), axis=1
            )
            chunk['title_embeddings'] = chunk.progress_apply(
                lambda row: generate_finbert_embeddings(row['title'], row.name), axis=1
            )

            # Identify and log problematic rows
            errors_in_chunk = chunk[chunk['article_embeddings'].astype(str).str.startswith("ERROR")]
            if not errors_in_chunk.empty:
                error_rows.append(errors_in_chunk.drop(columns=['article_embeddings', 'title_embeddings']))
                error_details.append(errors_in_chunk[['article_embeddings', 'title_embeddings']])

            # Replace error messages with default zero embeddings
            chunk.loc[chunk['article_embeddings'].astype(str).str.startswith("ERROR"), 'article_embeddings'] = list(np.zeros(768))
            chunk.loc[chunk['title_embeddings'].astype(str).str.startswith("ERROR"), 'title_embeddings'] = list(np.zeros(768))

            # Save chunk to output file immediately (without extra unnamed columns)
            chunk.to_csv(output_path, mode='a', header=(chunk_index == 0), index=False)
            print(f"✅ Chunk {chunk_index + 1} processed and saved.")

        except Exception as e:
            print(f"⚠️ Critical Error processing chunk {chunk_index + 1}: {str(e)}")

    # **Save error data if found**
    if error_rows:
        error_df = pd.concat(error_rows, ignore_index=True)
        error_details_df = pd.concat(error_details, ignore_index=True)
        error_df.to_csv(error_data, index=False)
        print(f"⚠️ Errors found in {len(error_rows)} chunks. Check '{error_data}' for details.")

    # **Move model to CPU and free GPU memory**
    finbert_model.to("cpu")
    del finbert_model
    torch.cuda.empty_cache()  # Free GPU memory

    print("✅ Processing complete.")

import numpy as np
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D

# Ensure embeddings are properly extracted
article_embeddings = np.array([np.fromstring(str(embedding).strip("[]"), sep=' ') 
                               for embedding in word_embeddings_news['article_embeddings']])
title_embeddings = np.array([np.fromstring(str(embedding).strip("[]"), sep=' ') 
                             for embedding in word_embeddings_news['title_embeddings']])

# Check dimensions before applying PCA
print(f"Article embeddings shape: {article_embeddings.shape}")
print(f"Title embeddings shape: {title_embeddings.shape}")

# Ensure PCA components don't exceed feature size
n_components = min(3, article_embeddings.shape[1])

# Reduce dimensions using PCA
article_embeddings_3d = PCA(n_components=n_components).fit_transform(article_embeddings)
title_embeddings_3d = PCA(n_components=n_components).fit_transform(title_embeddings)

# Create a 1x2 grid for the plots
fig, axs = plt.subplots(1, 2, figsize=(24, 8), subplot_kw={'projection': '3d'})

# Plot 3D embeddings for articles
axs[0].scatter(article_embeddings_3d[:, 0], article_embeddings_3d[:, 1], article_embeddings_3d[:, 2], color='blue', label='Articles')
axs[0].set_title("3D Word Embeddings for Articles")
axs[0].set_xlabel("X")
axs[0].set_ylabel("Y")
axs[0].set_zlabel("Z")
axs[0].legend()

# Plot 3D embeddings for titles
axs[1].scatter(title_embeddings_3d[:, 0], title_embeddings_3d[:, 1], title_embeddings_3d[:, 2], color='green', label='Titles')
axs[1].set_title("3D Word Embeddings for Titles")
axs[1].set_xlabel("X")
axs[1].set_ylabel("Y")
axs[1].set_zlabel("Z")
axs[1].legend()

plt.tight_layout()
plt.show()

Article embeddings shape: (79451, 768)
Title embeddings shape: (79451, 768)

import pandas as pd
import numpy as np
import pandas_market_calendars as mcal
from datetime import datetime
import sys

loc_to_delete= [
    'news_df',
    'article_embeddings',
    'title_embeddings',
    'filtered_news',
    'all_articles',
    'all_titles',
    'article_embeddings_3d',
    'title_embeddings_3d',
    'news_counts'
]

for name in loc_to_delete:
    if name in locals():
        del globals()[name]  

gc.collect()

# ---------------------------
# Setup NYSE calendar and date processing
# ---------------------------
# Define the NYSE calendar
nyse = mcal.get_calendar('NYSE')

# Before converting, print columns to check the correct name for date column
print("Columns in word_embeddings_news:", word_embeddings_news.columns.tolist())

# Convert 'Date' column to datetime format (adjust the key if your column name is different)
# Here we assume the column is named "Date". If not, change accordingly.
word_embeddings_news['Date'] = pd.to_datetime(word_embeddings_news['Date'], errors='coerce')

# Drop rows with invalid dates
word_embeddings_news = word_embeddings_news.dropna(subset=['Date'])

# Get the minimum and maximum dates in the data
min_date = word_embeddings_news['Date'].min()
max_date = word_embeddings_news['Date'].max()
print(f"Min Date: {min_date}, Max Date: {max_date}")

# Get valid trading days from the NYSE calendar between min and max dates
trading_days = pd.to_datetime(nyse.valid_days(start_date=min_date, end_date=max_date)).tz_localize(None)

# Extract existing news dates (as a set)
news_dates = set(word_embeddings_news['Date'])

# Find missing trading days (those present in trading_days but not in our data)
missing_days = set(trading_days) - news_dates

# Convert missing days to DataFrame, add year column, and count per year
missing_days_df = pd.DataFrame({'Missing Date': list(missing_days)})
missing_days_df['Year'] = missing_days_df['Missing Date'].dt.year
missing_counts = missing_days_df.groupby('Year').size()

print("Missing trading days per year:")
print(missing_counts)

Columns in word_embeddings_news: ['Date', 'month', 'day', 'title', 'article', 'Ticker', 'article_embeddings', 'title_embeddings']
Min Date: 2016-01-01 00:00:00, Max Date: 2020-04-01 00:00:00
Missing trading days per year:
Year
2017    68
dtype: int64

import numpy as np
import pandas as pd
import torch
import gc
from transformers import BertTokenizer, BertForSequenceClassification

# **Check for GPU availability**
device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"✅ Using device: {device}")

# **Load FinBERT model and tokenizer**
tokenizer = BertTokenizer.from_pretrained("yiyanghkust/finbert-tone")
model = BertForSequenceClassification.from_pretrained("yiyanghkust/finbert-tone").to(device)
model.eval()  # Set model to evaluation mode

def calculate_sentiment_with_finbert(texts, batch_size=32):
    """Processes a batch of texts using FinBERT and returns sentiment scores."""
    scores = []
    
    for i in range(0, len(texts), batch_size):
        batch_texts = texts[i:i + batch_size]
        
        # Tokenize inputs
        inputs = tokenizer(
            batch_texts,
            return_tensors="pt",
            truncation=True,
            padding=True,
            max_length=512
        ).to(device)

        # **Inference without gradient calculation**
        with torch.no_grad():
            outputs = model(**inputs)
            sentiments = torch.nn.functional.softmax(outputs.logits, dim=-1)
            sentiment_scores = sentiments[:, 1].cpu().tolist()  # Extract positive sentiment score
            scores.extend(sentiment_scores)

    return scores

✅ Using device: cpu

# **File paths for input and output**
file_paths = {
    "pos": "C:/Users/argam/Documents/GitHub/FinTrendLSTM/data/pos_filtered_news_befor_santiment.csv",
    "bigrams": "C:/Users/argam/Documents/GitHub/FinTrendLSTM/data/ngrams_filtered_news_befor_santiment.csv",
    "word_embeddings": "C:/Users/argam/Documents/GitHub/FinTrendLSTM/data/filtered_news_with_finbert_embeddings.csv"
}

output_paths = {
    name: f"C:/Users/argam/Documents/GitHub/FinTrendLSTM/data/{name}_news_with_sentiment.csv"
    for name in file_paths
}
if IMPORT_FILES:
    for name in names:
        news_dataframes[name] = pd.read_csv(output_paths[name],low_memory=False)
else:
    # **Chunk size for processing**
    chunk_size = 10_000

    # **Process each DataFrame in chunks**
    for name, input_path in file_paths.items():
        output_path = output_paths[name]
        
        print(f"✅ Processing '{name}' dataset...")

        # Read and process the file in chunks
        for chunk_index, chunk in enumerate(pd.read_csv(input_path, chunksize=chunk_size, low_memory=False)):
            print(f"🔹 Processing chunk {chunk_index + 1} for '{name}' dataset...")

            # Drop NaN rows and remove unwanted columns
            chunk = chunk.dropna().loc[:, ~chunk.columns.str.contains('^Unnamed')]

            try:
                # Compute sentiment scores for 'article' and 'title'
                chunk['article_score'] = calculate_sentiment_with_finbert(chunk['article'].tolist(), batch_size=128)
                chunk['title_score'] = calculate_sentiment_with_finbert(chunk['title'].tolist(), batch_size=128)

                # Save processed chunk (append mode to avoid overwriting)
                chunk.to_csv(output_path, mode='a', header=(chunk_index == 0), index=False)
                print(f"✅ Chunk {chunk_index + 1} processed and saved for '{name}' dataset.")

            except Exception as e:
                print(f"⚠️ Error processing chunk {chunk_index + 1} for '{name}': {str(e)}")

            # Free memory after each chunk
            gc.collect()

    # **Free GPU memory after processing all datasets**
    model.to("cpu")
    del model
    torch.cuda.empty_cache()

    print("✅ All datasets processed successfully.")

import pandas as pd
import numpy as np
import ast
import re

def parse_embedding(s):
    """
    Convert an embedding string into a numpy array.
    If the string is well-formed (e.g., with commas), ast.literal_eval works.
    If commas are missing (e.g., "[1.0 2.0 3.0]"), we split on whitespace.
    """
    if not isinstance(s, str):
        return None
    s = s.strip()
    if s.startswith('[') and s.endswith(']'):
        try:
            # First try the literal_eval (works if commas are present)
            return np.array(ast.literal_eval(s), dtype=np.float32)
        except Exception:
            # If that fails, try splitting on whitespace
            inner = s[1:-1].strip()
            # Split by any whitespace
            parts = re.split(r'\s+', inner)
            try:
                return np.array([float(x) for x in parts], dtype=np.float32)
            except Exception:
                return None
    return None

import pandas as pd
import numpy as np
import ast
import re

news_dataframes = {}
names = ['pos', 'bigrams', 'word_embeddings']

ticker_mapping = {'APPLE': 'AAPL', 'AMAZON': 'AMZN', 'NETFLIX': 'NFLX'}

output_paths = {name: f"C:/Users/argam/Documents/GitHub/FinTrendLSTM/data/{name}_news_with_sentiment.csv" for name in names}

for name in names:
    print(f"Processing {name}...")
    
    # Load data
    df = pd.read_csv(output_paths[name], low_memory=False)
    
    # Convert article_score and title_score to float32 if available
    for col in ['article_score', 'title_score']:
        if col in df.columns:
            df[col] = pd.to_numeric(df[col], errors='coerce').astype(np.float32)
    
    # Drop unnecessary columns and rows with missing data
    df.drop(columns=['month', 'day', 'title', 'article'], errors='ignore', inplace=True)
    df.dropna(inplace=True)
    df = df.loc[:, ~df.columns.str.contains('^Unnamed')]
    
    
    # Convert Date column to datetime
    df['Date'] = pd.to_datetime(df['Date'], format='mixed', dayfirst=True, errors='coerce')
    df.dropna(subset=['Date'], inplace=True)
    
    # Standardize Tickers
    df['Ticker'] = df['Ticker'].str.strip().str.upper().replace(ticker_mapping)
    
    # --- If WE dataframe, parse embeddings for all rows ---
    if name == 'word_embeddings':
        for col in ['article_embeddings', 'title_embeddings']:
            if col in df.columns:
                df[col] = df[col].apply(parse_embedding)
    
    # --- Aggregate GENERAL rows from the original df ---
    general_df = df[df['Ticker'] == 'GENERAL'].copy().reset_index(drop=True)
    
    # For scores aggregation (applies to all dfs)
    general_scores = general_df.groupby('Date', as_index=False).agg({
        'article_score': 'mean',
        'title_score': 'mean'
    }).rename(columns={
        'article_score': 'mean_general_article_score',
        'title_score': 'mean_general_title_score'
    })
    
    # For word embeddings, also aggregate embeddings from GENERAL rows
    if name == 'word_embeddings':
        # Remove any rows where embeddings could not be parsed
        general_df = general_df.dropna(subset=['article_embeddings', 'title_embeddings'])
        
        # Group by Date to compute mean embeddings
        general_embeddings = general_df.groupby('Date', as_index=False).agg({
            'article_embeddings': lambda x: np.mean(np.vstack(x), axis=0),
            'title_embeddings': lambda x: np.mean(np.vstack(x), axis=0)
        })
        # Rename columns for clarity
        general_embeddings.rename(columns={
            'article_embeddings': 'mean_general_article_embeddings',
            'title_embeddings': 'mean_general_title_embeddings'
        }, inplace=True)
        
        # Merge the embeddings with the general scores
        general_agg = general_scores.merge(general_embeddings, on='Date', how='left')
    else:
        general_agg = general_scores.copy()
    
    # --- Remove GENERAL rows from the main dataframe ---
    df = df[df['Ticker'] != 'GENERAL'].copy()
    
    # --- Merge aggregated GENERAL features into the main df ---
    df = df.merge(general_agg, on='Date', how='left')
    
    # Fill missing scores with 0
    df['mean_general_article_score'] = df['mean_general_article_score'].fillna(0).astype(np.float32)
    df['mean_general_title_score'] = df['mean_general_title_score'].fillna(0).astype(np.float32)
    
    # For WE df, also fill missing embeddings with zeros of proper shape.
    if name == 'word_embeddings':
        # Determine embedding dimension:
        emb_dim = None
        if 'mean_general_article_embeddings' in df.columns:
            sample_series = df['mean_general_article_embeddings'].dropna()
            if not sample_series.empty:
                sample = sample_series.iloc[0]
                emb_dim = len(sample) if isinstance(sample, np.ndarray) else None
        if emb_dim is None and 'article_embeddings' in df.columns:
            sample = df['article_embeddings'].iloc[0]
            if isinstance(sample, str):
                sample = parse_embedding(sample)
            emb_dim = len(sample) if sample is not None else 300  # default fallback
        if emb_dim is None:
            emb_dim = 300
        
        # Fill missing aggregated embeddings with zeros
        for col in ['mean_general_article_embeddings', 'mean_general_title_embeddings']:
            if col in df.columns:
                df[col] = df[col].apply(lambda x: x if isinstance(x, np.ndarray) else np.zeros(emb_dim, dtype=np.float32))
    
    news_dataframes[name] = df
    
    print(df.describe(include='all'))
    print("-" * 50)

Processing pos...
                                 Date Ticker  article_score   title_score  \
count                           56171  56171   5.617100e+04  5.617100e+04   
unique                            NaN      3            NaN           NaN   
top                               NaN   AMZN            NaN           NaN   
freq                              NaN  24258            NaN           NaN   
mean    2018-03-23 07:54:05.197699840    NaN   4.034433e-03  3.438787e-03   
min               2016-01-01 00:00:00    NaN   6.052462e-07  7.931056e-07   
25%               2017-03-04 00:00:00    NaN   7.372523e-06  1.861340e-05   
50%               2018-04-11 00:00:00    NaN   1.332102e-05  4.169398e-05   
75%               2019-04-25 00:00:00    NaN   2.700800e-05  9.760138e-05   
max               2020-04-01 00:00:00    NaN   9.999988e-01  9.999982e-01   
std                               NaN    NaN   5.533271e-02  5.133732e-02   

        mean_general_article_score  mean_general_title_score  
count                 56171.000000              56171.000000  
unique                         NaN                       NaN  
top                            NaN                       NaN  
freq                           NaN                       NaN  
mean                      0.011738                  0.011091  
min                       0.000000                  0.000000  
25%                       0.000040                  0.000098  
50%                       0.000231                  0.000206  
75%                       0.006956                  0.004714  
max                       0.399279                  0.333360  
std                       0.026333                  0.025524  
--------------------------------------------------
Processing bigrams...
                                 Date Ticker  article_score   title_score  \
count                           56171  56171   56171.000000  5.617100e+04   
unique                            NaN      3            NaN           NaN   
top                               NaN   AMZN            NaN           NaN   
freq                              NaN  24258            NaN           NaN   
mean    2018-03-23 07:54:05.197699840    NaN       0.065034  2.673907e-02   
min               2016-01-01 00:00:00    NaN       0.000001  6.597840e-07   
25%               2017-03-04 00:00:00    NaN       0.000126  1.033489e-04   
50%               2018-04-11 00:00:00    NaN       0.000563  3.579496e-04   
75%               2019-04-25 00:00:00    NaN       0.003596  1.492333e-03   
max               2020-04-01 00:00:00    NaN       1.000000  1.000000e+00   
std                               NaN    NaN       0.221083  1.372901e-01   

        mean_general_article_score  mean_general_title_score  
count                 56171.000000              56171.000000  
unique                         NaN                       NaN  
top                            NaN                       NaN  
freq                           NaN                       NaN  
mean                      0.163091                  0.072481  
min                       0.000000                  0.000000  
25%                       0.088543                  0.016654  
50%                       0.151550                  0.056793  
75%                       0.225799                  0.105065  
max                       0.999969                  0.999856  
std                       0.106357                  0.070983  
--------------------------------------------------
Processing word_embeddings...

c:\Users\argam\anaconda3\envs\FinTrendLSTM\lib\site-packages\pandas\core\indexes\base.py:5360: FutureWarning: elementwise comparison failed; returning scalar instead, but in the future will perform elementwise comparison
  return key in self._engine
c:\Users\argam\anaconda3\envs\FinTrendLSTM\lib\site-packages\pandas\core\indexes\base.py:5360: FutureWarning: elementwise comparison failed; returning scalar instead, but in the future will perform elementwise comparison
  return key in self._engine
c:\Users\argam\anaconda3\envs\FinTrendLSTM\lib\site-packages\pandas\core\indexes\base.py:5360: FutureWarning: elementwise comparison failed; returning scalar instead, but in the future will perform elementwise comparison
  return key in self._engine

                                 Date Ticker  \
count                           55723  55723   
unique                            NaN      3   
top                               NaN   AMZN   
freq                              NaN  24030   
mean    2018-03-23 05:32:44.581950208    NaN   
min               2016-01-01 00:00:00    NaN   
25%               2017-03-03 00:00:00    NaN   
50%               2018-04-11 00:00:00    NaN   
75%               2019-04-25 00:00:00    NaN   
max               2020-04-01 00:00:00    NaN   
std                               NaN    NaN   

                                       article_embeddings  \
count                                               55723   
unique                                              55723   
top     [0.118718304, 0.3063625, -0.6674092, 1.4475749...   
freq                                                    1   
mean                                                  NaN   
min                                                   NaN   
25%                                                   NaN   
50%                                                   NaN   
75%                                                   NaN   
max                                                   NaN   
std                                                   NaN   

                                         title_embeddings  article_score  \
count                                               55723   55723.000000   
unique                                              55723            NaN   
top     [-0.6304295, 0.15601356, -1.2643281, 1.5807698...            NaN   
freq                                                    1            NaN   
mean                                                  NaN       0.145846   
min                                                   NaN       0.000002   
25%                                                   NaN       0.001735   
50%                                                   NaN       0.008870   
75%                                                   NaN       0.069626   
max                                                   NaN       1.000000   
std                                                   NaN       0.299744   

         title_score  mean_general_article_score  mean_general_title_score  \
count   5.572300e+04                55723.000000              55723.000000   
unique           NaN                         NaN                       NaN   
top              NaN                         NaN                       NaN   
freq             NaN                         NaN                       NaN   
mean    7.484687e-02                    0.287576                  0.185701   
min     2.776358e-07                    0.000000                  0.000000   
25%     2.845135e-04                    0.196165                  0.111683   
50%     1.499438e-03                    0.277786                  0.174203   
75%     1.125928e-02                    0.371390                  0.250514   
max     1.000000e+00                    1.000000                  1.000000   
std     2.242050e-01                    0.136161                  0.111371   

                          mean_general_article_embeddings  \
count                                               55723   
unique                                               1712   
top     [-0.2077517, 0.20665751, -0.57893425, 0.775946...   
freq                                                   60   
mean                                                  NaN   
min                                                   NaN   
25%                                                   NaN   
50%                                                   NaN   
75%                                                   NaN   
max                                                   NaN   
std                                                   NaN   

                            mean_general_title_embeddings  
count                                               55723  
unique                                               1712  
top     [-0.81549525, 0.28209424, -1.2759625, 0.803575...  
freq                                                   60  
mean                                                  NaN  
min                                                   NaN  
25%                                                   NaN  
50%                                                   NaN  
75%                                                   NaN  
max                                                   NaN  
std                                                   NaN  
--------------------------------------------------

c:\Users\argam\anaconda3\envs\FinTrendLSTM\lib\site-packages\pandas\core\indexes\base.py:5360: FutureWarning: elementwise comparison failed; returning scalar instead, but in the future will perform elementwise comparison
  return key in self._engine

import os
import yfinance as yf
all_data_file_path = "C:/Users/argam/Documents/GitHub/FinTrendLSTM/data/all_stocks_data.csv"

if IMPORT_FILES:
    # Load the stock data
    financial_data = pd.read_csv(all_data_file_path)
    
    # Convert Date column to datetime
    financial_data['Date'] = pd.to_datetime(financial_data['Date'])
    
    # Standardize Tickers (trim spaces, convert to uppercase)
    financial_data['Ticker'] = financial_data['Ticker'].str.strip().str.upper()

else:
    # Define the list of stock symbols and date range
    stocks = ["AAPL", "AMZN", "NFLX"]
    start_date = "2016-01-01"
    end_date = "2020-12-31"
    stock_data_path = "C:/Users/argam/Documents/GitHub/FinTrendLSTM/data"

    # Add a directory to save the dataframes
    data_frames = {}

    # Ensure the save path exist
    os.makedirs(stock_data_path, exist_ok=True)

    # Loop through the stocks and download the data
    for stock in stocks:
        data = yf.download(stock, start=start_date, end=end_date)

        # Reset the index to make Date a column
        data.reset_index(inplace=True)

        # Add the Ticker column
        data['Ticker'] = stock   

        #print all the unfolded lyres of the dataframe loop ovrer data and unfold it
        data.columns = ['_'.join(col).strip() if isinstance(col, tuple) else col for col in data.columns]

        # rename the columns bu rmoving the everythign after the underscore including the underscore
        data.columns = [col.split("_")[0] for col in data.columns]

        # Keep only necessary columns
        data = data[["Ticker", "Date", "Open", "Close", "High", "Low", "Volume"]]

        # Add the data to the dictionary
        data_frames[stock] = data

        # Save the data to a CSV file
        file_path = os.path.join(stock_data_path, f"{stock}_stock_data.csv")
        data.to_csv(file_path, index=False)
        
    # combine all the dataframes into a single dataframe
    for stock, data in data_frames.items():
        if stock == stocks[0]:
            all_data = data
        else:
            all_data = pd.concat([all_data, data], ignore_index=True, axis=0)

    # Save the combined data to a CSV file
    all_data.to_csv(all_data_file_path, index=False)

import pandas as pd

# Function to calculate RSI
def calculate_rsi(data, window=14):
    delta = data.diff()
    gain = delta.where(delta > 0, 0)
    loss = -delta.where(delta < 0, 0)  

    avg_gain = gain.rolling(window=window, min_periods=1).mean()
    avg_loss = loss.rolling(window=window, min_periods=1).mean()
    
    rs = avg_gain / avg_loss
    rsi = 100 - (100 / (1 + rs))
    
    return rsi

# Function to calculate SMA
def calculate_sma(data, window=14):
    return data.rolling(window=window, min_periods=1).mean()

# Function to calculate Stochastic Oscillator %K
def calculate_stochastic_oscillator(data, window=14):
    low_min = data['Low'].rolling(window=window, min_periods=1).min()
    high_max = data['High'].rolling(window=window, min_periods=1).max()
    stochastic_k = 100 * (data['Close'] - low_min) / (high_max - low_min)
    return stochastic_k

# Ensure each financial metric is calculated per stock ticker
financial_data['RSI'] = financial_data.groupby('Ticker')['Close'].transform(calculate_rsi)
financial_data['SMA'] = financial_data.groupby('Ticker')['Close'].transform(calculate_sma)

# Fix for Stochastic Oscillator %K to return a Series, avoiding the ValueError
financial_data['%K'] = financial_data.groupby('Ticker', group_keys=False)['Close'].transform(
    lambda x: calculate_stochastic_oscillator(financial_data.loc[x.index], window=14)
)

# ✅ **Fix: Correct `Class` Calculation for Regression**
financial_data['Class'] =  financial_data['Close'] - financial_data['Open']

# ✅ **Fix: Normalize Prices**
financial_data['Close'] = financial_data.groupby('Ticker')['Close'].transform(lambda x: (x - x.mean()) / x.std())
financial_data['Open'] = financial_data.groupby('Ticker')['Open'].transform(lambda x: (x - x.mean()) / x.std())
financial_data['High'] = financial_data.groupby('Ticker')['High'].transform(lambda x: (x - x.mean()) / x.std())
financial_data['Low'] = financial_data.groupby('Ticker')['Low'].transform(lambda x: (x - x.mean()) / x.std())

# Debugging: Print min/max values
print(financial_data[['Close', 'Class']].describe())

# Display the first few rows for each column; for 'Ticker' print all unique values
for col in financial_data.columns:
    if col == 'Ticker':
        temp = financial_data[col].unique()
        print(f'{col} :', temp)
        del temp  # Remove temporary variable
    else:
        print(f'{col} :', financial_data[col].min(), financial_data[col].max())

              Close        Class
count  3.774000e+03  3774.000000
mean   7.530925e-17     0.042013
std    9.997349e-01     3.965401
min   -1.456981e+00   -42.479980
25%   -8.684492e-01    -0.582996
50%   -6.745292e-02     0.023996
75%    5.324786e-01     0.666679
max    3.265605e+00    55.030029
Ticker : ['AAPL' 'AMZN' 'NFLX']
Date : 2016-01-04 00:00:00 2020-12-30 00:00:00
Open : -1.4738189148522423 3.3179012392395575
Close : -1.4569807843961589 3.265605330664182
High : -1.4496758524740394 3.2744137826463975
Low : -1.47138237369525 3.2528964121925474
Volume : 1144000 533478800
RSI : 0.0 96.20447525782778
SMA : 21.277334349496023 530.5142822265625
%K : 0.0 100.0
Class : -42.47998046875 55.030029296875

# Normalize function to scale 0 to 1
def normalize_zero(column):
    return (column - column.min()) / (column.max() - column.min())

# Normalize function to scale -1 to 1
def normalize_minus_one(column):
    return 2 * ((column - column.min()) / (column.max() - column.min())) - 1

# Columns that still need normalization
zero_normalized_columns = ['Volume', 'RSI', 'SMA', '%K']
minus_one_normalized_columns = ['Class']

# Apply normalization
for col in zero_normalized_columns:
    financial_data[col] = normalize_zero(financial_data[col])

for col in minus_one_normalized_columns:
    financial_data[col] = normalize_minus_one(financial_data[col])

# Convert columns to float32 for memory efficiency
for col in financial_data.columns:
    if col not in ['Date', 'Ticker']:
        financial_data[col] = financial_data[col].astype(np.float32)

# Debug print
print(financial_data[['Volume', 'RSI', 'SMA', '%K', 'Class']].describe())

            Volume          RSI          SMA           %K        Class
count  3774.000000  3771.000000  3774.000000  3774.000000  3774.000000
mean      0.142606     0.587927     0.217590     0.612324    -0.127844
std       0.126602     0.178183     0.247541     0.295650     0.081333
min       0.000000     0.000000     0.000000     0.000000    -1.000000
25%       0.018515     0.463059     0.040581     0.360667    -0.140663
50%       0.132042     0.596841     0.130074     0.680343    -0.128213
75%       0.210005     0.721643     0.268836     0.878127    -0.115031
max       1.000000     1.000000     1.000000     1.000000     1.000000

import matplotlib.pyplot as plt
import seaborn as sns
import math

# Columns to plot (excluding Date and Ticker)
columns_to_plot = [col for col in financial_data.columns if col not in ['Date', 'Ticker']]
num_columns = 3  # 3 plots per row
num_rows = math.ceil(len(columns_to_plot) / num_columns)  # Auto-adjust rows

# Get unique tickers and assign each a color
unique_tickers = financial_data['Ticker'].unique()
palette = sns.color_palette("husl", len(unique_tickers))  # Generate unique colors

# Create subplots
fig, axes = plt.subplots(nrows=num_rows, ncols=num_columns, figsize=(20, 4 * num_rows))
axes = axes.flatten()  # Flatten for easier indexing

# Generate violin plots with hue to suppress the warning
for i, col in enumerate(columns_to_plot):
    ax = axes[i]
    sns.violinplot(
        data=financial_data,
        x='Ticker',
        y=col,
        hue='Ticker',
        dodge=False,
        ax=ax,
        palette=palette
    )
    ax.set_title(f"{col} Distribution by Stock")
    ax.set_xlabel("Stock")
    ax.set_ylabel(col)
    # Remove legend for clarity (legend duplicates x-axis labels)
    if ax.get_legend():
        ax.get_legend().remove()

# Remove any unused subplots
for j in range(i + 1, len(axes)):
    fig.delaxes(axes[j])

plt.tight_layout()
plt.show()

# Combain the news and financial data
# First we will get one line per date per stock by taking the mean of the article and title scores
# Then we will merge the news and financial data on the date and stock ticker keeping only the financial data and 
# article and title scores
# we will save the new data frames in a dictionary so we will have a unique data frame for each preprocessed strategy
# at the moment we will not use the word embeddings as it is not clear how to combine the data
from sklearn.decomposition import PCA
import os

final_dataframes = {}

for name, df in news_dataframes.items():
    # Merge general news mean values into the main dataset
    df = df.merge(financial_data, on=['Date', 'Ticker'], how='left')

    #Save the data frame as a csv file
    # output_file_path = f"C:/Users/argam/Documents/GitHub/FinTrendLSTM/data/{name}_news_with_financial_data.csv"
    # df.to_csv(output_file_path, index=False)
    final_dataframes[name] = df

import numpy as np

ticker_map = {'AAPL': '1', 'AMZN': '2', 'NFLX': '3'}

for name, data in final_dataframes.items():
    data['Ticker'] = data['Ticker'].replace(ticker_map)
    data['Ticker'] = data['Ticker'].astype('float32')
    data.fillna(0, inplace=True)
    
    
    for col in data.columns:
        if "embeddings" in col:
            sample = data[col].iloc[0]  # Get first element
            if isinstance(sample, np.ndarray):
                print(f"{col} {data[col].dtype}: NumPy array with shape {sample.shape}")
            else:
                print(f"{col} {data[col].dtype}: ❌ NOT NumPy (type: {type(sample)}")
        else:
            print(f"{col} : {data[col].dtype}")
    
    print("\n")  # Add tab spacing per data frame

f_pos_news = final_dataframes['pos']
f_bigrams_news = final_dataframes['bigrams']
f_word_embeddings_news = final_dataframes['word_embeddings']

Date : datetime64[ns]
Ticker : float32
article_score : float32
title_score : float32
mean_general_article_score : float32
mean_general_title_score : float32
Open : float32
Close : float32
High : float32
Low : float32
Volume : float32
RSI : float32
SMA : float32
%K : float32
Class : float32


Date : datetime64[ns]
Ticker : float32
article_score : float32
title_score : float32
mean_general_article_score : float32
mean_general_title_score : float32
Open : float32
Close : float32
High : float32
Low : float32
Volume : float32
RSI : float32
SMA : float32
%K : float32
Class : float32


Date : datetime64[ns]
Ticker : float32
article_embeddings object: NumPy array with shape (768,)
title_embeddings object: NumPy array with shape (768,)
article_score : float32
title_score : float32
mean_general_article_score : float32
mean_general_title_score : float32
mean_general_article_embeddings object: NumPy array with shape (768,)
mean_general_title_embeddings object: NumPy array with shape (768,)
Open : float32
Close : float32
High : float32
Low : float32
Volume : float32
RSI : float32
SMA : float32
%K : float32
Class : float32

import pandas as pd
import pandas_market_calendars as mcal
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime

for name, data in final_dataframes.items():
    # Define the NYSE calendar
    nyse = mcal.get_calendar('NYSE')

    # Convert 'Date' column to datetime format
    data['Date'] = pd.to_datetime(data['Date'])

    # Define date range
    start_date = data['Date'].min()
    end_date = data['Date'].max()

    # Check if any rows in 'data' are outside the valid date range
    print(f"{name} date range: {data['Date'].min()} to {data['Date'].max()}")

    # Group by 'Date' and 'Ticker' to count articles per day per stock
    news_counts = data.groupby(['Date', 'Ticker']).size().unstack(fill_value=0)

    # Get valid trading days from the NYSE calendar
    trading_days = pd.to_datetime(nyse.valid_days(start_date=start_date, end_date=end_date))

    # Remove any timezone information from trading_days to match data dates
    trading_days = trading_days.tz_localize(None)

    # Reindex to ensure all trading days are present (fill missing with 0)
    news_counts = news_counts.reindex(trading_days, fill_value=0)

    # Remove "noisy" data: Set values greater than 150 to NaN (or 0)
    news_counts[news_counts > 150] = 0

    # Count the number of days with 0 articles for at least one ticker
    days_with_no_articles = (news_counts == 0).any(axis=1).sum()

    # Print the count of days with zero articles for at least one ticker
    print(f"Number of days with zero articles for at least one ticker: {days_with_no_articles}")

    # Get tickers
    tickers = news_counts.columns.tolist()

    # Set up subplot grid (2x2 for up to 4 tickers)
    fig, axes = plt.subplots(nrows=2, ncols=2, figsize=(14, 10))
    axes = axes.flatten()

    # Iterate over tickers and plot
    for i, ticker in enumerate(tickers[:4]):  # Ensure we only plot up to 4 tickers
        ax = axes[i]
        sns.histplot(news_counts[ticker], bins=20, kde=False, ax=ax, color='blue')
        
        # Calculate statistics
        avg = news_counts[ticker].mean()
        median = news_counts[ticker].median()
        min_articles = news_counts[ticker].min()
        max_articles = news_counts[ticker].max()
        days_below_avg = (news_counts[ticker] < avg).sum()
        days_below_median = (news_counts[ticker] < median).sum()
        
        # Plot vertical lines for average and median
        ax.axvline(avg, color='red', linestyle='--', label=f'Avg: {avg:.1f}')
        ax.axvline(median, color='green', linestyle='--', label=f'Median: {median:.1f}')
        
        # Labels and title
        ax.set_title(f"Rows per Day on - {ticker}")
        ax.set_xlabel("Number of Articles")
        ax.set_ylabel("Trading Days")
        ax.legend()
        
        # Print stats
        print(f"{ticker} - Days below Avg: {days_below_avg}, Days below Median: {days_below_median}, Min: {min_articles}, Max: {max_articles}")

    # Remove unused subplots if there are fewer than 4 tickers
    for j in range(i + 1, len(axes)):
        fig.delaxes(axes[j])

    plt.tight_layout()
    plt.show()
    print(len(data))

pos date range: 2016-01-01 00:00:00 to 2020-04-01 00:00:00
Number of days with zero articles for at least one ticker: 7
1.0 - Days below Avg: 314, Days below Median: 382, Min: 1, Max: 20
2.0 - Days below Avg: 126, Days below Median: 126, Min: 1, Max: 20
3.0 - Days below Avg: 578, Days below Median: 499, Min: 0, Max: 20

56171
bigrams date range: 2016-01-01 00:00:00 to 2020-04-01 00:00:00
Number of days with zero articles for at least one ticker: 7
1.0 - Days below Avg: 314, Days below Median: 382, Min: 1, Max: 20
2.0 - Days below Avg: 126, Days below Median: 126, Min: 1, Max: 20
3.0 - Days below Avg: 578, Days below Median: 499, Min: 0, Max: 20

56171
word_embeddings date range: 2016-01-01 00:00:00 to 2020-04-01 00:00:00
Number of days with zero articles for at least one ticker: 7
1.0 - Days below Avg: 318, Days below Median: 459, Min: 1, Max: 20
2.0 - Days below Avg: 290, Days below Median: 290, Min: 1, Max: 20
3.0 - Days below Avg: 582, Days below Median: 505, Min: 0, Max: 20

55723

import pandas as pd
import numpy as np

def pad_and_transform(df: pd.DataFrame) -> pd.DataFrame:
    # Exclude Date, Class, and Ticker from feature selection
    feature_cols = [col for col in df.columns if col not in ["Date", "Class", "Ticker"]]
    
    # Ensure proper data types
    df = df.copy()
    df["Date"] = pd.to_datetime(df["Date"])
    df.sort_values(by=["Date", "Ticker"], inplace=True)
    
    transformed_data = []
    first_print = True  # Flag to print the first transformation
    
    for (date, ticker), group in df.groupby(["Date", "Ticker"], sort=False):
        class_value = group["Class"].iloc[0]  # Get the Class label
        group = group[feature_cols].values  # Convert group to numpy array
        num_rows, num_features = group.shape

        # Ensure group size doesn't exceed 20 (shouldn't happen due to slicing)
        if num_rows > 20:
            print(f"Warning: More than 20 rows found for {ticker} on {date}. Trimming excess.")
            group = group[:20]
        elif num_rows < 20:
            pad_rows = np.tile(group[-1], (20 - num_rows, 1))  # Repeat last row
            group = np.vstack([group, pad_rows])

        # Flatten the array into a single row
        flattened_features = group.flatten()

        # Append to transformed data
        transformed_row = [date, ticker] + flattened_features.tolist() + [class_value]
        transformed_data.append(transformed_row)
        
        # Print the first transformed row for debugging
        if first_print:
            print(f"First transformed row:\n{transformed_row}")
            first_print = False  # Disable further printing
    
    # Create final dataframe
    column_names = ["Date", "Ticker"] + [f"{feat}_{i+1}" for i in range(20) for feat in feature_cols] + ["Class"]
    transformed_df = pd.DataFrame(transformed_data, columns=column_names)
    
    return transformed_df

# Run transformation
LSTM_pos_data = pad_and_transform(final_dataframes['pos'])
LSTM_bigrams_data = pad_and_transform(final_dataframes['bigrams'])

First transformed row:
[Timestamp('2016-01-01 00:00:00'), 1.0, 8.950454684963915e-06, 4.4987267756368965e-05, 5.328757833922282e-05, 0.00040433567482978106, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 7.385632216028171e-06, 6.57253767712973e-05, 5.328757833922282e-05, 0.00040433567482978106, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 7.385632216028171e-06, 6.57253767712973e-05, 5.328757833922282e-05, 0.00040433567482978106, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 7.385632216028171e-06, 6.57253767712973e-05, 5.328757833922282e-05, 0.00040433567482978106, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 7.385632216028171e-06, 6.57253767712973e-05, 5.328757833922282e-05, 0.00040433567482978106, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 7.385632216028171e-06, 6.57253767712973e-05, 5.328757833922282e-05, 0.00040433567482978106, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 7.385632216028171e-06, 6.57253767712973e-05, 5.328757833922282e-05, 0.00040433567482978106, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 7.385632216028171e-06, 6.57253767712973e-05, 5.328757833922282e-05, 0.00040433567482978106, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 7.385632216028171e-06, 6.57253767712973e-05, 5.328757833922282e-05, 0.00040433567482978106, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 7.385632216028171e-06, 6.57253767712973e-05, 5.328757833922282e-05, 0.00040433567482978106, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 7.385632216028171e-06, 6.57253767712973e-05, 5.328757833922282e-05, 0.00040433567482978106, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 7.385632216028171e-06, 6.57253767712973e-05, 5.328757833922282e-05, 0.00040433567482978106, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 7.385632216028171e-06, 6.57253767712973e-05, 5.328757833922282e-05, 0.00040433567482978106, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 7.385632216028171e-06, 6.57253767712973e-05, 5.328757833922282e-05, 0.00040433567482978106, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 7.385632216028171e-06, 6.57253767712973e-05, 5.328757833922282e-05, 0.00040433567482978106, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 7.385632216028171e-06, 6.57253767712973e-05, 5.328757833922282e-05, 0.00040433567482978106, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 7.385632216028171e-06, 6.57253767712973e-05, 5.328757833922282e-05, 0.00040433567482978106, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 7.385632216028171e-06, 6.57253767712973e-05, 5.328757833922282e-05, 0.00040433567482978106, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 7.385632216028171e-06, 6.57253767712973e-05, 5.328757833922282e-05, 0.00040433567482978106, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 7.385632216028171e-06, 6.57253767712973e-05, 5.328757833922282e-05, 0.00040433567482978106, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]
First transformed row:
[Timestamp('2016-01-01 00:00:00'), 1.0, 0.0001705357717582956, 0.00022141907538753003, 0.2836473286151886, 0.23861850798130035, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 8.350556890945882e-05, 0.0006421093130484223, 0.2836473286151886, 0.23861850798130035, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 8.350556890945882e-05, 0.0006421093130484223, 0.2836473286151886, 0.23861850798130035, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 8.350556890945882e-05, 0.0006421093130484223, 0.2836473286151886, 0.23861850798130035, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 8.350556890945882e-05, 0.0006421093130484223, 0.2836473286151886, 0.23861850798130035, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 8.350556890945882e-05, 0.0006421093130484223, 0.2836473286151886, 0.23861850798130035, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 8.350556890945882e-05, 0.0006421093130484223, 0.2836473286151886, 0.23861850798130035, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 8.350556890945882e-05, 0.0006421093130484223, 0.2836473286151886, 0.23861850798130035, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 8.350556890945882e-05, 0.0006421093130484223, 0.2836473286151886, 0.23861850798130035, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 8.350556890945882e-05, 0.0006421093130484223, 0.2836473286151886, 0.23861850798130035, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 8.350556890945882e-05, 0.0006421093130484223, 0.2836473286151886, 0.23861850798130035, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 8.350556890945882e-05, 0.0006421093130484223, 0.2836473286151886, 0.23861850798130035, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 8.350556890945882e-05, 0.0006421093130484223, 0.2836473286151886, 0.23861850798130035, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 8.350556890945882e-05, 0.0006421093130484223, 0.2836473286151886, 0.23861850798130035, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 8.350556890945882e-05, 0.0006421093130484223, 0.2836473286151886, 0.23861850798130035, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 8.350556890945882e-05, 0.0006421093130484223, 0.2836473286151886, 0.23861850798130035, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 8.350556890945882e-05, 0.0006421093130484223, 0.2836473286151886, 0.23861850798130035, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 8.350556890945882e-05, 0.0006421093130484223, 0.2836473286151886, 0.23861850798130035, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 8.350556890945882e-05, 0.0006421093130484223, 0.2836473286151886, 0.23861850798130035, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 8.350556890945882e-05, 0.0006421093130484223, 0.2836473286151886, 0.23861850798130035, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]

import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from sklearn.metrics import confusion_matrix, accuracy_score, precision_recall_fscore_support
import seaborn as sns
import matplotlib.pyplot as plt
from torch.utils.tensorboard import SummaryWriter
import os
import shutil
import time
from datetime import datetime

# =============================================================================
# Split data by date
# =============================================================================
unique_dates = LSTM_pos_data['Date'].drop_duplicates().sort_values()
train_cutoff = int(0.8 * len(unique_dates))
train_dates = unique_dates[:train_cutoff].tolist()
test_dates  = unique_dates[train_cutoff:].tolist()

# =============================================================================
# Create datasets
# =============================================================================
class StockDataset(Dataset):
    def __init__(self, df, selected_dates=None, time_window=5):
        if selected_dates is not None:
            df = df[df['Date'].isin(selected_dates)].copy()
        df.sort_values(by=["Ticker", "Date"], inplace=True)
        self.df = df.reset_index(drop=True)
        self.time_window = time_window
        self.feature_cols = [col for col in df.columns if col not in ["Date", "Ticker", "Class"]]
        self.groups = {ticker: group.sort_values(by="Date").reset_index(drop=True) 
                       for ticker, group in self.df.groupby("Ticker")}
        self.sequences = [(ticker, i) for ticker, group in self.groups.items() 
                          if len(group) >= time_window 
                          for i in range(len(group) - time_window + 1)]

    def __len__(self):
        return len(self.sequences)

    def __getitem__(self, idx):
        ticker, start_idx = self.sequences[idx]
        group = self.groups[ticker].iloc[start_idx:start_idx + self.time_window]
        X = group[self.feature_cols].values.astype(np.float32)
        y = group["Class"].iloc[-1]
        return torch.tensor(X, dtype=torch.float32), torch.tensor(y, dtype=torch.float32)

train_dataset = StockDataset(LSTM_pos_data, selected_dates=train_dates)
test_dataset  = StockDataset(LSTM_pos_data, selected_dates=test_dates)

# For consistency with POS naming, rename train_loader as pos_train_loader
batch_size = 64
pos_train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=False)
pos_test_loader  = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

# =============================================================================
# Define Model
# =============================================================================
class ImprovedLSTMModel(nn.Module):
    def __init__(self, input_size, hidden_size=512, num_layers=3, dropout=0.2):
        super(ImprovedLSTMModel, self).__init__()
        self.fc1 = nn.Linear(input_size, 256)
        self.fc2 = nn.Linear(256, 256)
        self.fc3 = nn.Linear(256, 256)
        self.lstm = nn.LSTM(256, hidden_size, num_layers, batch_first=True, dropout=dropout)
        self.fc_out = nn.Linear(hidden_size, 1)

    def forward(self, x):
        x = torch.relu(self.fc1(x))
        x = torch.relu(self.fc2(x))
        x = torch.relu(self.fc3(x))
        lstm_out, _ = self.lstm(x)
        out = self.fc_out(lstm_out[:, -1, :])
        return out.squeeze(1)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
input_size = len(train_dataset.feature_cols)
pos_model = ImprovedLSTMModel(input_size=input_size).to(device)

criterion = nn.MSELoss()
optimizer = optim.Adam(pos_model.parameters(), lr=0.001, weight_decay=1e-5)
scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='min', factor=0.5, patience=3)

# TensorBoard Setup
timestamp = datetime.now().strftime("%Y%m%d-%H%M%S")
log_dir = f"runs/lstm_model_h{512}_{timestamp}"
if os.path.exists(log_dir):
    shutil.rmtree(log_dir)
writer = SummaryWriter(log_dir=log_dir)

# =============================================================================
# Train Model
# =============================================================================
def train_model(model, train_loader, val_loader, criterion, optimizer, epochs, scheduler=None):
    train_losses, val_losses = [], []
    prev_lr = optimizer.param_groups[0]['lr']
    for epoch in range(epochs):
        model.train()
        total_loss = 0
        for X_batch, y_batch in train_loader:
            X_batch, y_batch = X_batch.to(device), y_batch.to(device)
            optimizer.zero_grad()
            outputs = model(X_batch)
            loss = criterion(outputs, y_batch)
            loss.backward()
            optimizer.step()
            total_loss += loss.item()
        avg_train_loss = total_loss / len(train_loader)
        train_losses.append(avg_train_loss)
        model.eval()
        total_val_loss = 0
        with torch.no_grad():
            for X_batch, y_batch in val_loader:
                X_batch, y_batch = X_batch.to(device), y_batch.to(device)
                outputs = model(X_batch)
                loss = criterion(outputs, y_batch)
                total_val_loss += loss.item()
        avg_val_loss = total_val_loss / len(val_loader)
        val_losses.append(avg_val_loss)
        writer.add_scalars("Loss", {"Train": avg_train_loss, "Validation": avg_val_loss}, epoch)
        if scheduler is not None:
            scheduler.step(avg_val_loss)
            current_lr = optimizer.param_groups[0]['lr']
            if current_lr != prev_lr:
                print(f"LR changed to: {current_lr}")
                prev_lr = current_lr
        if epoch % 10 == 0:
            print(f"Epoch [{epoch+1}/{epochs}] | Train Loss: {avg_train_loss:.4f}, Val Loss: {avg_val_loss:.4f}")
    return train_losses, val_losses

num_epochs = 100
train_losses, val_losses = train_model(pos_model, pos_train_loader, pos_test_loader,
                                       criterion, optimizer, num_epochs, scheduler)

plt.figure(figsize=(10, 6))
plt.plot(range(1, num_epochs+1), train_losses, label='Train')
plt.plot(range(1, num_epochs+1), val_losses, label='Validation')
plt.title("Loss per Epoch")
plt.xlabel("Epoch")
plt.ylabel("MSE Loss")
plt.legend()
plt.show()

# =============================================================================
# Evaluate POS Model
# =============================================================================
def evaluate_model(model, loader):
    model.eval()
    true_labels, pred_values = [], []
    with torch.no_grad():
        for X_batch, y_batch in loader:
            X_batch, y_batch = X_batch.to(device), y_batch.to(device)
            outputs = model(X_batch)
            pred_values.extend(outputs.cpu().numpy())
            true_labels.extend(y_batch.cpu().numpy())
    return np.array(true_labels), np.array(pred_values)

# Evaluate model and assign results to new variable names
pos_y_true, pos_y_pred = evaluate_model(pos_model, pos_test_loader)
pos_y_pred_class = (pos_y_pred >= 0).astype(int)
pos_y_true_class = (pos_y_true >= 0).astype(int)

# Compute evaluation metrics using the new variable names
pos_precision, pos_recall, pos_f1, _ = precision_recall_fscore_support(pos_y_true_class, pos_y_pred_class, average='binary')
pos_accuracy = accuracy_score(pos_y_true_class, pos_y_pred_class)

print(f"POS Model - Precision: {pos_precision:.4f}, Recall: {pos_recall:.4f}, F1: {pos_f1:.4f}, Accuracy: {pos_accuracy:.4f}")

# Precision/Recall Bar Plot for POS Model
plt.figure(figsize=(8, 5))
metrics = [pos_precision, pos_recall, pos_f1, pos_accuracy]
labels = ["Precision", "Recall", "F1-Score", "Accuracy"]
sns.barplot(x=labels, y=metrics, palette="viridis")
plt.ylim(0, 1)
plt.title("POS Model Evaluation Metrics")
plt.ylabel("Score")
plt.show()

# Optionally, store evaluation results in a dictionary for later combined visualization
pos_eval_results = {
    'true_labels': pos_y_true,
    'pred_values': pos_y_pred,
    'predicted_classes': pos_y_pred_class,
    'confusion_matrix': confusion_matrix(pos_y_true_class, pos_y_pred_class),
    'precision': pos_precision,
    'recall': pos_recall,
    'f1_score': pos_f1,
    'accuracy': pos_accuracy
}

# Compute and visualize confusion matrix using the new variables
cm_pos = confusion_matrix(pos_y_true_class, pos_y_pred_class)
sns.heatmap(cm_pos, annot=True, fmt='d', cmap='Blues')
plt.title("POS Model Confusion Matrix")
plt.xlabel("Predicted")
plt.ylabel("Actual")
plt.show()

# =============================================================================
# Visualize Weight Distributions & Log to TensorBoard for POS Model
# =============================================================================
for name, param in pos_model.named_parameters():
    if "weight" in name:
        writer.add_histogram(name, param.data.cpu().numpy())

writer.add_graph(pos_model, next(iter(pos_train_loader))[0].to(device))
writer.add_scalar("Eval/Precision", pos_precision, num_epochs)
writer.add_scalar("Eval/Recall", pos_recall, num_epochs)
writer.add_scalar("Eval/F1", pos_f1, num_epochs)
writer.add_scalar("Eval/Accuracy", pos_accuracy, num_epochs)
writer.close()

# =============================================================================
# Save POS Model
# =============================================================================
torch.save(pos_model.state_dict(), "pos_best_lstm.pth")
print("💾 POS Model saved!")

Epoch [1/100] | Train Loss: 0.0053, Val Loss: 0.0064
Epoch [11/100] | Train Loss: 0.0031, Val Loss: 0.0060
LR changed to: 0.0005
Epoch [21/100] | Train Loss: 0.0026, Val Loss: 0.0043
LR changed to: 0.00025
Epoch [31/100] | Train Loss: 0.0011, Val Loss: 0.0025
LR changed to: 0.000125
Epoch [41/100] | Train Loss: 0.0007, Val Loss: 0.0016
Epoch [51/100] | Train Loss: 0.0005, Val Loss: 0.0011
Epoch [61/100] | Train Loss: 0.0004, Val Loss: 0.0007
LR changed to: 6.25e-05
LR changed to: 3.125e-05
Epoch [71/100] | Train Loss: 0.0004, Val Loss: 0.0007
LR changed to: 1.5625e-05
Epoch [81/100] | Train Loss: 0.0002, Val Loss: 0.0004
Epoch [91/100] | Train Loss: 0.0002, Val Loss: 0.0003

POS Model - Precision: 0.9893, Recall: 0.8939, F1: 0.9392, Accuracy: 0.9606

C:\Users\argam\AppData\Local\Temp\ipykernel_17448\414732101.py:176: FutureWarning: 

Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `x` variable to `hue` and set `legend=False` for the same effect.

  sns.barplot(x=labels, y=metrics, palette="viridis")

💾 POS Model saved!

import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from sklearn.metrics import confusion_matrix, accuracy_score, precision_recall_fscore_support
import seaborn as sns
import matplotlib.pyplot as plt
from torch.utils.tensorboard import SummaryWriter
import os
import shutil
from datetime import datetime
import time

# =============================================================================
# Cell 1: Split Bigram Data by Date
# =============================================================================
unique_dates = LSTM_bigrams_data['Date'].drop_duplicates().sort_values()
train_cutoff = int(0.8 * len(unique_dates))
train_dates = unique_dates[:train_cutoff].tolist()
test_dates  = unique_dates[train_cutoff:].tolist()

# =============================================================================
# Cell 2: Create Bigram Dataset
# =============================================================================
class BigramStockDataset(Dataset):
    def __init__(self, df, selected_dates=None, time_window=5):
        if selected_dates is not None:
            df = df[df['Date'].isin(selected_dates)].copy()
        df.sort_values(by=["Ticker", "Date"], inplace=True)
        self.df = df.reset_index(drop=True)
        self.time_window = time_window
        # Use all columns except Date, Ticker, and Class as features
        self.feature_cols = [col for col in df.columns if col not in ["Date", "Ticker", "Class"]]
        self.groups = {ticker: group.sort_values(by="Date").reset_index(drop=True)
                       for ticker, group in self.df.groupby("Ticker")}
        self.sequences = [(ticker, i) for ticker, group in self.groups.items() 
                          if len(group) >= time_window 
                          for i in range(len(group) - time_window + 1)]
        
    def __len__(self):
        return len(self.sequences)
    
    def __getitem__(self, idx):
        ticker, start_idx = self.sequences[idx]
        group = self.groups[ticker].iloc[start_idx:start_idx + self.time_window]
        X = group[self.feature_cols].values.astype(np.float32)
        y = group["Class"].iloc[-1]
        return torch.tensor(X, dtype=torch.float32), torch.tensor(y, dtype=torch.float32)

train_dataset_bigram = BigramStockDataset(LSTM_bigrams_data, selected_dates=train_dates)
test_dataset_bigram  = BigramStockDataset(LSTM_bigrams_data, selected_dates=test_dates)

# =============================================================================
# Cell 3: Create DataLoaders (preserve temporal order)
# =============================================================================
batch_size = 64
train_loader_bigram = DataLoader(train_dataset_bigram, batch_size=batch_size, shuffle=False)
test_loader_bigram  = DataLoader(test_dataset_bigram, batch_size=batch_size, shuffle=False)

# =============================================================================
# Cell 4: Define Bigram LSTM Model
# =============================================================================
# We'll use three dense layers before the LSTM. We set the hidden size to 512 and LSTM layers to 3.
class BigramLSTMModel(nn.Module):
    def __init__(self, input_size, hidden_size=512, num_layers=3, dropout=0.2):
        super(BigramLSTMModel, self).__init__()
        self.fc1 = nn.Linear(input_size, 256)
        self.fc2 = nn.Linear(256, 256)
        self.fc3 = nn.Linear(256, 256)
        self.lstm = nn.LSTM(256, hidden_size, num_layers, batch_first=True, dropout=dropout)
        self.fc_out = nn.Linear(hidden_size, 1)
    
    def forward(self, x):
        x = torch.relu(self.fc1(x))
        x = torch.relu(self.fc2(x))
        x = torch.relu(self.fc3(x))
        lstm_out, _ = self.lstm(x)
        out = self.fc_out(lstm_out[:, -1, :])
        return out.squeeze(1)

# For the Bigram model, input size equals the number of feature columns in the bigram DataFrame.
bigram_input_size = len(train_dataset_bigram.feature_cols)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model_bigram = BigramLSTMModel(input_size=bigram_input_size, hidden_size=512, num_layers=3, dropout=0.2).to(device)

print(model_bigram)
print(f"Number of trainable parameters: {sum(p.numel() for p in model_bigram.parameters() if p.requires_grad)}")

# =============================================================================
# Cell 5: Set up Loss, Optimizer, and Scheduler for Bigram Model
# =============================================================================
criterion_bigram = nn.MSELoss()
optimizer_bigram = optim.Adam(model_bigram.parameters(), lr=0.001, weight_decay=1e-5)
scheduler_bigram = optim.lr_scheduler.ReduceLROnPlateau(optimizer_bigram, mode='min', factor=0.5, patience=3)

# TensorBoard Setup for Bigram model
timestamp = datetime.now().strftime("%Y%m%d-%H%M%S")
log_dir_bigram = f"runs/lstm_bigram_{512}_{timestamp}"
if os.path.exists(log_dir_bigram):
    shutil.rmtree(log_dir_bigram)
writer_bigram = SummaryWriter(log_dir=log_dir_bigram)

# =============================================================================
# Cell 6: Train Bigram Model
# =============================================================================
def train_model_bigram(model, train_loader, val_loader, criterion, optimizer, epochs, scheduler=None):
    train_losses, val_losses = [], []
    prev_lr = optimizer.param_groups[0]['lr']
    for epoch in range(epochs):
        model.train()
        total_loss = 0
        for X_batch, y_batch in train_loader:
            X_batch, y_batch = X_batch.to(device), y_batch.to(device)
            optimizer.zero_grad()
            outputs = model(X_batch)
            loss = criterion(outputs, y_batch)
            loss.backward()
            optimizer.step()
            total_loss += loss.item()
        avg_train_loss = total_loss / len(train_loader)
        train_losses.append(avg_train_loss)
        
        model.eval()
        total_val_loss = 0
        with torch.no_grad():
            for X_batch, y_batch in val_loader:
                X_batch, y_batch = X_batch.to(device), y_batch.to(device)
                outputs = model(X_batch)
                loss = criterion(outputs, y_batch)
                total_val_loss += loss.item()
        avg_val_loss = total_val_loss / len(val_loader)
        val_losses.append(avg_val_loss)
        
        writer_bigram.add_scalars("Loss", {"Train": avg_train_loss, "Validation": avg_val_loss}, epoch)
        
        if scheduler is not None:
            scheduler.step(avg_val_loss)
            current_lr = optimizer_bigram.param_groups[0]['lr']
            if current_lr != prev_lr:
                print(f"LR changed to: {current_lr}")
                prev_lr = current_lr
        
        if epoch % 10 == 0:
            print(f"Epoch [{epoch+1}/{epochs}] | Train Loss: {avg_train_loss:.4f}, Val Loss: {avg_val_loss:.4f}")
    return train_losses, val_losses

num_epochs_bigram = 100
train_losses_bigram, val_losses_bigram = train_model_bigram(model_bigram, train_loader_bigram, test_loader_bigram,
                                                             criterion_bigram, optimizer_bigram, num_epochs_bigram,
                                                             scheduler_bigram)

plt.figure(figsize=(10, 6))
plt.plot(range(1, num_epochs_bigram+1), train_losses_bigram, label='Train')
plt.plot(range(1, num_epochs_bigram+1), val_losses_bigram, label='Validation')
plt.title("Bigram Loss per Epoch")
plt.xlabel("Epoch")
plt.ylabel("MSE Loss")
plt.legend()
plt.show()

# =============================================================================
# Cell 7: Evaluate Bigram Model (Updated Variable Names)
# =============================================================================
def evaluate_model_bigram(model, loader):
    model.eval()
    true_labels, pred_values = [], []
    with torch.no_grad():
        for X_batch, y_batch in loader:
            X_batch, y_batch = X_batch.to(device), y_batch.to(device)
            outputs = model(X_batch)
            pred_values.extend(outputs.cpu().numpy())
            true_labels.extend(y_batch.cpu().numpy())
    return np.array(true_labels), np.array(pred_values)

# Evaluate using model_bigram and test_loader_bigram
bigram_y_true, bigram_y_pred = evaluate_model_bigram(model_bigram, test_loader_bigram)

# Compute binary predictions (assumed threshold 0)
bigram_y_pred_class = (bigram_y_pred >= 0).astype(int)
# Also compute binary ground truth if necessary
bigram_y_true_class = (bigram_y_true >= 0).astype(int)

# Compute evaluation metrics for the Bigram Model
bigram_precision, bigram_recall, bigram_f1, _ = precision_recall_fscore_support(
    bigram_y_true_class, bigram_y_pred_class, average='binary')
bigram_accuracy = accuracy_score(bigram_y_true_class, bigram_y_pred_class)

print(f"Bigram - Precision: {bigram_precision:.4f}, Recall: {bigram_recall:.4f}, " +
      f"F1: {bigram_f1:.4f}, Accuracy: {bigram_accuracy:.4f}")

# Precision/Recall Bar Plot for Bigram Model
plt.figure(figsize=(8, 5))
metrics_bigram = [bigram_precision, bigram_recall, bigram_f1, bigram_accuracy]
labels_bigram = ["Precision", "Recall", "F1-Score", "Accuracy"]
sns.barplot(x=labels_bigram, y=metrics_bigram, palette="viridis")
plt.ylim(0, 1)
plt.title("Bigram Model Evaluation Metrics")
plt.ylabel("Score")
plt.show()

# Compute and visualize confusion matrix using the updated variables
bigram_cm = confusion_matrix(bigram_y_true_class, bigram_y_pred_class)
sns.heatmap(bigram_cm, annot=True, fmt='d', cmap='Blues')
plt.title("Bigram Model Confusion Matrix")
plt.xlabel("Predicted")
plt.ylabel("Actual")
plt.show()

# Save evaluation results into a dictionary for further use (optional)
bigram_eval_results = {
    'bigram_y_true': bigram_y_true,
    'bigram_y_pred': bigram_y_pred,
    'bigram_y_pred_class': bigram_y_pred_class,
    'confusion_matrix': bigram_cm,
    'bigram_precision': bigram_precision,
    'bigram_recall': bigram_recall,
    'bigram_f1': bigram_f1,
    'bigram_accuracy': bigram_accuracy
}

# =============================================================================
# Visualize Weight Distributions & Log to TensorBoard for Bigram Model
# =============================================================================
for name, param in model_bigram.named_parameters():
    if "weight" in name:
        writer_bigram.add_histogram(name, param.data.cpu().numpy())

writer_bigram.add_graph(model_bigram, next(iter(train_loader_bigram))[0].to(device))
writer_bigram.add_scalar("Eval/Precision", bigram_precision, num_epochs_bigram)
writer_bigram.add_scalar("Eval/Recall", bigram_recall, num_epochs_bigram)
writer_bigram.add_scalar("Eval/F1", bigram_f1, num_epochs_bigram)
writer_bigram.add_scalar("Eval/Accuracy", bigram_accuracy, num_epochs_bigram)
writer_bigram.close()

# =============================================================================
# Save Bigram Model
# =============================================================================
torch.save(model_bigram.state_dict(), "bigram_best_lstm.pth")
print("💾 Bigram Model saved!")

BigramLSTMModel(
  (fc1): Linear(in_features=240, out_features=256, bias=True)
  (fc2): Linear(in_features=256, out_features=256, bias=True)
  (fc3): Linear(in_features=256, out_features=256, bias=True)
  (lstm): LSTM(256, 512, num_layers=3, batch_first=True, dropout=0.2)
  (fc_out): Linear(in_features=512, out_features=1, bias=True)
)
Number of trainable parameters: 5973249
Epoch [1/100] | Train Loss: 0.0058, Val Loss: 0.0061
Epoch [11/100] | Train Loss: 0.0032, Val Loss: 0.0060
LR changed to: 0.0005
LR changed to: 0.00025
Epoch [21/100] | Train Loss: 0.0026, Val Loss: 0.0054
Epoch [31/100] | Train Loss: 0.0017, Val Loss: 0.0039
Epoch [41/100] | Train Loss: 0.0016, Val Loss: 0.0034
Epoch [51/100] | Train Loss: 0.0015, Val Loss: 0.0030
Epoch [61/100] | Train Loss: 0.0015, Val Loss: 0.0029
LR changed to: 0.000125
Epoch [71/100] | Train Loss: 0.0009, Val Loss: 0.0018
LR changed to: 6.25e-05
Epoch [81/100] | Train Loss: 0.0010, Val Loss: 0.0017
LR changed to: 3.125e-05
LR changed to: 1.5625e-05
Epoch [91/100] | Train Loss: 0.0007, Val Loss: 0.0016
LR changed to: 7.8125e-06
LR changed to: 3.90625e-06
LR changed to: 1.953125e-06

Bigram - Precision: 0.9387, Recall: 0.4920, F1: 0.6456, Accuracy: 0.8160

C:\Users\argam\AppData\Local\Temp\ipykernel_17448\1408467346.py:197: FutureWarning: 

Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `x` variable to `hue` and set `legend=False` for the same effect.

  sns.barplot(x=labels_bigram, y=metrics_bigram, palette="viridis")

💾 Bigram Model saved!

import pandas as pd
import numpy as np

def pad_and_transform_with_embeddings(df: pd.DataFrame, time_steps: int = 20) -> pd.DataFrame:
    # Define embedding and non-embedding feature columns
    embedding_cols = ['article_embeddings', 'title_embeddings',
                      'mean_general_article_embeddings', 'mean_general_title_embeddings']
    non_embedding_cols = [col for col in df.columns if col not in embedding_cols + ['Date', 'Ticker', 'Class']]
    
    # Ensure proper data types and sorting
    df = df.copy()
    df['Date'] = pd.to_datetime(df['Date'])
    df.sort_values(by=['Date', 'Ticker'], inplace=True)
    
    transformed_data = []
    
    for (date, ticker), group in df.groupby(['Date', 'Ticker'], sort=False):
        class_value = group['Class'].iloc[0]  # Get the Class label
        
        # Extract non-embedding features and pad if necessary
        non_embedding_data = group[non_embedding_cols].values
        num_rows = non_embedding_data.shape[0]
        
        if num_rows > time_steps:
            non_embedding_data = non_embedding_data[:time_steps]
        elif num_rows < time_steps:
            pad_rows = np.tile(non_embedding_data[-1], (time_steps - num_rows, 1))
            non_embedding_data = np.vstack([non_embedding_data, pad_rows])
        
        # Extract embedding features and pad if necessary
        embedding_data = {col: group[col].tolist() for col in embedding_cols}
        for col in embedding_cols:
            if num_rows > time_steps:
                embedding_data[col] = embedding_data[col][:time_steps]
            elif num_rows < time_steps:
                pad_embedding = np.zeros_like(embedding_data[col][-1])
                embedding_data[col].extend([pad_embedding] * (time_steps - num_rows))
        
        # Combine all features
        combined_features = []
        for i in range(time_steps):
            combined_features.extend(non_embedding_data[i])
            for col in embedding_cols:
                combined_features.append(embedding_data[col][i])
        
        transformed_row = [date, ticker] + combined_features + [class_value]
        transformed_data.append(transformed_row)
    
    # Create column names
    column_names = ['Date', 'Ticker']
    for i in range(1, time_steps + 1):
        for col in non_embedding_cols:
            column_names.append(f'{col}_{i}')
        for col in embedding_cols:
            column_names.append(f'{col}_{i}')
    column_names.append('Class')
    
    # Create final DataFrame
    transformed_df = pd.DataFrame(transformed_data, columns=column_names)
    
    return transformed_df

# Example usage:
LSTM_word_embeddings_transformed = pad_and_transform_with_embeddings(f_word_embeddings_news)

import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from sklearn.metrics import confusion_matrix, accuracy_score, precision_recall_fscore_support, mean_squared_error
import seaborn as sns
import matplotlib.pyplot as plt
from torch.utils.tensorboard import SummaryWriter
import os, shutil
from datetime import datetime
import time
from math import sqrt

#############################################
# Parameters
#############################################
TIME_WINDOW = 5      # Length of sliding window in days
PADDED_ROWS = 20     # Number of rows per stock-date pair (after transformation)
EMBEDDING_SIZE = 768 # Embedding vector dimension

#############################################
# Cell 1: Split Transformed WE Data by Date
#############################################
# Assume LSTM_word_embeddings_transformed is your already transformed DataFrame
unique_dates = LSTM_word_embeddings_transformed['Date'].drop_duplicates().sort_values()
train_cutoff = int(0.8 * len(unique_dates))
train_dates = unique_dates[:train_cutoff].tolist()
test_dates  = unique_dates[train_cutoff:].tolist()

#############################################
# Cell 2: Create WE Transformed Dataset (WETfDataset)
#############################################
class WETfDataset(Dataset):
    """
    This dataset uses the transformed WE DataFrame.
    Each row in the DataFrame corresponds to one stock–date pair with flattened features.
    The expected column order is:
      Date, Ticker, then for each day from 1 to PADDED_ROWS:
          [Numeric columns] followed by
          [article_embeddings columns],
          [title_embeddings columns],
          [mean_general_article_embeddings columns],
          [mean_general_title_embeddings columns],
      and then Class.
      
    A sliding window (of TIME_WINDOW days) is built per ticker.
    """
    def __init__(self, df, selected_dates=None, time_window=TIME_WINDOW):
        if selected_dates is not None:
            df = df[df['Date'].isin(selected_dates)].copy()
        df.sort_values(by=["Ticker", "Date"], inplace=True)
        self.df = df.reset_index(drop=True)
        self.time_window = time_window
        
        # All columns except Date, Ticker, Class.
        all_features = [col for col in df.columns if col not in ["Date", "Ticker", "Class"]]
        # Numeric columns: those without "embeddings" in the name.
        self.numeric_cols = [col for col in all_features if "embeddings" not in col]
        # Embedding columns for the four groups:
        self.article_emb_cols = [col for col in all_features if col.startswith("article_embeddings")]
        self.title_emb_cols   = [col for col in all_features if col.startswith("title_embeddings") and "mean_general" not in col]
        self.mean_gen_article_emb_cols = [col for col in all_features if col.startswith("mean_general_article_embeddings")]
        self.mean_gen_title_emb_cols   = [col for col in all_features if col.startswith("mean_general_title_embeddings")]
        
        # Define overall column order.
        self.feature_order = (self.numeric_cols + self.article_emb_cols +
                              self.title_emb_cols + self.mean_gen_article_emb_cols +
                              self.mean_gen_title_emb_cols)
        
        # Group by ticker to build sliding windows.
        self.groups = {ticker: group.sort_values(by="Date").reset_index(drop=True)
                       for ticker, group in self.df.groupby("Ticker")}
        self.sequences = []
        for ticker, group in self.groups.items():
            if len(group) >= time_window:
                for i in range(len(group) - time_window + 1):
                    self.sequences.append((ticker, i))
    
    def __len__(self):
        return len(self.sequences)
    
    def __getitem__(self, idx):
        ticker, start_idx = self.sequences[idx]
        group = self.groups[ticker].iloc[start_idx: start_idx + self.time_window]
        day_vectors = []
        for _, row in group.iterrows():
            vec = []
            for col in self.numeric_cols:
                vec.append(float(row[col]))
            for col in self.article_emb_cols:
                vec.append(row[col])
            for col in self.title_emb_cols:
                vec.append(row[col])
            for col in self.mean_gen_article_emb_cols:
                vec.append(row[col])
            for col in self.mean_gen_title_emb_cols:
                vec.append(row[col])
            day_vectors.append(vec)
        processed_days = []
        for day in day_vectors:
            numeric_part = np.array(day[:len(self.numeric_cols)], dtype=np.float32)
            start = len(self.numeric_cols)
            art_embs = day[start : start + len(self.article_emb_cols)]
            start += len(self.article_emb_cols)
            title_embs = day[start : start + len(self.title_emb_cols)]
            start += len(self.title_emb_cols)
            mean_art_embs = day[start : start + len(self.mean_gen_article_emb_cols)]
            start += len(self.mean_gen_article_emb_cols)
            mean_title_embs = day[start : ]
            
            art_flat = np.concatenate([np.array(x, dtype=np.float32).flatten() for x in art_embs]) if art_embs else np.array([], dtype=np.float32)
            title_flat = np.concatenate([np.array(x, dtype=np.float32).flatten() for x in title_embs]) if title_embs else np.array([], dtype=np.float32)
            mean_art_flat = np.concatenate([np.array(x, dtype=np.float32).flatten() for x in mean_art_embs]) if mean_art_embs else np.array([], dtype=np.float32)
            mean_title_flat = np.concatenate([np.array(x, dtype=np.float32).flatten() for x in mean_title_embs]) if mean_title_embs else np.array([], dtype=np.float32)
            
            day_vector = np.concatenate([numeric_part, art_flat, title_flat, mean_art_flat, mean_title_flat])
            processed_days.append(day_vector)
        X = np.stack(processed_days)
        y = group["Class"].iloc[-1]
        return torch.tensor(X, dtype=torch.float32), torch.tensor(y, dtype=torch.float32)

# Create dataset and dataloaders.
train_dataset_we = WETfDataset(LSTM_word_embeddings_transformed, selected_dates=train_dates, time_window=TIME_WINDOW)
test_dataset_we = WETfDataset(LSTM_word_embeddings_transformed, selected_dates=test_dates, time_window=TIME_WINDOW)
batch_size = 64
train_loader_we = DataLoader(train_dataset_we, batch_size=batch_size, shuffle=False)
test_loader_we = DataLoader(test_dataset_we, batch_size=batch_size, shuffle=False)

#############################################
# Cell 3b: Compute Dimensions for Model Input
#############################################
num_numeric = len(train_dataset_we.numeric_cols)
num_article = len(train_dataset_we.article_emb_cols)
num_title = len(train_dataset_we.title_emb_cols)
num_mean_art = len(train_dataset_we.mean_gen_article_emb_cols)
num_mean_title = len(train_dataset_we.mean_gen_title_emb_cols)

article_total_dim = num_article * EMBEDDING_SIZE
title_total_dim = num_title * EMBEDDING_SIZE
mean_art_total_dim = num_mean_art * EMBEDDING_SIZE
mean_title_total_dim = num_mean_title * EMBEDDING_SIZE

print(f"Numeric dim: {num_numeric}, Article total dim: {article_total_dim}, Title total dim: {title_total_dim}, Mean-Gen Article total dim: {mean_art_total_dim}, Mean-Gen Title total dim: {mean_title_total_dim}")

#############################################
# Cell 4: Define WE Transformed LSTM Model with 4 Branches for Embeddings
#############################################
class WE_Transformed_LSTM_4branch(nn.Module):
    def __init__(self, num_numeric, article_emb_dim, title_emb_dim, mean_art_emb_dim, mean_title_emb_dim, hidden_size=128, num_layers=3):
        """
        The LSTM will process per-day inputs that consist of:
          - A numeric part (length = num_numeric),
          - Plus 4 scalar outputs from 4 feedforward branches, each processing a different embedding category.
        """
        super(WE_Transformed_LSTM_4branch, self).__init__()
        self.num_layers = num_layers
        self.hidden_size = hidden_size
        
        # Feedforward network for article embeddings.
        self.article_net = nn.Sequential(
            nn.Linear(EMBEDDING_SIZE, 256),
            nn.ReLU(),
            nn.Linear(256, 128),
            nn.ReLU(),
            nn.Linear(128, 64),
            nn.ReLU(),
            nn.Linear(64, 1)
        )
        # Feedforward network for title embeddings.
        self.title_net = nn.Sequential(
            nn.Linear(EMBEDDING_SIZE, 256),
            nn.ReLU(),
            nn.Linear(256, 128),
            nn.ReLU(),
            nn.Linear(128, 64),
            nn.ReLU(),
            nn.Linear(64, 1)
        )
        # Feedforward for mean general article embeddings.
        self.mean_art_net = nn.Sequential(
            nn.Linear(EMBEDDING_SIZE, 256),
            nn.ReLU(),
            nn.Linear(256, 128),
            nn.ReLU(),
            nn.Linear(128, 64),
            nn.ReLU(),
            nn.Linear(64, 1)
        )
        # Feedforward for mean general title embeddings.
        self.mean_title_net = nn.Sequential(
            nn.Linear(EMBEDDING_SIZE, 256),
            nn.ReLU(),
            nn.Linear(256, 128),
            nn.ReLU(),
            nn.Linear(128, 64),
            nn.ReLU(),
            nn.Linear(64, 1)
        )
        
        lstm_input_dim = num_numeric + 4  # 4 scalar outputs added to numeric features.
        self.lstm = nn.LSTM(lstm_input_dim, hidden_size, num_layers, batch_first=True, dropout=0.2)
        self.fc_out = nn.Linear(hidden_size, 1)
    
    def forward(self, x):
        # x shape: [batch, time_window, day_feature_dim]
        # Split into sections according to known dimensions.
        numeric_part = x[:, :, :num_numeric]  # [batch, t, num_numeric]
        idx = num_numeric
        article_flat = x[:, :, idx: idx + article_total_dim]  # [batch, t, article_total_dim]
        idx += article_total_dim
        title_flat = x[:, :, idx: idx + title_total_dim]      # [batch, t, title_total_dim]
        idx += title_total_dim
        mean_art_flat = x[:, :, idx: idx + mean_art_total_dim]  # [batch, t, mean_art_total_dim]
        idx += mean_art_total_dim
        mean_title_flat = x[:, :, idx: idx + mean_title_total_dim]  # [batch, t, mean_title_total_dim]
        
        batch_size, t, _ = article_flat.shape
        
        # Reshape to [batch, time_window, PADDED_ROWS, EMBEDDING_SIZE] using .reshape(...).
        article_emb = article_flat.reshape(batch_size, t, PADDED_ROWS, EMBEDDING_SIZE)
        title_emb = title_flat.reshape(batch_size, t, PADDED_ROWS, EMBEDDING_SIZE)
        mean_art_emb = mean_art_flat.reshape(batch_size, t, PADDED_ROWS, EMBEDDING_SIZE)
        mean_title_emb = mean_title_flat.reshape(batch_size, t, PADDED_ROWS, EMBEDDING_SIZE)
        
        # Process each embedding category: merge batch, time, and PADDED_ROWS dimensions, then apply the branch.
        article_out = self.article_net(article_emb.reshape(-1, EMBEDDING_SIZE))  # shape: [batch*t*PADDED_ROWS, 1]
        article_out = article_out.reshape(batch_size, t, PADDED_ROWS, 1).mean(dim=2)  # [batch, t, 1]
        
        title_out = self.title_net(title_emb.reshape(-1, EMBEDDING_SIZE))
        title_out = title_out.reshape(batch_size, t, PADDED_ROWS, 1).mean(dim=2)
        
        mean_art_out = self.mean_art_net(mean_art_emb.reshape(-1, EMBEDDING_SIZE))
        mean_art_out = mean_art_out.reshape(batch_size, t, PADDED_ROWS, 1).mean(dim=2)
        
        mean_title_out = self.mean_title_net(mean_title_emb.reshape(-1, EMBEDDING_SIZE))
        mean_title_out = mean_title_out.reshape(batch_size, t, PADDED_ROWS, 1).mean(dim=2)
        
        # Concatenate numeric features with the four scalar outputs.
        augmented_input = torch.cat((numeric_part, article_out, title_out, mean_art_out, mean_title_out), dim=2)
        # augmented_input shape: [batch, time_window, num_numeric+4]
        
        # Process through LSTM.
        h0 = torch.zeros(self.num_layers, batch_size, self.hidden_size).to(x.device)
        c0 = torch.zeros(self.num_layers, batch_size, self.hidden_size).to(x.device)
        lstm_out, _ = self.lstm(augmented_input, (h0, c0))
        out = self.fc_out(lstm_out[:, -1, :])
        return out.squeeze(1)

#############################################
# Cell 4b: Instantiate WE Transformed 4-Branch LSTM Model
#############################################
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
we_model = WE_Transformed_LSTM_4branch(num_numeric=num_numeric,
                                       article_emb_dim=article_total_dim,
                                       title_emb_dim=title_total_dim,
                                       mean_art_emb_dim=mean_art_total_dim,
                                       mean_title_emb_dim=mean_title_total_dim,
                                       hidden_size=128,
                                       num_layers=3).to(device)

print(we_model)
total_params = sum(p.numel() for p in we_model.parameters() if p.requires_grad)
print(f"Number of trainable parameters: {total_params}")

#############################################
# Cell 5: Set up Loss, Optimizer, Scheduler, and TensorBoard for WE Model
#############################################
criterion = nn.MSELoss()  # Regression loss
optimizer = optim.Adam(we_model.parameters(), lr=0.001, weight_decay=1e-5)
scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='min', factor=0.5, patience=3)

timestamp = datetime.now().strftime("%Y%m%d-%H%M%S")
log_dir = f"runs/we_transformed_4branch_lstm_{timestamp}"
if os.path.exists(log_dir):
    shutil.rmtree(log_dir)
writer = SummaryWriter(log_dir=log_dir)

#############################################
# Cell 6: Train WE Transformed 4-Branch LSTM Model
#############################################
def train_model_we(model, train_loader, val_loader, criterion, optimizer, epochs, scheduler=None):
    train_losses, val_losses = [], []
    prev_lr = optimizer.param_groups[0]['lr']
    for epoch in range(epochs):
        model.train()
        total_loss = 0
        for X_batch, y_batch in train_loader:
            X_batch, y_batch = X_batch.to(device), y_batch.to(device)
            optimizer.zero_grad()
            outputs = model(X_batch)
            loss = criterion(outputs, y_batch)
            loss.backward()
            optimizer.step()
            total_loss += loss.item()
        avg_train_loss = total_loss / len(train_loader)
        train_losses.append(avg_train_loss)
        
        model.eval()
        total_val_loss = 0
        with torch.no_grad():
            for X_batch, y_batch in val_loader:
                X_batch, y_batch = X_batch.to(device), y_batch.to(device)
                outputs = model(X_batch)
                loss = criterion(outputs, y_batch)
                total_val_loss += loss.item()
        avg_val_loss = total_val_loss / len(val_loader)
        val_losses.append(avg_val_loss)
        
        writer.add_scalars("Loss", {"Train": avg_train_loss, "Validation": avg_val_loss}, epoch)
        
        if scheduler is not None:
            scheduler.step(avg_val_loss)
            current_lr = optimizer.param_groups[0]['lr']
            if current_lr != prev_lr:
                print(f"LR changed to: {current_lr}")
                prev_lr = current_lr
        
        if epoch % 10 == 0:
            print(f"Epoch [{epoch+1}/{epochs}] | Train Loss: {avg_train_loss:.4f}, Val Loss: {avg_val_loss:.4f}")
    return train_losses, val_losses

num_epochs = 100
train_losses, val_losses = train_model_we(we_model, train_loader_we, test_loader_we,
                                          criterion, optimizer, num_epochs, scheduler)

plt.figure(figsize=(10, 6))
plt.plot(range(1, num_epochs+1), train_losses, label='Train')
plt.plot(range(1, num_epochs+1), val_losses, label='Validation')
plt.title("WE Transformed 4-Branch Loss per Epoch")
plt.xlabel("Epoch")
plt.ylabel("MSE Loss")
plt.legend()
plt.show()

#############################################
# Cell 7: Evaluate WE Transformed 4-Branch Model (Modified Evaluation Section)
#############################################
def evaluate_model_we(model, loader):
    model.eval()
    true_labels, pred_values = [], []
    with torch.no_grad():
        for X_batch, y_batch in loader:
            X_batch, y_batch = X_batch.to(device), y_batch.to(device)
            outputs = model(X_batch)
            pred_values.extend(outputs.cpu().numpy())
            true_labels.extend(y_batch.cpu().numpy())
    return np.array(true_labels), np.array(pred_values)

we_true, we_pred = evaluate_model_we(we_model, test_loader_we)
we_pred_class = (we_pred >= 0).astype(int)
we_true_class = (we_true >= 0).astype(int)

# Compute confusion matrix and visualize it
we_cm = confusion_matrix(we_true_class, we_pred_class)
sns.heatmap(we_cm, annot=True, fmt='d', cmap='Blues')
plt.title("WE Transformed 4-Branch Confusion Matrix")
plt.xlabel("Predicted")
plt.ylabel("Actual")
plt.show()

# Compute evaluation metrics for the WE model
we_precision, we_recall, we_f1, _ = precision_recall_fscore_support(we_true_class, we_pred_class, average='binary')
we_accuracy = accuracy_score(we_true_class, we_pred_class)
print(f"WE Model - Precision: {we_precision:.4f}, Recall: {we_recall:.4f}, F1: {we_f1:.4f}, Accuracy: {we_accuracy:.4f}")

# Precision/Recall Bar Plot for WE Transformed model (without hue)
plt.figure(figsize=(8, 5))
we_metrics = [we_precision, we_recall, we_f1, we_accuracy]
we_labels = ["Precision", "Recall", "F1-Score", "Accuracy"]
sns.barplot(x=we_labels, y=we_metrics, palette="viridis")
plt.ylim(0, 1)
plt.title("WE Transformed 4-Branch Evaluation Metrics")
plt.ylabel("Score")
plt.show()

# Save evaluation results into a dictionary 
we_eval_results = {
    'true_labels': we_true,
    'pred_values': we_pred,
    'predicted_classes': we_pred_class,
    'confusion_matrix': we_cm,
    'precision': we_precision,
    'recall': we_recall,
    'f1_score': we_f1,
    'accuracy': we_accuracy
}

#############################################
# Cell 8: Visualize WE Model Weight Distributions & Log to TensorBoard
#############################################
for name, param in we_model.named_parameters():
    if "weight" in name:
        writer.add_histogram(name, param.data.cpu().numpy())
writer.add_graph(we_model, next(iter(train_loader_we))[0].to(device))
writer.add_scalar("Eval/Precision", we_precision, num_epochs)
writer.add_scalar("Eval/Recall", we_recall, num_epochs)
writer.add_scalar("Eval/F1", we_f1, num_epochs)
writer.add_scalar("Eval/Accuracy", we_accuracy, num_epochs)
writer.close()

#############################################
# Cell 9: Save WE Transformed 4-Branch Model
#############################################
torch.save(we_model.state_dict(), "stock_we_transformed_4branch_lstm_model.pth")
print("💾 WE Transformed 4-Branch LSTM Model saved!")

Numeric dim: 240, Article total dim: 15360, Title total dim: 15360, Mean-Gen Article total dim: 15360, Mean-Gen Title total dim: 15360
WE_Transformed_LSTM_4branch(
  (article_net): Sequential(
    (0): Linear(in_features=768, out_features=256, bias=True)
    (1): ReLU()
    (2): Linear(in_features=256, out_features=128, bias=True)
    (3): ReLU()
    (4): Linear(in_features=128, out_features=64, bias=True)
    (5): ReLU()
    (6): Linear(in_features=64, out_features=1, bias=True)
  )
  (title_net): Sequential(
    (0): Linear(in_features=768, out_features=256, bias=True)
    (1): ReLU()
    (2): Linear(in_features=256, out_features=128, bias=True)
    (3): ReLU()
    (4): Linear(in_features=128, out_features=64, bias=True)
    (5): ReLU()
    (6): Linear(in_features=64, out_features=1, bias=True)
  )
  (mean_art_net): Sequential(
    (0): Linear(in_features=768, out_features=256, bias=True)
    (1): ReLU()
    (2): Linear(in_features=256, out_features=128, bias=True)
    (3): ReLU()
    (4): Linear(in_features=128, out_features=64, bias=True)
    (5): ReLU()
    (6): Linear(in_features=64, out_features=1, bias=True)
  )
  (mean_title_net): Sequential(
    (0): Linear(in_features=768, out_features=256, bias=True)
    (1): ReLU()
    (2): Linear(in_features=256, out_features=128, bias=True)
    (3): ReLU()
    (4): Linear(in_features=128, out_features=64, bias=True)
    (5): ReLU()
    (6): Linear(in_features=64, out_features=1, bias=True)
  )
  (lstm): LSTM(244, 128, num_layers=3, batch_first=True, dropout=0.2)
  (fc_out): Linear(in_features=128, out_features=1, bias=True)
)
Number of trainable parameters: 1408133
Epoch [1/100] | Train Loss: 0.0060, Val Loss: 0.0072
Epoch [11/100] | Train Loss: 0.0027, Val Loss: 0.0043
LR changed to: 0.0005
Epoch [21/100] | Train Loss: 0.0013, Val Loss: 0.0027
LR changed to: 0.00025
Epoch [31/100] | Train Loss: 0.0009, Val Loss: 0.0019
Epoch [41/100] | Train Loss: 0.0006, Val Loss: 0.0014
Epoch [51/100] | Train Loss: 0.0008, Val Loss: 0.0042
LR changed to: 0.000125
Epoch [61/100] | Train Loss: 0.0004, Val Loss: 0.0006
LR changed to: 6.25e-05
Epoch [71/100] | Train Loss: 0.0004, Val Loss: 0.0007
LR changed to: 3.125e-05
Epoch [81/100] | Train Loss: 0.0003, Val Loss: 0.0005
Epoch [91/100] | Train Loss: 0.0003, Val Loss: 0.0003

WE Model - Precision: 0.9836, Recall: 0.5788, F1: 0.7287, Accuracy: 0.8532

C:\Users\argam\AppData\Local\Temp\ipykernel_17448\2900690780.py:372: FutureWarning: 

Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `x` variable to `hue` and set `legend=False` for the same effect.

  sns.barplot(x=we_labels, y=we_metrics, palette="viridis")

💾 WE Transformed 4-Branch LSTM Model saved!

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import matplotlib.ticker as mtick
from sklearn.metrics import (
    confusion_matrix,
    roc_curve,
    auc,
    precision_recall_curve,
    precision_score,
    recall_score,
    f1_score,
    accuracy_score
)
import math
# ------------------------------------------------------------------------------
# Metrics Bar Chart
# ------------------------------------------------------------------------------
model_names = ['WE Transformed', 'Bigram', 'POS Model']
metrics_dict = {
    'Precision': [we_precision, bigram_precision, pos_precision],
    'Recall':    [we_recall,    bigram_recall,    pos_recall],
    'F1-Score':  [we_f1,        bigram_f1,        pos_f1],
    'Accuracy':  [we_accuracy,  bigram_accuracy,  pos_accuracy]
}

x = np.arange(len(model_names))
width = 0.2
colors = sns.color_palette("Set2", n_colors=len(metrics_dict))

fig, ax = plt.subplots(figsize=(12, 7))
for i, (metric, values) in enumerate(metrics_dict.items()):
    bars = ax.bar(x + i*width, values, width, label=metric, color=colors[i])
    for bar in bars:
        ax.annotate(f'{bar.get_height():.2f}',
                    xy=(bar.get_x() + bar.get_width()/2, bar.get_height()),
                    xytext=(0,5), textcoords="offset points",
                    ha='center', va='bottom', fontsize=9)

ax.set_xlabel('Models')
ax.set_ylabel('Score')
ax.set_title('Evaluation Metric Comparison Across Models')
ax.set_xticks(x + width*(len(metrics_dict)-1)/2)
ax.set_xticklabels(model_names)
ax.yaxis.set_major_formatter(mtick.PercentFormatter(1.0))
ax.set_ylim(0,1.05)
ax.legend(title="Metrics")
ax.grid(axis='y', linestyle='--', alpha=0.7)
plt.tight_layout()
plt.show()

# ------------------------------------------------------------------------------
# Confusion Matrices
# ------------------------------------------------------------------------------
def plot_confusion(ax, true_labels, pred_labels, title):
    cm = confusion_matrix(true_labels, pred_labels, labels=[0,1])
    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', ax=ax,
                xticklabels=[0,1], yticklabels=[0,1], cbar=False)
    ax.set_title(title)
    ax.set_xlabel("Predicted")
    ax.set_ylabel("Actual")

fig, axs = plt.subplots(1, 3, figsize=(18, 5))
plot_confusion(axs[0], we_true_class,       we_pred_class,      "WE Transformed")
plot_confusion(axs[1], bigram_y_true_class, bigram_y_pred_class,"Bigram")
plot_confusion(axs[2], pos_y_true_class,    pos_y_pred_class,   "POS Model")
plt.tight_layout()
plt.show()

# ------------------------------------------------------------------------------
# Score Distributions by True Class 
# ------------------------------------------------------------------------------
fig, axes = plt.subplots(1, 3, figsize=(18, 4), sharey=True)
for ax, name, y_score, y_true in zip(
        axes,
        model_names,
        [we_pred, bigram_y_pred, pos_y_pred],
        [we_true_class, bigram_y_true_class, pos_y_true_class]
    ):
    df = pd.DataFrame({'score': y_score, 'true': y_true})
    sns.kdeplot(
        data=df, x='score', hue='true', fill=True, common_norm=False,
        palette=['C0','C1'], alpha=0.5, ax=ax
    )
    ax.set_title(f"{name}\nScore Distribution by True Class")
    ax.set_xlabel("Predicted Score")
    ax.set_ylabel("Density")
    ax.legend(title='True', labels=['0','1'])
plt.tight_layout()
plt.show()

# ------------------------------------------------------------------------------
# Prediction Error Distributions
# ------------------------------------------------------------------------------
errors = [
    we_pred - we_true,
    bigram_y_pred - bigram_y_true,
    pos_y_pred - pos_y_true
]
fig, axes = plt.subplots(1, 3, figsize=(18, 4), sharey=True)
for ax, name, err in zip(axes, model_names, errors):
    sns.histplot(err, kde=True, stat='density', alpha=0.6, ax=ax)
    ax.set_title(f"{name}\nPrediction Error Distribution")
    ax.set_xlabel("Error = Predicted − True")
    ax.set_ylabel("Density")
plt.tight_layout()
plt.show()

# ------------------------------------------------------------------------------
# Predicted vs. True Scatter
# ------------------------------------------------------------------------------
fig, axes = plt.subplots(1, 3, figsize=(18, 5), sharex=True, sharey=True)
for ax, name, y_true, y_pred in zip(
        axes, model_names,
        [we_true, bigram_y_true, pos_y_true],
        [we_pred, bigram_y_pred, pos_y_pred]
    ):
    ax.scatter(y_true, y_pred, alpha=0.6, s=20)
    mn = min(y_true.min(), y_pred.min())
    mx = max(y_true.max(), y_pred.max())
    ax.plot([mn, mx], [mn, mx], 'k--', lw=1)
    ax.set_title(f"{name}\nPredicted vs. True")
    ax.set_xlabel("True")
    ax.set_ylabel("Predicted")
plt.tight_layout()
plt.show()

# ------------------------------------------------------------------------------  
# Pairwise Model Output Comparisons (Refined Distinct Colors + Dynamic Scaling)  
# ------------------------------------------------------------------------------  
pairs = [
    ('WE Transformed', 'Bigram', we_pred, bigram_y_pred),
    ('WE Transformed', 'POS Model', we_pred, pos_y_pred),
    ('Bigram', 'POS Model', bigram_y_pred, pos_y_pred)
]

colors = ['#1f77b4', '#ff7f0e']  # Blue and Orange

fig, axes = plt.subplots(1, 3, figsize=(18, 5), sharex=False, sharey=False)

for ax, (name_a, name_b, pa, pb) in zip(axes, pairs):
    # Scatter plots for both directions
    ax.scatter(pa, pb, alpha=0.6, s=25, c=colors[0], label=name_a)
    ax.scatter(pb, pa, alpha=0.6, s=25, c=colors[1], label=name_b)

    # Dynamic axis limits
    min_val = min(pa.min(), pb.min())
    max_val = max(pa.max(), pb.max())
    ax.set_xlim(min_val, max_val)
    ax.set_ylim(min_val, max_val)

    # Diagonal line
    ax.plot([min_val, max_val], [min_val, max_val], 'k--', lw=1)

    # Labels and title
    ax.set_title(f"{name_a} vs {name_b}\nPredicted Scores")
    ax.set_xlabel(f"{name_a} Score")
    ax.set_ylabel(f"{name_b} Score")
    ax.legend()

plt.tight_layout()
plt.show()

Feature	Count	Mean	Std	Min	25%	50%	75%	Max
Close	3774	~0.00	1.00	-1.46	-0.87	-0.07	0.53	3.27
Class	3774	~0.00	0.04	-0.33	-0.013	-0.00012	0.014	0.42

Column	Min	Max
`Open`	-1.47	3.32
`Close`	-1.46	3.27
`High`	-1.45	3.27
`Low`	-1.47	3.25
`Volume`	0.0	1.0
`RSI`	0.0	96.20
`SMA`	-1.41	2.98
`%K`	-69.47	280.99
`Class`	-0.33	0.42

Metric	WE Transformed	Bigram	POS Model
Precision	0.9670	0.9593	0.9897
Recall	0.5659	0.3794	0.6206
F1-Score	0.7140	0.5438	0.7628
Accuracy	0.8456	0.7831	0.8686

Model	True Negatives (TN)	False Positives (FP)	False Negatives (FN)	True Positives (TP)
WE Transformed	596	6	135	176
Bigram	597	5	193	118
POS Model	600	2	118	193

Comparing Preprocessing Methods of Textual Data for Semantic Analysis in Stock Trend Predictions¶

Abstract¶

Introduction¶

Setting requirements¶

Pip, Imports and constants¶

NLP for Market Analysis¶

Data Collection and Feature Extraction¶

School 1: Technical + Textual Data¶

School 2: Textual Data Only¶

Preprocessing: From Text to Numerical Data¶

Tokenization¶

POS Tagging and Chunking¶

Bag of Words (BoW)¶

N-Grams¶

Text Normalization¶

Stemming vs. Lemmatization¶

Stop Word Removal¶

Word Embedding (WE): Advanced Vectorization¶

Benefits of WE:¶

WE Techniques:¶

Evaluation of Embeddings:¶

Challenges:¶

Popular Pre-trained Embeddings:¶

Applications of Word Embedding:¶

Future Directions:¶

Preprocessing Examples in Research¶

Tradeoffs in Preprocessing:¶

Textual data preparations¶

Data Exploration & Visualization All the News 2.1 Dataset Review¶

plotting visualization of count duplicates and missing values ​​in news data set¶

Data cleaning¶

Creating Financial News Data¶

Categorizing Financial News Articles¶

Visualize new categorised data¶

Aligning News Data with NYSE Trading Calendar¶

Dataset Characteristics and Distribution Insights¶

Controlled Sampling Strategy: Why 20 Articles per Day?¶

Text Normalization and Dimensionality Reduction in NLP¶

Text normalization¶

Stemming¶

Word Cloud Visualization¶

Unique data frames cration¶

Part-of-speech (POS) labeling:¶

N-grams:¶

Advanced Dimensionality Reduction Techniques in NLP: Word Embedding:¶

Dimensionality Reduction and Visualization of Word Embeddings¶

Plot the embeddings¶

Calculate sentiment scores for each DataFrame:¶

FinBERT-Based Sentiment Analysis¶

remove last Rows to lower dimantions¶

Financial data¶

Dual Approaches in Financial NLP Research¶

1. Technical + Textual Data Integration¶

2. Text-Only Approaches¶

Key Technical Indicators Used in Forecasting¶

Relative Strength Index (RSI)¶

Simple Moving Average (SMA)¶

Exponential Moving Average (EMA)¶

Moving Average Convergence Divergence (MACD)¶

On-Balance Volume (OBV)¶

Adjusted Closing Price¶

Augmenting Technical Data with NLP Features¶

Example: Multi-Source Textual Data¶

Yahoo Finance Review¶

Overview¶

Comprehensive Data¶

User-Friendly Interface¶

Free Basic Features¶

Limited Advanced Features¶

Yahoo Finance Premium (Plus)¶

Features¶

Conclusion¶

Normalization¶

Setting final merged data frame¶

The Models¶

RNN & LSTM:¶

POS Model¶

Bigrams Model¶

Word Embedding Model¶

Comparation of Models¶

plotting visualization of count duplicates and missing values in news data set¶