Importing the pandas library
Business understanding: import pandas as pd
Importing the train_test_split function
Business understanding: from sklearn.model_selection import train_test_split
Importing the DecisionTreeClassifier class
Business understanding: from sklearn.tree import DecisionTreeClassifier
Importing the metrics module
Business understanding: from sklearn import metrics
Importing the matplotlib library
Business understanding: import matplotlib.pyplot as plt
Importing the Counter class
Business understanding: from collections import Counter
Importing the confusion_matrix function
Business understanding: from sklearn.metrics import confusion_matrix
Importing the pandas library
Business understanding: import pandas as pd
Importing the train_test_split function
Business understanding: from sklearn.model_selection import train_test_split
Importing the DecisionTreeClassifier class
Business understanding: from sklearn.tree import DecisionTreeClassifier
Importing the metrics module
Business understanding: from sklearn import metrics
Importing the NumPy library
Business understanding: import numpy as np
Importing the Seaborn library
Business understanding: import seaborn as sns
Importing the pandas library
Business understanding: import pandas as pd
Importing the train_test_split function
Business understanding: from sklearn.model_selection import train_test_split
Importing the DecisionTreeClassifier class
Business understanding: from sklearn.tree import DecisionTreeClassifier
Importing the metrics module
Business understanding: from sklearn import metrics
Importing the Counter class
Business understanding: from collections import Counter
Importing the confusion_matrix function
Business understanding: from sklearn.metrics import confusion_matrix
Importing the NumPy library
Business understanding: import numpy as np
Importing the accuracy_score function
Business understanding: from sklearn.metrics import accuracy_score
Importing the precision_recall_fscore_support function
Business understanding: from sklearn.metrics import precision_recall_fscore_support
Importing the pandas library
Business understanding: import pandas as pd
Importing the train_test_split function
Business understanding: from sklearn.model_selection import train_test_split
Importing the matplotlib library
Business understanding: import matplotlib.pyplot as plt
Importing the NumPy library
Business understanding: import numpy as np
Importing DecisionTreeRegressor for regression prediction
Business understanding: from sklearn.tree import DecisionTreeRegressor
Importing the pandas library
Business understanding: import pandas as pd
Importing the train_test_split function
Business understanding: from sklearn.model_selection import train_test_split
Importing the Counter class
Business understanding: from collections import Counter
Importing the StandardScaler class
Business understanding: from sklearn.preprocessing import StandardScaler
Importing the LogisticRegression class
Business understanding: from sklearn.linear_model import LogisticRegression
Importing the matplotlib library
Business understanding: import matplotlib.pyplot as plt
Importing the NumPy library
Business understanding: import numpy as np
Importing the array constructor from NumPy
Business understanding: from numpy import array
Importing ListedColormap for visualization
Business understanding: from matplotlib.colors import ListedColormap
Setting inline mode for Jupyter plot display
Business understanding: %matplotlib inline
Importing the random module for random value generation
Business understanding: import random
Importing the matplotlib library
Business understanding: import matplotlib.pyplot as plt
Importing the NumPy library
Business understanding: import numpy as np
Getting the input data matrix shape
Data preparation: m, n = X.shape
Creating a bias column for the data matrix
Data preparation: bias = np.ones((X.shape[0], 1))
Expanding the data matrix with the bias column
Data preparation: biased_X = np.hstack((bias, X))
Initializing a random number generator with a fixed seed
Data preparation: random_gen = np.random.RandomState(1)
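As a quick sanity check of the bias trick, a minimal sketch with a toy 2x2 matrix (not the project's data) showing what np.hstack produces:

import numpy as np

X = np.array([[2.0, 3.0],
              [4.0, 5.0]])            # toy feature matrix, two samples
bias = np.ones((X.shape[0], 1))       # one bias entry per sample
biased_X = np.hstack((bias, X))       # bias becomes the first column
print(biased_X)                       # [[1. 2. 3.]
                                      #  [1. 4. 5.]]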
Importing the pandas library
Business understanding: import pandas as pd
Importing the train_test_split function
Business understanding: from sklearn.model_selection import train_test_split
Importing the Keras Dense layer for model building
Business understanding: from tensorflow.keras.layers import Dense
Importing the Keras Sequential API
Business understanding: from tensorflow.keras import Sequential
Importing the pandas library
Business understanding: import pandas as pd
Importing the train_test_split function
Business understanding: from sklearn.model_selection import train_test_split
Importing the Counter class
Business understanding: from collections import Counter
Importing the NumPy library
Business understanding: import numpy as np
Importing the Seaborn library
Business understanding: import seaborn as sns
Importing the Keras Dense layer for model building
Business understanding: from tensorflow.keras.layers import Dense
Importing the Keras Sequential API
Business understanding: from tensorflow.keras import Sequential
Importing one-hot encoding tools
Business understanding: from tensorflow.keras.utils import to_categorical
Importing the pandas library
Business understanding: import pandas as pd
Importing the re module for text processing
Business understanding: import re
Importing the pandas library
Business understanding: import pandas as pd
Importing the Counter class
Business understanding: from collections import Counter
Importing the re module for text processing
Business understanding: import re
Importing the NLP toolkit (NLTK)
Business understanding: import nltk
Importing the HTTP request library
Business understanding: import requests
Importing the HTML parser from lxml
Business understanding: from lxml import html
Downloading the NLTK 'punkt' tokenizer models
Business understanding: nltk.download('punkt')
Importing the word tokenizer
Business understanding: from nltk.tokenize import word_tokenize
Loading the dataset
Data understanding: diabetes = pd.read_csv('diabetes_inbalanced.csv', index_col=0)
Loading the dataset
Data understanding: titanic = pd.read_csv('titanic.csv')
Loading the dataset
Data understanding: diabetes = pd.read_csv('diabetes_inbalanced.csv', index_col=0)
Loading the dataset
Data understanding: df = pd.read_csv('ice_cream_data.csv', sep=";")
Loading the dataset
Data understanding: df = pd.read_csv('Heart.csv')
Defining training data (input examples and expected outputs)
Data preparation: training_data = [
    (array([121, 16.8]), 1),
    (array([114, 15.2]), 1),
    (array([210, 9.4]), -1),
    (array([195, 8.1]), -1),
]
Alternative training set for the XOR problem
Data preparation: training_data = [
    (array([3, -2]), -1),
    (array([3, 1]), 1),
    (array([2, 0]), -1),
]
Generating linearly separable data with two classes (make_blobs needs the datasets import)
Data preparation: from sklearn import datasets
X, y = datasets.make_blobs(n_samples=100, n_features=2,
                           centers=2, cluster_std=1,
                           random_state=3)
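A minimal sketch to eyeball the two generated classes, reusing the same make_blobs parameters (self-contained; colors follow the class label):

import matplotlib.pyplot as plt
from sklearn import datasets

X, y = datasets.make_blobs(n_samples=100, n_features=2,
                           centers=2, cluster_std=1, random_state=3)
plt.scatter(X[:, 0], X[:, 1], c=y, cmap='bwr')  # red/blue by class
plt.show()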
Test data for model error calculation
Data preparation: mal_byt = np.array([1, 2, 3, 4])
bol = np.array([1, 0, 2, 5])
Loading the dataset
Data understanding: titanic = pd.read_csv('titanic.csv')
Loading the darts dataset
Data understanding: data = pd.read_csv('darts.csv')
Opening a text file for reading
Data understanding: text_file = open('human_rights.txt', 'r')
Loading the text file content
Data understanding: h_rights = text_file.read()
Loading tweets from a CSV file
Data understanding: tweets = pd.read_csv("tweets.csv")
Sample tweet for regex demonstration
Data preparation: tweet = "@nltk T awesome! #regex #pandas #python"
Sample text for NLP operations
Data preparation: text = "The cat is in the box. The cat likes the box. The box is over the cat."
Printing the contents of the 'osoba' (person) variable
Data preparation: osoba
Displaying the scaled training data
Data understanding: X_train_scaled
Displaying the first 4 rows of the expanded matrix
Data preparation: biased_X[:4]
Displaying the first 4 predicted values
Data preparation: output_pred[:4]
Displaying the first 4 model errors
Data preparation: errors[:4]
Encoding gender
Data preparation: titanic['Sex'] = titanic['Sex'].replace({'male': 0, 'female': 1})
Reshaping the input data to a 2D array for model compatibility
Data preparation: osoba = osoba.reshape(1, -1)
Removing the 'Unnamed: 0' column
Data preparation: df = df.drop(columns='Unnamed: 0')
Converting the ChestPain category to numerical values
Data preparation: df['ChestPain'] = df['ChestPain'].astype('category')
df['ChestPain'] = df['ChestPain'].cat.codes
Converting the Thal category to numerical values
Data preparation: df['Thal'] = df['Thal'].astype('category')
df['Thal'] = df['Thal'].cat.codes
Converting the AHD category to numerical values
Data preparation: df['AHD'] = df['AHD'].astype('category')
df['AHD'] = df['AHD'].cat.codes
Scaling the training data
Data preparation: X_train_scaled = scaler.fit_transform(X_train)
Scaling the test data
Data preparation: X_test_scaled = scaler.transform(X_test)
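The scaler is fit on the training split only; the test split reuses the training mean and standard deviation. A self-contained sketch with toy values:

import numpy as np
from sklearn.preprocessing import StandardScaler

X_train = np.array([[1.0], [2.0], [3.0]])
X_test = np.array([[2.0]])
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)  # learns mean/std from train
X_test_scaled = scaler.transform(X_test)        # applies the same statistics
print(X_train_scaled.mean(), X_train_scaled.std())  # ~0.0, ~1.0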
Encoding gender
Data preparation: titanic['Sex'] = titanic['Sex'].replace({'male': 0, 'female': 1})
Extending the class encoding for multi-class classification
Data preparation: multi = data
multi["competitor"] = multi["competitor"].replace({'Steve': 0, 'Susan': 1, 'Michael': 2, 'Kate': 3})
Selecting all dataset features except the target variable, defining the target variable
Data preparation: X = diabetes[diabetes.columns.difference(['Outcome'])]
y = diabetes['Outcome']
y = y.astype('int')
Selecting features and target variable (the train/test split itself follows later)
Data preparation: X = titanic[titanic.columns.difference(['Survived'])]
y = titanic['Survived']
y = y.astype('int')
Defining a test vector representing an individual with various attributes; see the column-order check after this block
Data preparation: osoba = np.array([10,  # age
                  0,   # fare
                  0,   # parents/children
                  1,   # pclass
                  0,   # sex
                  3])  # siblings/spouses
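Note that columns.difference() returns the remaining columns in sorted (alphabetical) order, which is why the vector above lists age, fare, parents/children, pclass, sex, siblings/spouses. A hedged check; the column names here are stand-ins for the real titanic.csv header:

import pandas as pd

# Hypothetical column names assumed for illustration.
titanic = pd.DataFrame(columns=['Age', 'Fare', 'Parch', 'Pclass', 'Sex', 'SibSp', 'Survived'])
X = titanic[titanic.columns.difference(['Survived'])]
print(list(X.columns))  # sorted: ['Age', 'Fare', 'Parch', 'Pclass', 'Sex', 'SibSp']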
Selecting all dataset features except the target variable, defining the target variable
Data preparation: X = diabetes[diabetes.columns.difference(['Outcome'])]
y = diabetes['Outcome']
y = y.astype('int')
Separating the input features (X) from the target variable (y)
Data preparation: X = df.drop(['Revenue'], axis=1)
y = df['Revenue']
Selecting the Temperature and Revenue attributes
Data preparation: X = df['Temperature'].values
y = df['Revenue'].values
Creating the input frame for visualization
Data preparation: vstup = df.drop(["Revenue"], axis=1)
Separating the input features (X) from the target variable (y)
Data preparation: X = df.drop(columns="AHD")
y = df['AHD']
Selecting features and target variable (the train/test split itself follows later)
Data preparation: X = titanic[titanic.columns.difference(['Survived'])]
y = titanic['Survived']
y = y.astype('int')
Separating input features from the target variable
Data preparation: X = vyber[vyber.columns.difference(['competitor'])]
y = vyber['competitor']
y = y.astype('int')
Preparing data for 4-class classification
Data preparation: X = multi[multi.columns.difference(['competitor'])]
y = to_categorical(multi['competitor'])
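to_categorical turns integer class codes into one-hot rows; a quick standalone demo of what y looks like for the four classes:

from tensorflow.keras.utils import to_categorical

print(to_categorical([0, 1, 2, 3]))
# [[1. 0. 0. 0.]
#  [0. 1. 0. 0.]
#  [0. 0. 1. 0.]
#  [0. 0. 0. 1.]]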
Splitting data into training and test sets
Data preparation: X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
Splitting data into training and test sets
Data preparation: X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
Splitting data into training and test sets
Data preparation: X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
Splitting data into training and test sets
Data preparation: X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
Splitting data into train/test sets
Data preparation: X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=21)
Splitting data into training and test sets
Data preparation: X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
Splitting data into train/test sets
Data preparation: X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1)
Creating a decision tree model with the gini criterion
Modeling: clf = DecisionTreeClassifier(criterion='gini')
Creating a decision tree
Modeling: clf = DecisionTreeClassifier()
Creating a decision tree
Modeling: clf = DecisionTreeClassifier()
Creating a decision tree regressor
Modeling: regressor = DecisionTreeRegressor()
Creating a StandardScaler model
Modeling: scaler = StandardScaler()
Creating and training a logistic regression model
Modeling: log_reg = LogisticRegression(random_state=0).fit(X_train_scaled, y_train)
Training the decision tree on training data
Modeling: clf = clf.fit(X_train, y_train)
Training the decision tree on training data
Modeling: clf = clf.fit(X_train, y_train)
Training the decision tree on training data
Modeling: clf = clf.fit(X_train, y_train)
Training the decision tree regressor
Modeling: regressor.fit(X_train, y_train)
Training the model for 50 epochs
Modeling: model.fit(X_train, y_train, epochs=50)
Training the model for 200 epochs
Modeling: model.fit(X_train, y_train, epochs=200)
Predicting values on test data
Modeling: y_pred = clf.predict(X_test)
Predicting values on test data
Modeling: y_pred = clf.predict(X_test)
Using the decision tree model to predict for 'osoba'
Modeling: clf.predict(osoba)
Predicting values on test data
Modeling: y_pred = clf.predict(X_test)
Predicting values on test data
Modeling: y_pred = regressor.predict(X_test)
Predicting values on training data
Modeling: log_reg.predict(X_train_scaled)
Predicting probabilities for test data
Modeling: log_reg.predict_proba(X_test_scaled)
Predicting output for a custom input vector
Modeling: vstup_q = np.array([[-4, 8]])
classifier.predict(vstup_q)
Using the model to predict on test data (X_test)
Modeling: predictions = model.predict(X_test)
Generating predictions for test data
Evaluation: y_pred = model.predict(X_test).round()
Generating predictions for test data
Evaluation: y_pred = model.predict(X_test).round()
Evaluating model accuracy
Evaluation: print("Accuracy:", metrics.accuracy_score(y_test, y_pred))
Creating a confusion matrix to analyze correct/incorrect predictions
Evaluation: confusion_matrix(y_test, y_pred, labels=[1, 0])
Evaluating model accuracy
Evaluation: print("Accuracy:", metrics.accuracy_score(y_test, y_pred))
Displaying the maximum depth of the trained decision tree
Evaluation: clf.get_depth()
Listing all parameters of the trained decision tree
Evaluation: clf.get_params()
Creating a confusion matrix to analyze correct/incorrect predictions
Evaluation: confusion_matrix(y_test, y_pred, labels=[1, 0])
Evaluating model accuracy
Evaluation: accuracy_score(y_test, y_pred)
Calculating precision, recall, f1-score and support
Evaluation: precision_recall_fscore_support(y_test, y_pred, labels=[1, 0])
Printing precision, recall, f1-score and support
Evaluation: p, r, f, s = precision_recall_fscore_support(y_test, y_pred)
cm = confusion_matrix(y_test, y_pred)
print('acc:  ', metrics.accuracy_score(y_test, y_pred))
print('prec: ', ((p[0]+p[1])/2), '(', p[0], ' / ', p[1], ')')
print('rec:  ', ((r[0]+r[1])/2), '(', r[0], ' / ', r[1], ')')
print('f1-sc:', ((f[0]+f[1])/2))
print(cm)
Creating a DataFrame for result comparison
Evaluation: d = pd.DataFrame({'Real Values': y_test, 'Predicted Values': y_pred})
Calculating squared residuals
Evaluation: d['sqr_res'] = pow((d['Real Values'] - d['Predicted Values']), 2)
Summing the squared residuals
Evaluation: d['sqr_res'].sum()
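For reference, the summed squared residuals relate directly to the mean squared error; a self-contained sketch with toy values (the names real/pred are illustrative):

import numpy as np
from sklearn.metrics import mean_squared_error

real = np.array([3.0, 5.0, 2.5])
pred = np.array([2.5, 5.0, 3.0])
sse = ((real - pred) ** 2).sum()      # sum of squared residuals, as above
mse = mean_squared_error(real, pred)  # the same quantity divided by n
assert np.isclose(sse / len(real), mse)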
Evaluating model accuracy on training data
Evaluation: log_reg.score(X_train_scaled, y_train)
Evaluating model accuracy on test data
Evaluation: log_reg.score(X_test_scaled, y_test)
Evaluating model accuracy
Evaluation: print("Accuracy:", metrics.accuracy_score(y_test, y_pred))
Displaying the first 6 predictions
Evaluation: preds[:6]
Displaying the first 6 actual values
Evaluation: y_test[:6]
Converting predictions to class labels
Evaluation: labels_predict = np.argmax(y_pred, axis=1)
labels_predict[:6]
Creating a confusion matrix for evaluation
Evaluation: confusion_matrix(labels_predict, np.argmax(y_test, axis=1))
Printing classifier accuracy
Evaluation: print("Accuracy: ", metrics.accuracy_score(labels_predict, np.argmax(y_test, axis=1)))
Visualizing the decision tree (plot_tree must be imported from sklearn.tree)
Deployment: from sklearn.tree import plot_tree
plt.figure(figsize=(40, 20))
plot_tree(regressor, feature_names=vstup.columns.tolist())
Displaying basic statistical values of the dataset
Data understanding: diabetes.describe()
Counting the value frequency of an attribute
Data understanding: Counter(diabetes.Outcome)
Displaying basic statistical values of the dataset
Data understanding: diabetes.describe()
Counting the value frequency of an attribute
Data understanding: Counter(diabetes.Outcome)
Counting the frequency of 'AHD' attribute values
Data understanding: Counter(df.AHD)
Checking for missing values
Data understanding: df.isnull().sum()
Analyzing the class distribution in the data
Data preparation: Counter(vyber.competitor)
Displaying the loaded text content
Data understanding: h_rights
Calculating the total character count
Data preparation: len(h_rights)
Calculating the number of unique words
Evaluation: len(set(h_rights.split()))
Finding the longest word in the text (using max_len rather than shadowing the built-in max)
Evaluation: max_len = 0
for w in slova:
    if len(w) > max_len:
        max_len = len(w)
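The same result can be had more idiomatically; a sketch, assuming slova still holds the token list from the cell above:

longest = max(slova, key=len)  # the longest token itself
max_len = len(longest)         # its length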
Displaying the tweet dataset structure
Data understanding: tweets.head()
Applying the word count function to the entire dataset
Data preparation: tweets['word_count'] = tweets.apply(lambda x: tweet_count(x), axis=1)
Calculating tweet character counts
Data preparation: tweets['char_count'] = tweets['tweet'].str.len()
Calculating the average word length in tweets
Evaluation: tweets['avg_len'] = (tweets['char_count'] - (tweets['word_count'] - 1)) / tweets['word_count']
Displaying the decision path taken for the 'osoba' prediction
Deployment: clf.decision_path(osoba).toarray()
Defining the decision boundary line for visualization (solving W[0]*x + W[1]*y + b = 0 for y)
Evaluation: def priamka(x):
    y = (W[0]*x + b) / (W[1] * (-1))
    return y
Printing the model structure
Modeling: model.summary()
Printing the model structure
Modeling: model.summary()
Visualizing actual vs. predicted values
Deployment: plt.scatter(X_test, y_test, color='red')
plt.scatter(X_test, y_pred, color='green')
plt.title('Decision Tree Regression')
plt.xlabel('Temperature')
plt.ylabel('Revenue')
plt.show()
Creating a grid for smoother visualization
Deployment: X_grid = np.arange(min(X), max(X), 0.01)
X_grid = X_grid.reshape((len(X_grid), 1))
Visualizing the regression curve over the grid
Deployment: plt.plot(X_grid, regressor.predict(X_grid), color='black')
plt.title('Decision Tree Regression')
plt.xlabel('Temperature')
plt.ylabel('Revenue')
plt.show()
Visualizing training data and the decision boundary
Evaluation: cm = plt.cm.RdBu
cm_bright = ListedColormap(['#FF0000', '#0000FF'])
ax = plt.subplot()
ax.set_title("Result")
for x, expected in training_data:
    if expected == 1:
        vzor = 'r'
    else:
        vzor = 'b'
    # print(x[0])
    ax.scatter(x[0], x[1], color=vzor)
plt.plot([100, 300], [priamka(100), priamka(300)])
plt.show()
Visualizing the decision boundary for the XOR problem
Evaluation: cm = plt.cm.RdBu
cm_bright = ListedColormap(['#FF0000', '#0000FF'])
ax = plt.subplot()
ax.set_title("Result")
for x, expected in training_data:
    if expected == 1:
        vzor = 'r'
    else:
        vzor = 'b'
    ax.scatter(x[0], x[1], color=vzor)
plt.plot([0, 8], [priamka(0), priamka(8)])
plt.show()
Visualizing the loss vs. epochs relationship
Evaluation: plt.plot(range(1, len(classifier.cost) + 1), classifier.cost)
plt.title("Adaline: learn-rate 0.001")
plt.xlabel('Epochs')
plt.ylabel('Cost (Sum-of-Squares)')
plt.show()
Visualizing pairwise variable relationships
Evaluation: sns.pairplot(data, hue='competitor')
Displaying the dataset
Data understanding: diabetes
Displaying the first 5 dataset rows for a quick overview
Data understanding: titanic.head()
Defining the activation function for the perceptron (a step function inspired by the biological neuron)
Modeling: def aktivacna_fn(x):
    if x >= 0:
        return 1
    else:
        return -1
Calculating the neuron output (weighted sum of inputs + bias)
Modeling: def neuron(X, W, b):
    return aktivacna_fn(np.dot(X, W) + b)
Initializing weights and bias with chosen starting values
Modeling: W = array([-30, 300])
b = -1230
eta = 0.01
print('current weights: ', W)
print('bias: ', b)
Training the perceptron using the delta rule (weight updates based on error)
Modeling: for i in range(0, 4):
    print('---')
    x, y = training_data[i]
    print('training data: ', x, ', expected: ', y)
    predikcia = neuron(x, W, b)
    print('prediction: ', predikcia)
    chyba = y - predikcia
    if (chyba != 0):
        print('weights need adjusting')
        W = W + (eta * chyba * x)
        b = b + (eta * chyba * 1)
        print('current weights: ', W)
        print('bias: ', b)
Predicting output for a custom input vector
Modeling: vektor = array([100, 10])
neuron(vektor, W, b)
Initializing weights and bias with random values (XOR problem)
Modeling: r1 = random.randint(-100, 100)
r2 = random.randint(-100, 100)
W = array([r1, r2])
b = random.randint(-100, 100)
eta = 0.5
print('current weights: ', W)
print('bias: ', b)
Training the perceptron in epochs (iterating through the training data until no weights change)
Modeling: uprava_vahy = True
epocha_id = 1
while uprava_vahy:
    print('epoch: ', epocha_id)
    epocha_id += 1
    uprava_vahy = False
    for i in range(0, 3):
        print('---')
        x, y = training_data[i]
        predikcia = neuron(x, W, b)
        chyba = y - predikcia
        if (chyba != 0):
            uprava_vahy = True
            W = W + (eta * chyba * x)
            b = b + (eta * chyba * 1)
            print('current weights: ', W, ', bias: ', b)
Defining the sum of squared errors function
Modeling: def sum_squared_errors(y, output_pred):
    errors = y - output_pred
    return (errors**2).sum() / 2.0
Calculating errors between expected and predicted values
Modeling: sum_squared_errors(mal_byt, bol)
Calculating the weighted sum of inputs (the neuron's internal potential)
Modeling: def vnutorny_potencial(X, weights):
    return np.dot(X, weights)
Defining the linear activation function for Adaline (identity function)
Modeling: def aktivacna_fn(x):
    return x
Generating initial weights from a normal distribution
Modeling: weights = random_gen.normal(loc=0.0, scale=0.01, size=biased_X.shape[1])
Initializing a list for storing errors and calculating predictions
Modeling: cost = []
learn_rate = 0.5
output_pred = aktivacna_fn(vnutorny_potencial(biased_X, weights))
Calculating errors between actual and predicted values
Modeling: errors = y - output_pred
Updating weights using gradient descent
Modeling: weights += (learn_rate * biased_X.T.dot(errors))
Displaying the updated model weights
Modeling: weights
Calculating loss using the sum of squared errors (two equivalent forms)
Evaluation: cost_i = (errors**2).sum() / 2.0
cost_i = sum_squared_errors(y, output_pred)
Training the Adaline model for 20 epochs
Modeling: for i in range(20):
    output_pred = aktivacna_fn(vnutorny_potencial(biased_X, weights))
    errors = y - output_pred
    weights += (learn_rate * biased_X.T.dot(errors))
    cost_i = (errors**2).sum() / 2.0
    cost.append(cost_i)
Implementing the Adaline algorithm with automatic data scaling
Modeling: class Adaline(object):
    def __init__(self, learn_rate=0.001, iterations=10000):
        self.learn_rate = learn_rate
        self.iterations = iterations

    def fit(self, X, y, biased_X=False, standardised_X=False):
        if not standardised_X:
            X = self._standardise_features(X)
        if not biased_X:
            X = self._add_bias(X)
        self._initialise_weights(X)
        self.cost = []
        for cycle in range(self.iterations):
            output_pred = self._activation(self._net_input(X))
            errors = y - output_pred
            self.weights += (self.learn_rate * X.T.dot(errors))
            cost = (errors**2).sum() / 2.0
            self.cost.append(cost)
        return self

    def _net_input(self, X):
        return np.dot(X, self.weights)

    def predict(self, X, biased_X=False):
        # Note: predict adds the bias column but does not re-apply the
        # feature standardisation done in fit, so inputs should be pre-scaled.
        if not biased_X:
            X = self._add_bias(X)
        return np.where(self._activation(self._net_input(X)) >= 0.0, 1, 0)

    def _add_bias(self, X):
        bias = np.ones((X.shape[0], 1))
        biased_X = np.hstack((bias, X))
        return biased_X

    def _initialise_weights(self, X):
        random_gen = np.random.RandomState(1)
        self.weights = random_gen.normal(loc=0.0, scale=0.01, size=X.shape[1])
        return self

    def _standardise_features(self, X):
        X_norm = (X - np.mean(X, axis=0)) / np.std(X, axis=0)
        return X_norm

    def _activation(self, X):
        return X
Creating and training the Adaline classifier
Modeling: classifier = Adaline(learn_rate=0.001, iterations=100)
a = classifier.fit(X, y)
Displaying the final trained model weights
Modeling: a.weights
Defining a sequential model with three layers
Modeling: model = Sequential()
model.add(Dense(48, input_shape=(6,), activation="sigmoid"))
model.add(Dense(6, activation="sigmoid"))
model.add(Dense(1))
Compiling the model with the Adam optimizer and MSE loss
Modeling: model.compile(optimizer="adam", loss="mse")
Defining a model for multi-class classification
Modeling: model = Sequential()
model.add(Dense(4, input_shape=(2,), activation="relu"))
model.add(Dense(4, activation="softmax"))
Compiling the model with categorical cross-entropy (the 4-way softmax output with one-hot targets calls for categorical, not binary, cross-entropy)
Modeling: model.compile(optimizer="adam", loss="categorical_crossentropy")
Defining a lambda function for simple addition
Modeling: x = lambda a: a + 100
Cleaning text (removing special characters/punctuation)
Data preparation: h_rights = h_rights.replace('\n', ' ')
h_rights = h_rights.replace("\ufeff", ' ')
h_rights = h_rights.replace(',', ' ')
h_rights = h_rights.replace('.', ' ')
Tokenizing the text into words
Data preparation: slova = h_rights.split()
Function for counting words in tweets
Modeling: def tweet_count(row):
    my_var = row['tweet']
    return len(my_var.split())
Basic regex match test
Modeling: re.match('abc', 'abcdefgh')
Detecting hashtags with a regex
Modeling: re.search('#[A-Za-z0-9]+', tweet)
Extracting all hashtags from the tweet
Modeling: [w for w in tweet.split() if re.search('#[A-Za-z0-9]+', w)]
Demonstrating regex findall for the 'b.+ing' pattern
Modeling: sentence1 = "In the beginning was the Word"
re.findall("b.+ing", sentence1)
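For reference, the greedy '.+' runs to the last 'ing' in the string, which here sits inside the single matching word:

import re

sentence1 = "In the beginning was the Word"
print(re.findall("b.+ing", sentence1))  # ['beginning']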
Validating emails with a simple regex
Modeling: sent = 'My email is jkapusta@ukf.sk and my colleague has mdrlik@ukf.sk . This is the bad email: jkkkapusta@u.k'
[w for w in sent.split(" ") if re.search("[a-z]+@[a-z.]+.[a-z]{2,3}$", w)]
Validating emails with a stricter regex (splitting on spaces, not on a quote character, and using a raw string for the pattern)
Evaluation: [w for w in sent.split(' ') if re.search(r"^[a-zA-Z0-9+-_.]{1,64}@[a-zA-Z0-9-]{1,255}\.[a-zA-Z0-9-.]{2,}$", w)]
Tokenizing the text into words
Data preparation: tokens = word_tokenize(text)  # renamed from 'array' to avoid shadowing numpy's array import
Normalizing the text to lowercase
Data preparation: smalym = text.lower()
Tokenizing the normalized text
Data preparation: word_tokenize(smalym)
Creating a word frequency distribution
Evaluation: v = Counter(word_tokenize(smalym))
Getting the top 5 frequent words
Evaluation: v.most_common(5)
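With NLTK's default tokenizer, punctuation is kept as separate tokens, so for the sample sentence this should yield [('the', 6), ('cat', 3), ('box', 3), ('.', 3), ('is', 2)], with the period counted like any other token.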
Extracting emails from a university website
Modeling: link = "http://www.tu.ff.ukf.sk/kontakty"
stranka = requests.get(link)
sent = stranka.text
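The snippet stops after downloading the page; a minimal sketch of the extraction step, reusing a tightened version of the email pattern from above (whether the URL still resolves, and which addresses it contains, is not guaranteed):

import re
import requests

stranka = requests.get("http://www.tu.ff.ukf.sk/kontakty")
pattern = r"[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}"
print(set(re.findall(pattern, stranka.text)))  # unique addresses found on the page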