update timetable
James Brusey committed Jun 4, 2023
1 parent a64713e commit 41104b0f7934d1ccde16edb1c5a6c74a186ae8f0
Showing 5 changed files with 177 additions and 110 deletions.
109 intro.org
@@ -3,112 +3,6 @@
#+author: Prof. James Brusey
#+options: toc:nil h:2
#+startup: beamer
* not exported :noexport:
#+tblname: times
| Slot | Timings |
|------+--------------|
| 1 | 09:00--10:30 |
| 2 | 10:45--12:00 |
| 3 | 13:00--14:30 |
| 4 | 14:45--16:00 |

#+tblname: timetable
| Day | Slot | Row | Title |
|-----+------+-----+----------------------------|
| 1 | 1 | 1 | Introduction to the course |
| 1 | 2 | 2 | Intro to RL |
| 1 | 3 | 3 | OpenAI Gym, Gymnasium |
| 1 | 4 | 4 | Lab: Frozen-Lake play |
| 2 | 1 | 5 | MDPs |
| 2 | 2 | 6 | Dynamic Programming |
| 2 | 3 | 7 | Lab: Solving Frozen-Lake |
| 2 | 4 | 8 | Monte Carlo Methods |
| 3 | 1 | 9 | Lab: Blackjack with MC |
| 3 | 2 | 10 | Function Approximation |
| 3 | 3 | 11 | DQN, SAC, PPO |
| 3 | 4 | 12 | Lab: Breakout |
| 4 | 1 | 13 | Lab: Demo day part 1 |
| 4 | 2 | 14 | Lab: Demo day part 2 |
| 4 | 3 | 15 | Demos |
| 4 | 4 | 16 | Wrap-up |
#+tblfm: $3=(@#-1)::$2=(($3-1) % 4)+1::$1=($3 - 1) \ 4 + 1

#+BEGIN_SRC ipython :var times=times timetable=timetable :results output raw drawer
import tabulate
import pandas as pd

times = pd.DataFrame(times, columns=["Slot", "Timings"])
timetable = pd.DataFrame(timetable, columns=["Day", "Slot", "Row", "Title"])


#timetable = timetable.assign(Title=lambda x: x.apply(fmt_i, axis=1))

timetable = timetable.drop(columns=['Row'])
timetable = timetable.join(times.set_index("Slot"), on="Slot")
timetable = timetable.pivot(index="Timings", columns="Day", values="Title")

timetable.columns = [f"Day {d}" for d in timetable.columns]

print("* Timetable")
print("** All days")
print(r"#+attr_latex: :align lp{1.7cm}p{1.7cm}p{1.7cm}p{1.7cm} :width 0.9\columnwidth :font \small")

print(tabulate.tabulate(
    # timetable.set_index(["Day", "Slot"]).unstack(),
    timetable,
    tablefmt="orgtbl", showindex=True, headers="keys"))

for i in range(1, 5):
    print(f"** Day {i}")
    print(tabulate.tabulate(
        timetable[[f'Day {i}']],
        tablefmt="orgtbl", showindex=True, headers="keys"))


#+END_SRC

#+RESULTS:
:results:
* Timetable
** All days
#+attr_latex: :align lp{1.7cm}p{1.7cm}p{1.7cm}p{1.7cm} :width 0.9\columnwidth :font \small
| Timings | Day 1 | Day 2 | Day 3 | Day 4 |
|--------------+----------------------------+--------------------------+------------------------+----------------------|
| 09:00--10:30 | Introduction to the course | MDPs | Lab: Blackjack with MC | Lab: Demo day part 1 |
| 10:45--12:00 | Intro to RL | Dynamic Programming | Function Approximation | Lab: Demo day part 2 |
| 13:00--14:30 | OpenAI Gym, Gymnasium | Lab: Solving Frozen-Lake | DQN, SAC, PPO | Demos |
| 14:45--16:00 | Lab: Frozen-Lake play | Monte Carlo Methods | Lab: Breakout | Wrap-up |
** Day 1
| Timings | Day 1 |
|--------------+----------------------------|
| 09:00--10:30 | Introduction to the course |
| 10:45--12:00 | Intro to RL |
| 13:00--14:30 | OpenAI Gym, Gymnasium |
| 14:45--16:00 | Lab: Frozen-Lake play |
** Day 2
| Timings | Day 2 |
|--------------+--------------------------|
| 09:00--10:30 | MDPs |
| 10:45--12:00 | Dynamic Programming |
| 13:00--14:30 | Lab: Solving Frozen-Lake |
| 14:45--16:00 | Monte Carlo Methods |
** Day 3
| Timings | Day 3 |
|--------------+------------------------|
| 09:00--10:30 | Lab: Blackjack with MC |
| 10:45--12:00 | Function Approximation |
| 13:00--14:30 | DQN, SAC, PPO |
| 14:45--16:00 | Lab: Breakout |
** Day 4
| Timings | Day 4 |
|--------------+----------------------|
| 09:00--10:30 | Lab: Demo day part 1 |
| 10:45--12:00 | Lab: Demo day part 2 |
| 13:00--14:30 | Demos |
| 14:45--16:00 | Wrap-up |
:end:

* Admin
** Course description
- This course aims to introduce the basic concepts of Reinforcement Learning and how they can be applied to real-world systems.
@@ -122,7 +16,6 @@ At the end of this course, you should be able to:
1. Understand how to formulate an MDP problem
2. Implement an existing RL algorithm in Python
3. Evaluate the performance of an application of RL
4. Apply RL in a robotic system involving actuators and sensors
** Introducing your tutor
*** col1
:PROPERTIES:
@@ -143,3 +36,5 @@ Professor James Brusey
[[file:figures/james_brusey_photo_1w1.2h.png]]


** Website
https://sites.google.com/coventry.ac.uk/practicalreinforcementlearning/home
73 mdp.org
@@ -8,6 +8,34 @@
** Agent-Environment Interface
[[file:figures/agent-env.pdf]]

$$
S_0, A_0, R_1, S_1, A_1, R_2, S_2, A_2, R_3, \ldots
$$
** Some terms
+ Each MDP comprises the tuple $\langle \mathcal{S}, \mathcal{A}, \mathcal{R}, \mathcal{P}, \gamma \rangle$
+ $\mathcal{S}$ is the set of states
+ $\mathcal{A}$ is the set of actions
+ $\mathcal{R}$ is the set of rewards
+ $\mathcal{P}$ is the transition model
+ $\gamma$ is a discount factor
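A minimal Python sketch of how this tuple could be held in code (the two-state problem, its names, and its probabilities are invented for illustration, not taken from the course labs):

#+BEGIN_SRC python
# Illustrative sketch of the MDP tuple <S, A, R, P, gamma>.
# The two-state problem below is made up purely for illustration.
from dataclasses import dataclass

@dataclass
class MDP:
    states: list       # S: finite set of states
    actions: list      # A: finite set of actions
    rewards: list      # R: set of possible rewards
    p: dict            # P: (s, a) -> list of (prob, next_state, reward)
    gamma: float       # discount factor

toy = MDP(
    states=["s0", "s1"],
    actions=["stay", "go"],
    rewards=[0.0, 1.0],
    p={
        ("s0", "stay"): [(1.0, "s0", 0.0)],
        ("s0", "go"):   [(0.9, "s1", 1.0), (0.1, "s0", 0.0)],
        ("s1", "stay"): [(1.0, "s1", 0.0)],
        ("s1", "go"):   [(1.0, "s1", 0.0)],
    },
    gamma=0.9,
)
#+END_SRC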
** What is a state?
+ You might represent a state with a single number or a vector of numbers
+ There are a finite number of states, so they can be enumerated
+ State is a /compact/ representation of all history
+ If you could do better by knowing the history, then the state does not have the /Markov property/
+ We will later come to infinite or continuous states
** What is an action?
+ There are finite actions (infinite or continuous actions later)
+ At each time step, some action must be taken, but that action can be a no-op
+ The effect of the action is determined by the transition model

** What is a transition model?
+ A transition model is a (stochastic) mapping between states, actions, subsequent states, and rewards,
$$
p(s', r | s, a)
$$
+ It represents how the environment "works" (see the sketch below)
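A sketch of sampling a next state and reward from such a tabular model (the tiny model below is hypothetical; Gymnasium's tabular environments such as FrozenLake expose a broadly similar structure, though the exact layout can differ by version):

#+BEGIN_SRC python
# Sketch: sampling (s', r) from a tabular model p(s', r | s, a).
# The two-state model is made up for illustration.
import random

p = {
    ("s0", "go"):   [(0.9, "s1", 1.0), (0.1, "s0", 0.0)],
    ("s0", "stay"): [(1.0, "s0", 0.0)],
    ("s1", "go"):   [(1.0, "s1", 0.0)],
    ("s1", "stay"): [(1.0, "s1", 0.0)],
}

def step(state, action):
    """Draw (next_state, reward) according to p(s', r | s, a)."""
    outcomes = p[(state, action)]
    weights = [prob for prob, _, _ in outcomes]
    _, s_next, reward = random.choices(outcomes, weights=weights)[0]
    return s_next, reward

print(step("s0", "go"))   # ('s1', 1.0) roughly 90% of the time
#+END_SRC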

* Goals and Rewards
** The reward hypothesis

@@ -16,7 +44,6 @@ That all of what we mean by goals and purposes can be well thought of as a maxim
---Michael Littman (S&B)
#+end_quote


* Rewards and Episodes

** Long term reward and Episodes
@@ -28,16 +55,56 @@ ending in the final reward at time $T$.

An *episode* is everything up to a final time step $T$.

Note that if $T$ were infinite, we would have the potential for infinite long-term reward.


** Discounted reward
It is often natural to regard a gain in the distant future as less valuable than a gain right now:
$$
G_t \doteq R_{t+1} + \gamma R_{t+2} + \gamma^2 R_{t+3} + \cdots
$$
for $\gamma<1$.

We can write this more compactly as
$$
G_t \doteq \sum_{0\leq k < \infty} \gamma^k R_{t + k + 1}
$$

Note that it is now not necessary to place a finite limit on the episode length.
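A short sketch of computing $G_t$ from a finite list of sampled rewards (the reward values and $\gamma$ below are arbitrary):

#+BEGIN_SRC python
# Sketch: G_t = sum_k gamma^k R_{t+k+1} for a short, arbitrary reward sequence.
gamma = 0.9
rewards = [1.0, 0.0, 0.0, 5.0]   # R_{t+1}, R_{t+2}, R_{t+3}, R_{t+4}

G = sum(gamma ** k * r for k, r in enumerate(rewards))
print(G)   # 1.0 + 0.9**3 * 5.0 = 4.645
#+END_SRC

Equivalently, the same value can be accumulated backwards using the recursion $G_t = R_{t+1} + \gamma G_{t+1}$.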

* Unified notation for episodic and continuing tasks
** Unified notation for episodic and continuing tasks

+ We can deal with the difference between episodic and continuing tasks using the concept of /absorbing/ states
+ Absorbing states yield zero reward and always transition back to themselves

$$
G_t \doteq \sum_{t+1\leq k \leq T} \gamma^{k-t-1} R_{k}
$$

including the possibility that $T=\infty$ or $\gamma = 1$ (but not both).

* Policies and value functions
** Policies
+ A policy represents how to act in each possible state

+ A policy is a distribution over actions given the state,
$$
\pi(a \mid s) \in [0, 1]
$$
For all $s\in \mathcal{S}$,
$$
\sum_a \pi(a | s) = 1
$$
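As a sketch only, a tabular policy can be stored as one distribution per state and checked for normalisation (states, actions, and probabilities below are invented):

#+BEGIN_SRC python
# Sketch: a stochastic tabular policy pi(a | s) as nested dictionaries.
# States, actions, and probabilities are invented for illustration.
import math
import random

pi = {
    "s0": {"stay": 0.2, "go": 0.8},
    "s1": {"stay": 1.0, "go": 0.0},
}

# Each per-state distribution should sum to 1 over actions.
for s, dist in pi.items():
    assert math.isclose(sum(dist.values()), 1.0)

# Acting under the policy: sample an action for state s0.
actions, weights = zip(*pi["s0"].items())
print(random.choices(actions, weights=weights)[0])
#+END_SRC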

** Value functions
+ A state value function $v_\pi(s)$ is the long-term value of being in state $s$, assuming that you follow policy $\pi$
$$
v_\pi(s) \doteq \mathbb{E}_\pi [G_t | S_t = s]
$$
$$
+ A state-action value function $q_\pi(s,a)$ is the long-term value of being in state $s$, taking action $a$, and then following $\pi$ from then on.
$$
q_\pi(s,a) \doteq \mathbb{E}_\pi [G_t | S_t = s, A_t=a]
$$
$$
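Both definitions can be read as averages of sampled returns; a rough sketch, using invented return samples:

#+BEGIN_SRC python
# Sketch: estimating v_pi(s) and q_pi(s, a) as empirical means of returns G_t,
# observed when starting in s (or in s taking a) and then following pi.
# The sampled returns below are invented numbers.
from statistics import mean

returns_from_s = [4.6, 5.1, 3.9, 4.8]                      # G_t samples with S_t = s
returns_from_s_a = {"go": [5.0, 4.9], "stay": [3.2, 3.5]}   # G_t samples with S_t = s, A_t = a

v_estimate = mean(returns_from_s)                           # approximates v_pi(s)
q_estimate = {a: mean(gs) for a, gs in returns_from_s_a.items()}   # approximates q_pi(s, a)

print(v_estimate, q_estimate)
#+END_SRC

This averaging idea is exactly what the Monte Carlo methods slot later in the course builds on.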

* Optimal policies and optimal value functions
** Optimal policies and value functions
@@ -0,0 +1,105 @@
* not exported :noexport:

#+tblname: daydate
| Day | Date |
|-----+-----------|
| 1 | 05/6/2023 |
| 2 | 06/6/2023 |
| 3 | 07/6/2023 |
| 4 | 09/6/2023 |

#+tblname: times
| Slot | Timings |
|------+--------------|
| 1 | 10:00--11:15 |
| 2 | 11:30--13:00 |
| 3 | 14:00--15:15 |
| 4 | 15:30--17:00 |

#+tblname: timetable
| Day | Slot | Row | Title |
|-----+------+-----+----------------------------|
| 1 | 1 | 1 | Introduction to the course |
| 1 | 2 | 2 | Intro to RL |
| 1 | 3 | 3 | OpenAI Gym, Gymnasium |
| 1 | 4 | 4 | Lab: Frozen-Lake play |
| 2 | 1 | 5 | MDPs |
| 2 | 2 | 6 | Dynamic Programming |
| 2 | 3 | 7 | Lab: Solving Frozen-Lake |
| 2 | 4 | 8 | Monte Carlo Methods |
| 3 | 1 | 9 | Lab: Blackjack with MC |
| 3 | 2 | 10 | Function Approximation |
| 3 | 3 | 11 | DQN, SAC, PPO |
| 3 | 4 | 12 | Lab: Breakout |
| 4 | 1 | 13 | Lab: Demo day |
| 4 | 2 | 14 | Demos and wrap up |
#+tblfm: $3=(@#-1)::$2=(($3-1) % 4)+1::$1=($3 - 1) \ 4 + 1
#+BEGIN_SRC ipython :session ical :var times=times timetable=timetable daydate=daydate
# ensure you set pyvenv-workon to ical first
import pandas as pd
from icalendar import Calendar, Event
import datetime
import re
mre = re.compile(r"([0-9]*:[0-9]*)--([0-9]*:[0-9]*)")
timeformat = "%H:%M"

def convert_time_range(s):
    m = mre.match(s)
    assert m
    return (datetime.datetime.strptime(m[1], timeformat).time(),
            datetime.datetime.strptime(m[2], timeformat).time())

slots = (pd.DataFrame(timetable, columns=["day", "slot", "row", "title"])
         .assign(day=lambda x: x.day.astype(int))
         )

times = (pd.DataFrame(times, columns=["slot", "times"])
         .assign(slot=lambda x: x.slot.astype(int))
         .assign(time_range=lambda x: x.times.apply(convert_time_range))
         .drop(['times'], axis=1)
         .set_index("slot")
         )

daydate = (pd.DataFrame(daydate, columns=["day", "date"])
           .assign(day=lambda x: x.day.astype(int),
                   date=lambda x: x.date.apply(lambda y: datetime.datetime.strptime(y, "%d/%m/%Y")))
           .set_index("day")
           )



slots = (slots.join(times, on='slot')
         .join(daydate, on="day")
         )

cal = Calendar()
cal.add('prodid', '-//My calendar product//mxm.dk//')
cal.add('version', '2.0')

for ix, row in slots.iterrows():
    tz = datetime.timezone(datetime.timedelta(hours=5))
    dstart = datetime.datetime.combine(row.date, row.time_range[0], tzinfo=tz)
    dend = datetime.datetime.combine(row.date, row.time_range[1], tzinfo=tz)

    event = Event()
    event.add('summary', row.title)
    event.add('location', "")
    event.add('dtstart', dstart)
    event.add('dtend', dend)
    event.add('dtstamp', datetime.datetime.now())
    cal.add_component(event)


with open("rl-course.ics", "wb") as f:
    f.write(cal.to_ical())

from os import system

system("rsync rl-course.ics cogentee:public_html/rl-course.ics")

#+END_SRC

#+RESULTS:
: # Out[12]:
: : 0
