Additional operations and proofs about IEEE-754 binary
floating-point numbers, on top of the Flocq library.
Require Import Psatz.
Require Import Bool.
Require Import Eqdep_dec.
Require Import Fcore.
Require Import Fcore_digits.
Require Import Fcalc_digits.
Require Import Fcalc_ops.
Require Import Fcalc_round.
Require Import Fcalc_bracket.
Require Import Fprop_Sterbenz.
Require Import Fappli_IEEE.
Require Import Fappli_rnd_odd.
Local Open Scope Z_scope.
Section Extra_ops.
prec is the number of bits of the mantissa including the implicit one.
emax is the exponent of the infinities.
Typically p=24 and emax = 128 in single precision.
Variable prec emax :
Z.
Context (
prec_gt_0_ :
Prec_gt_0 prec).
Let emin := (3 -
emax -
prec)%
Z.
Let fexp :=
FLT_exp emin prec.
Hypothesis Hmax : (
prec <
emax)%
Z.
Let binary_float :=
binary_float prec emax.
Remarks on is_finite
Remark is_finite_not_is_nan:
forall (
f:
binary_float),
is_finite _ _ f =
true ->
is_nan _ _ f =
false.
Proof.
destruct f; reflexivity || discriminate.
Qed.
Remark is_finite_strict_finite:
forall (
f:
binary_float),
is_finite_strict _ _ f =
true ->
is_finite _ _ f =
true.
Proof.
destruct f; reflexivity || discriminate.
Qed.
Digression on FP numbers that cannot be -0.0.
Definition is_finite_pos0 (
f:
binary_float) :
bool :=
match f with
|
B754_zero _ _ s =>
negb s
|
B754_infinity _ _ _ =>
false
|
B754_nan _ _ _ _ =>
false
|
B754_finite _ _ _ _ _ _ =>
true
end.
Lemma Bsign_pos0:
forall x,
is_finite_pos0 x =
true ->
Bsign _ _ x =
Rlt_bool (
B2R _ _ x) 0%
R.
Proof.
Theorem B2R_inj_pos0:
forall x y,
is_finite_pos0 x =
true ->
is_finite_pos0 y =
true ->
B2R _ _ x =
B2R _ _ y ->
x =
y.
Proof.
intros.
apply B2R_Bsign_inj.
destruct x;
reflexivity||
discriminate.
destruct y;
reflexivity||
discriminate.
auto.
rewrite !
Bsign_pos0 by auto.
rewrite H1;
auto.
Qed.
Decidable equality
Definition Beq_dec:
forall (
f1 f2:
binary_float), {
f1 =
f2} + {
f1 <>
f2}.
Proof.
assert (
UIP_bool:
forall (
b1 b2:
bool) (
e e':
b1 =
b2),
e =
e').
{
intros.
apply UIP_dec.
decide equality. }
Ltac try_not_eq :=
try solve [
right;
congruence].
destruct f1 as [| |? []|],
f2 as [| |? []|];
try destruct b;
try destruct b0;
try solve [
left;
auto];
try_not_eq.
destruct (
Pos.eq_dec x x0);
try_not_eq;
subst;
left;
f_equal;
f_equal;
apply UIP_bool.
destruct (
Pos.eq_dec x x0);
try_not_eq;
subst;
left;
f_equal;
f_equal;
apply UIP_bool.
destruct (
Pos.eq_dec m m0);
try_not_eq;
destruct (
Z.eq_dec e e1);
try solve [
right;
intro H;
inversion H;
congruence];
subst;
left;
f_equal;
apply UIP_bool.
destruct (
Pos.eq_dec m m0);
try_not_eq;
destruct (
Z.eq_dec e e1);
try solve [
right;
intro H;
inversion H;
congruence];
subst;
left;
f_equal;
apply UIP_bool.
Defined.
Conversion from an integer to a FP number
Integers that can be represented exactly as FP numbers.
Definition integer_representable (
n:
Z):
Prop :=
Z.abs n <= 2^
emax - 2^(
emax -
prec) /\
generic_format radix2 fexp (
Z2R n).
Let int_upper_bound_eq: 2^
emax - 2^(
emax -
prec) = (2^
prec - 1) * 2^(
emax -
prec).
Proof.
red in prec_gt_0_.
ring_simplify.
rewrite <- (
Zpower_plus radix2)
by omega.
f_equal.
f_equal.
omega.
Qed.
Lemma integer_representable_n2p:
forall n p,
-2^
prec <
n < 2^
prec -> 0 <=
p ->
p <=
emax -
prec ->
integer_representable (
n * 2^
p).
Proof.
Lemma integer_representable_2p:
forall p,
0 <=
p <=
emax - 1 ->
integer_representable (2^
p).
Proof.
Lemma integer_representable_opp:
forall n,
integer_representable n ->
integer_representable (-
n).
Proof.
Lemma integer_representable_n2p_wide:
forall n p,
-2^
prec <=
n <= 2^
prec -> 0 <=
p ->
p <
emax -
prec ->
integer_representable (
n * 2^
p).
Proof.
Lemma integer_representable_n:
forall n, -2^
prec <=
n <= 2^
prec ->
integer_representable n.
Proof.
Lemma round_int_no_overflow:
forall n,
Z.abs n <= 2^
emax - 2^(
emax-
prec) ->
(
Rabs (
round radix2 fexp (
round_mode mode_NE) (
Z2R n)) <
bpow radix2 emax)%
R.
Proof.
Conversion from an integer. Round to nearest.
Definition BofZ (
n:
Z) :
binary_float :=
binary_normalize prec emax prec_gt_0_ Hmax mode_NE n 0
false.
Theorem BofZ_correct:
forall n,
if Rlt_bool (
Rabs (
round radix2 fexp (
round_mode mode_NE) (
Z2R n))) (
bpow radix2 emax)
then
B2R prec emax (
BofZ n) =
round radix2 fexp (
round_mode mode_NE) (
Z2R n) /\
is_finite _ _ (
BofZ n) =
true /\
Bsign prec emax (
BofZ n) =
Z.ltb n 0
else
B2FF prec emax (
BofZ n) =
binary_overflow prec emax mode_NE (
Z.ltb n 0).
Proof.
Theorem BofZ_finite:
forall n,
Z.abs n <= 2^
emax - 2^(
emax-
prec) ->
B2R _ _ (
BofZ n) =
round radix2 fexp (
round_mode mode_NE) (
Z2R n)
/\
is_finite _ _ (
BofZ n) =
true
/\
Bsign _ _ (
BofZ n) =
Z.ltb n 0%
Z.
Proof.
Theorem BofZ_representable:
forall n,
integer_representable n ->
B2R _ _ (
BofZ n) =
Z2R n
/\
is_finite _ _ (
BofZ n) =
true
/\
Bsign _ _ (
BofZ n) = (
n <? 0).
Proof.
Theorem BofZ_exact:
forall n,
-2^
prec <=
n <= 2^
prec ->
B2R _ _ (
BofZ n) =
Z2R n
/\
is_finite _ _ (
BofZ n) =
true
/\
Bsign _ _ (
BofZ n) =
Z.ltb n 0%
Z.
Proof.
Lemma BofZ_finite_pos0:
forall n,
Z.abs n <= 2^
emax - 2^(
emax-
prec) ->
is_finite_pos0 (
BofZ n) =
true.
Proof.
Lemma BofZ_finite_equal:
forall x y,
Z.abs x <= 2^
emax - 2^(
emax-
prec) ->
Z.abs y <= 2^
emax - 2^(
emax-
prec) ->
B2R _ _ (
BofZ x) =
B2R _ _ (
BofZ y) ->
BofZ x =
BofZ y.
Proof.
Commutation properties with addition, subtraction, multiplication.
Theorem BofZ_plus:
forall nan p q,
integer_representable p ->
integer_representable q ->
Bplus _ _ _ Hmax nan mode_NE (
BofZ p) (
BofZ q) =
BofZ (
p +
q).
Proof.
Theorem BofZ_minus:
forall nan p q,
integer_representable p ->
integer_representable q ->
Bminus _ _ _ Hmax nan mode_NE (
BofZ p) (
BofZ q) =
BofZ (
p -
q).
Proof.
Theorem BofZ_mult:
forall nan p q,
integer_representable p ->
integer_representable q ->
0 <
q ->
Bmult _ _ _ Hmax nan mode_NE (
BofZ p) (
BofZ q) =
BofZ (
p *
q).
Proof.
Theorem BofZ_mult_2p:
forall nan x p,
Z.abs x <= 2^
emax - 2^(
emax-
prec) ->
2^
prec <=
Z.abs x ->
0 <=
p <=
emax - 1 ->
Bmult _ _ _ Hmax nan mode_NE (
BofZ x) (
BofZ (2^
p)) =
BofZ (
x * 2^
p).
Proof.
intros.
destruct (
Z.eq_dec x 0).
-
subst x.
apply BofZ_mult.
apply integer_representable_n.
generalize (
Zpower_ge_0 radix2 prec).
simpl;
omega.
apply integer_representable_2p.
auto.
apply (
Zpower_gt_0 radix2).
omega.
-
assert (
Z2R x <> 0%
R)
by (
apply (
Z2R_neq _ _ n)).
destruct (
BofZ_finite x H)
as (
A &
B &
C).
destruct (
BofZ_representable (2^
p))
as (
D &
E &
F).
apply integer_representable_2p.
auto.
assert (
canonic_exp radix2 fexp (
Z2R (
x * 2^
p)) =
canonic_exp radix2 fexp (
Z2R x) +
p).
{
unfold canonic_exp,
fexp.
rewrite Z2R_mult.
change (2^
p)
with (
radix2^
p).
rewrite Z2R_Zpower by omega.
rewrite ln_beta_mult_bpow by auto.
assert (
prec + 1 <=
ln_beta radix2 (
Z2R x)).
{
rewrite <- (
ln_beta_abs radix2 (
Z2R x)).
rewrite <- (
ln_beta_bpow radix2 prec).
apply ln_beta_le.
apply bpow_gt_0.
rewrite <-
Z2R_Zpower by (
red in prec_gt_0_;
omega).
rewrite <-
Z2R_abs.
apply Z2R_le;
auto. }
unfold FLT_exp.
unfold emin;
red in prec_gt_0_;
zify;
omega.
}
assert (
forall m,
round radix2 fexp m (
Z2R x) *
Z2R (2^
p) =
round radix2 fexp m (
Z2R (
x * 2^
p)))%
R.
{
intros.
unfold round,
scaled_mantissa.
rewrite H3.
rewrite Z2R_mult.
rewrite Z.opp_add_distr.
rewrite bpow_plus.
set (
a :=
Z2R x);
set (
b :=
bpow radix2 (-
canonic_exp radix2 fexp a)).
replace (
a *
Z2R (2^
p) * (
b *
bpow radix2 (-
p)))%
R with (
a *
b)%
R.
unfold F2R;
simpl.
rewrite Rmult_assoc.
f_equal.
rewrite bpow_plus.
f_equal.
apply (
Z2R_Zpower radix2).
omega.
transitivity ((
a *
b) * (
Z2R (2^
p) *
bpow radix2 (-
p)))%
R.
rewrite (
Z2R_Zpower radix2).
rewrite <-
bpow_plus.
replace (
p + -
p)
with 0
by omega.
change (
bpow radix2 0)
with 1%
R.
ring.
omega.
ring.
}
assert (
forall m x,
round radix2 fexp (
round_mode m) (
round radix2 fexp (
round_mode m)
x) =
round radix2 fexp (
round_mode m)
x).
{
intros.
apply round_generic.
apply valid_rnd_round_mode.
apply generic_format_round.
apply fexp_correct;
auto.
apply valid_rnd_round_mode.
}
assert (
xorb (
x <? 0) (2^
p <? 0) = (
x * 2^
p <? 0)).
{
assert (0 < 2^
p)
by (
apply (
Zpower_gt_0 radix2);
omega).
rewrite (
Zlt_bool_false (2^
p))
by omega.
rewrite xorb_false_r.
symmetry.
generalize (
Zlt_bool_spec x 0);
intros SPEC;
inversion SPEC.
apply Zlt_bool_true.
apply Z.mul_neg_pos;
auto.
apply Zlt_bool_false.
apply Z.mul_nonneg_nonneg;
omega.
}
generalize (
Bmult_correct _ _ _ Hmax nan mode_NE (
BofZ x) (
BofZ (2^
p)))
(
BofZ_correct (
x * 2^
p)).
fold emin;
fold fexp.
rewrite A,
B,
C,
D,
E,
F,
H4,
H5.
destruct Rlt_bool.
+
intros (
P &
Q &
R) (
U &
V &
W).
apply B2R_Bsign_inj;
auto.
rewrite P,
U.
auto.
rewrite R,
W.
auto.
apply is_finite_not_is_nan;
auto.
+
intros P U.
apply B2FF_inj.
rewrite P,
U.
f_equal;
auto.
Qed.
Rounding to odd the argument of BofZ.
Lemma round_odd_flt:
forall prec'
emin'
x choice,
prec > 1 ->
prec' > 1 ->
prec' >=
prec + 2 ->
emin' <=
emin - 2 ->
round radix2 fexp (
Znearest choice) (
round radix2 (
FLT_exp emin'
prec')
Zrnd_odd x) =
round radix2 fexp (
Znearest choice)
x.
Proof.
Corollary round_odd_fix:
forall x p choice,
prec > 1 ->
0 <=
p ->
(
bpow radix2 (
prec +
p + 1) <=
Rabs x)%
R ->
round radix2 fexp (
Znearest choice) (
round radix2 (
FIX_exp p)
Zrnd_odd x) =
round radix2 fexp (
Znearest choice)
x.
Proof.
Definition int_round_odd (
x:
Z) (
p:
Z) :=
(
if Z.eqb (
x mod 2^
p) 0 ||
Z.odd (
x / 2^
p)
then x / 2^
p else x / 2^
p + 1) * 2^
p.
Lemma Zrnd_odd_int:
forall n p, 0 <=
p ->
Zrnd_odd (
Z2R n *
bpow radix2 (-
p)) * 2^
p =
int_round_odd n p.
Proof.
Lemma int_round_odd_le:
forall p x y, 0 <=
p ->
x <=
y ->
int_round_odd x p <=
int_round_odd y p.
Proof.
Lemma int_round_odd_exact:
forall p x, 0 <=
p ->
(2^
p |
x) ->
int_round_odd x p =
x.
Proof.
Theorem BofZ_round_odd:
forall x p,
prec > 1 ->
Z.abs x <= 2^
emax - 2^(
emax-
prec) ->
0 <=
p <=
emax -
prec ->
2^(
prec +
p + 1) <=
Z.abs x ->
BofZ x =
BofZ (
int_round_odd x p).
Proof.
Lemma int_round_odd_shifts:
forall x p, 0 <=
p ->
int_round_odd x p =
Z.shiftl (
if Z.eqb (
x mod 2^
p) 0
then Z.shiftr x p else Z.lor (
Z.shiftr x p) 1)
p.
Proof.
intros.
unfold int_round_odd.
rewrite Z.shiftl_mul_pow2 by auto.
f_equal.
rewrite Z.shiftr_div_pow2 by auto.
destruct (
x mod 2^
p =? 0)
eqn:
E.
auto.
assert (
forall n, (
if Z.odd n then n else n + 1) =
Z.lor n 1).
{
destruct n;
simpl;
auto.
destruct p0;
auto.
destruct p0;
auto.
induction p0;
auto. }
simpl.
apply H0.
Qed.
Lemma int_round_odd_bits:
forall x y p, 0 <=
p ->
(
forall i, 0 <=
i <
p ->
Z.testbit y i =
false) ->
Z.testbit y p = (
if Z.eqb (
x mod 2^
p) 0
then Z.testbit x p else true) ->
(
forall i,
p <
i ->
Z.testbit y i =
Z.testbit x i) ->
int_round_odd x p =
y.
Proof.
Conversion from a FP number to an integer
Always rounds toward zero.
Definition ZofB (
f:
binary_float):
option Z :=
match f with
|
B754_finite _ _ s m (
Zpos e)
_ =>
Some (
cond_Zopp s (
Zpos m) *
Z.pow_pos radix2 e)%
Z
|
B754_finite _ _ s m 0
_ =>
Some (
cond_Zopp s (
Zpos m))
|
B754_finite _ _ s m (
Zneg e)
_ =>
Some (
cond_Zopp s (
Zpos m /
Z.pow_pos radix2 e))%
Z
|
B754_zero _ _ _ =>
Some 0%
Z
|
_ =>
None
end.
Theorem ZofB_correct:
forall f,
ZofB f =
if is_finite _ _ f then Some (
Ztrunc (
B2R _ _ f))
else None.
Proof.
Interval properties.
Remark Ztrunc_range_pos:
forall x, 0 <
Ztrunc x -> (
Z2R (
Ztrunc x) <=
x <
Z2R (
Ztrunc x + 1)%
Z)%
R.
Proof.
Remark Ztrunc_range_zero:
forall x,
Ztrunc x = 0 -> (-1 <
x < 1)%
R.
Proof.
Theorem ZofB_range_pos:
forall f n,
ZofB f =
Some n -> 0 <
n -> (
Z2R n <=
B2R _ _ f <
Z2R (
n + 1)%
Z)%
R.
Proof.
Theorem ZofB_range_neg:
forall f n,
ZofB f =
Some n ->
n < 0 -> (
Z2R (
n - 1)%
Z <
B2R _ _ f <=
Z2R n)%
R.
Proof.
Theorem ZofB_range_zero:
forall f,
ZofB f =
Some 0 -> (-1 <
B2R _ _ f < 1)%
R.
Proof.
Theorem ZofB_range_nonneg:
forall f n,
ZofB f =
Some n -> 0 <=
n -> (-1 <
B2R _ _ f <
Z2R (
n + 1)%
Z)%
R.
Proof.
For representable integers, ZofB is left inverse of BofZ.
Theorem ZofBofZ_exact:
forall n,
integer_representable n ->
ZofB (
BofZ n) =
Some n.
Proof.
Compatibility with subtraction
Remark Zfloor_minus:
forall x n,
Zfloor (
x -
Z2R n) =
Zfloor x -
n.
Proof.
Theorem ZofB_minus:
forall minus_nan m f p q,
ZofB f =
Some p -> 0 <=
p < 2*
q ->
q <= 2^
prec -> (
Z2R q <=
B2R _ _ f)%
R ->
ZofB (
Bminus _ _ _ Hmax minus_nan m f (
BofZ q)) =
Some (
p -
q).
Proof.
A variant of ZofB that bounds the range of representable integers.
Definition ZofB_range (
f:
binary_float) (
zmin zmax:
Z):
option Z :=
match ZofB f with
|
None =>
None
|
Some z =>
if Z.leb zmin z &&
Z.leb z zmax then Some z else None
end.
Theorem ZofB_range_correct:
forall f min max,
let n :=
Ztrunc (
B2R _ _ f)
in
ZofB_range f min max =
if is_finite _ _ f &&
Z.leb min n &&
Z.leb n max then Some n else None.
Proof.
Lemma ZofB_range_inversion:
forall f min max n,
ZofB_range f min max =
Some n ->
min <=
n /\
n <=
max /\
ZofB f =
Some n.
Proof.
Theorem ZofB_range_minus:
forall minus_nan m f p q,
ZofB_range f 0 (2 *
q - 1) =
Some p ->
q <= 2^
prec -> (
Z2R q <=
B2R _ _ f)%
R ->
ZofB_range (
Bminus _ _ _ Hmax minus_nan m f (
BofZ q)) (-
q) (
q - 1) =
Some (
p -
q).
Proof.
Algebraic identities
Commutativity of addition and multiplication
Theorem Bplus_commut:
forall plus_nan mode (
x y:
binary_float),
plus_nan x y =
plus_nan y x ->
Bplus _ _ _ Hmax plus_nan mode x y =
Bplus _ _ _ Hmax plus_nan mode y x.
Proof.
intros until y;
intros NAN.
pose proof (
Bplus_correct _ _ _ Hmax plus_nan mode x y).
pose proof (
Bplus_correct _ _ _ Hmax plus_nan mode y x).
unfold Bplus in *;
destruct x;
destruct y;
auto.
-
rewrite (
eqb_sym b0 b).
destruct (
eqb b b0)
eqn:
EQB;
auto.
f_equal;
apply eqb_prop;
auto.
-
rewrite NAN;
auto.
-
rewrite (
eqb_sym b0 b).
destruct (
eqb b b0)
eqn:
EQB.
f_equal;
apply eqb_prop;
auto.
rewrite NAN;
auto.
-
rewrite NAN;
auto.
-
rewrite NAN;
auto.
-
rewrite NAN;
auto.
-
rewrite NAN;
auto.
-
rewrite NAN;
auto.
-
rewrite NAN;
auto.
-
generalize (
H (
eq_refl _) (
eq_refl _));
clear H.
generalize (
H0 (
eq_refl _) (
eq_refl _));
clear H0.
fold emin.
fold fexp.
set (
x :=
B754_finite prec emax b0 m0 e1 e2).
set (
rx :=
B2R _ _ x).
set (
y :=
B754_finite prec emax b m e e0).
set (
ry :=
B2R _ _ y).
rewrite (
Rplus_comm ry rx).
destruct Rlt_bool.
+
intros (
A1 &
A2 &
A3) (
B1 &
B2 &
B3).
apply B2R_Bsign_inj;
auto.
rewrite <-
B1 in A1.
auto.
rewrite Z.add_comm.
rewrite Z.min_comm.
auto.
+
intros (
A1 &
A2) (
B1 &
B2).
apply B2FF_inj.
rewrite B2 in B1.
rewrite <-
B1 in A1.
auto.
Qed.
Theorem Bmult_commut:
forall mult_nan mode (
x y:
binary_float),
mult_nan x y =
mult_nan y x ->
Bmult _ _ _ Hmax mult_nan mode x y =
Bmult _ _ _ Hmax mult_nan mode y x.
Proof.
Multiplication by 2 is diagonal addition.
Theorem Bmult2_Bplus:
forall plus_nan mult_nan mode (
f:
binary_float),
(
forall (
x y:
binary_float),
is_nan _ _ x =
true ->
is_finite _ _ y =
true ->
plus_nan x x =
mult_nan x y) ->
Bplus _ _ _ Hmax plus_nan mode f f =
Bmult _ _ _ Hmax mult_nan mode f (
BofZ 2%
Z).
Proof.
Divisions that can be turned into multiplications by an inverse
Definition Bexact_inverse_mantissa :=
Z.iter (
prec - 1)
xO xH.
Remark Bexact_inverse_mantissa_value:
Zpos Bexact_inverse_mantissa = 2 ^ (
prec - 1).
Proof.
Remark Bexact_inverse_mantissa_digits2_pos:
Z.pos (
digits2_pos Bexact_inverse_mantissa) =
prec.
Proof.
Remark bounded_Bexact_inverse:
forall e,
emin <=
e <=
emax -
prec <->
bounded prec emax Bexact_inverse_mantissa e =
true.
Proof.
Program Definition Bexact_inverse (
f:
binary_float) :
option binary_float :=
match f with
|
B754_finite _ _ s m e B =>
if Pos.eq_dec m Bexact_inverse_mantissa then
let e' := -
e - (
prec - 1) * 2
in
if Z_le_dec emin e'
then
if Z_le_dec e'
emax then
Some(
B754_finite _ _ s m e'
_)
else None else None else None
|
_ =>
None
end.
Next Obligation.
Lemma Bexact_inverse_correct:
forall f f',
Bexact_inverse f =
Some f' ->
is_finite_strict _ _ f =
true
/\
is_finite_strict _ _ f' =
true
/\
B2R _ _ f' = (/
B2R _ _ f)%
R
/\
B2R _ _ f <> 0%
R
/\
Bsign _ _ f' =
Bsign _ _ f.
Proof with
Theorem Bdiv_mult_inverse:
forall div_nan mult_nan mode x y z,
(
forall (
x y z:
binary_float),
is_nan _ _ x =
true ->
is_finite _ _ y =
true ->
is_finite _ _ z =
true ->
div_nan x y =
mult_nan x z) ->
Bexact_inverse y =
Some z ->
Bdiv _ _ _ Hmax div_nan mode x y =
Bmult _ _ _ Hmax mult_nan mode x z.
Proof.
Conversion from scientific notation
Russian peasant exponentiation
Fixpoint pos_pow (
x y:
positive) :
positive :=
match y with
|
xH =>
x
|
xO y =>
Pos.square (
pos_pow x y)
|
xI y =>
Pos.mul x (
Pos.square (
pos_pow x y))
end.
Lemma pos_pow_spec:
forall x y,
Z.pos (
pos_pow x y) =
Z.pos x ^
Z.pos y.
Proof.
Given a base base, a mantissa m and an exponent e, the following function
computes the FP number closest to m * base ^ e, using round to odd, ties break to even.
The algorithm is naive, computing base ^ |e| exactly before doing a multiplication or
division with m. However, we treat specially very large or very small values of e,
when the result is known to be +infinity or 0.0 respectively.
Definition Bparse (
base:
positive) (
m:
positive) (
e:
Z):
binary_float :=
match e with
|
Z0 =>
BofZ (
Zpos m)
|
Zpos p =>
if e *
Z.log2 (
Zpos base) <?
emax
then BofZ (
Zpos m *
Zpos (
pos_pow base p))
else B754_infinity _ _ false
|
Zneg p =>
if e *
Z.log2 (
Zpos base) +
Z.log2_up (
Zpos m) <?
emin
then B754_zero _ _ false
else FF2B prec emax _ (
proj1 (
Bdiv_correct_aux prec emax prec_gt_0_ Hmax mode_NE
false m Z0 false (
pos_pow base p)
Z0))
end.
Properties of Z.log2 and Z.log2_up.
Lemma Zpower_log:
forall (
base:
radix)
n,
0 <
n ->
2 ^ (
n *
Z.log2 base) <=
base ^
n <= 2 ^ (
n *
Z.log2_up base).
Proof.
Lemma bpow_log_pos:
forall (
base:
radix)
n,
0 <
n ->
(
bpow radix2 (
n *
Z.log2 base)%
Z <=
bpow base n)%
R.
Proof.
Lemma bpow_log_neg:
forall (
base:
radix)
n,
n < 0 ->
(
bpow base n <=
bpow radix2 (
n *
Z.log2 base)%
Z)%
R.
Proof.
Overflow and underflow conditions.
Lemma round_integer_overflow:
forall (
base:
radix)
e m,
0 <
e ->
emax <=
e *
Z.log2 base ->
(
bpow radix2 emax <=
round radix2 fexp (
round_mode mode_NE) (
Z2R (
Zpos m) *
bpow base e))%
R.
Proof.
Lemma round_NE_underflows:
forall x,
(0 <=
x <=
bpow radix2 (
emin - 1))%
R ->
round radix2 fexp (
round_mode mode_NE)
x = 0%
R.
Proof.
Lemma round_integer_underflow:
forall (
base:
radix)
e m,
e < 0 ->
e *
Z.log2 base +
Z.log2_up (
Zpos m) <
emin ->
round radix2 fexp (
round_mode mode_NE) (
Z2R (
Zpos m) *
bpow base e) = 0%
R.
Proof.
Correctness of Bparse
Theorem Bparse_correct:
forall b m e (
BASE: 2 <=
Zpos b),
let base := {|
radix_val :=
Zpos b;
radix_prop :=
Zle_imp_le_bool _ _ BASE |}
in
let r :=
round radix2 fexp (
round_mode mode_NE) (
Z2R (
Zpos m) *
bpow base e)
in
if Rlt_bool (
Rabs r) (
bpow radix2 emax)
then
B2R _ _ (
Bparse b m e) =
r
/\
is_finite _ _ (
Bparse b m e) =
true
/\
Bsign _ _ (
Bparse b m e) =
false
else
B2FF _ _ (
Bparse b m e) =
F754_infinity false.
Proof.
intros.
assert (
A:
forall x, @
F2R radix2 {|
Fnum :=
x;
Fexp := 0 |} =
Z2R x).
{
intros.
unfold F2R,
Fnum;
simpl.
ring. }
unfold Bparse,
r.
destruct e as [ |
e |
e].
-
change (
bpow base 0)
with 1%
R.
rewrite Rmult_1_r.
exact (
BofZ_correct (
Z.pos m)).
-
destruct (
Z.ltb_spec (
Z.pos e *
Z.log2 (
Z.pos b))
emax).
+
rewrite pos_pow_spec.
rewrite <-
Z2R_Zpower by (
zify;
omega).
rewrite <-
Z2R_mult.
replace false with (
Z.pos m *
Z.pos b ^
Z.pos e <? 0).
exact (
BofZ_correct (
Z.pos m *
Z.pos b ^
Z.pos e)).
rewrite Z.ltb_ge.
rewrite Z.mul_comm.
apply Zmult_gt_0_le_0_compat.
zify;
omega.
apply (
Zpower_ge_0 base).
+
rewrite Rlt_bool_false.
auto.
eapply Rle_trans; [
idtac|
apply Rle_abs].
apply (
round_integer_overflow base).
zify;
omega.
auto.
-
destruct (
Z.ltb_spec (
Z.neg e *
Z.log2 (
Z.pos b) +
Z.log2_up (
Z.pos m))
emin).
+
rewrite round_integer_underflow;
auto.
rewrite Rlt_bool_true.
auto.
replace (
Rabs 0)%
R with 0%
R.
apply bpow_gt_0.
apply (
Z2R_abs 0).
zify;
omega.
+
generalize (
Bdiv_correct_aux prec emax prec_gt_0_ Hmax mode_NE false m 0
false (
pos_pow b e) 0).
set (
f :=
match Fdiv_core_binary prec (
Z.pos m) 0 (
Z.pos (
pos_pow b e)) 0
with
| (0,
_,
_) =>
F754_nan false 1
| (
Z.pos mz0,
ez,
lz) =>
binary_round_aux prec emax mode_NE (
xorb false false)
mz0 ez lz
| (
Z.neg _,
_,
_) =>
F754_nan false 1
end).
fold emin;
fold fexp.
rewrite !
A.
unfold cond_Zopp.
rewrite pos_pow_spec.
assert (
B: (
Z2R (
Z.pos m) /
Z2R (
Z.pos b ^
Z.pos e) =
Z2R (
Z.pos m) *
bpow base (
Z.neg e))%
R).
{
change (
Z.neg e)
with (- (
Z.pos e)).
rewrite bpow_opp.
auto. }
rewrite B.
intros [
P Q].
destruct (
Rlt_bool
(
Rabs
(
round radix2 fexp (
round_mode mode_NE)
(
Z2R (
Z.pos m) *
bpow base (
Z.neg e))))
(
bpow radix2 emax)).
*
destruct Q as (
Q1 &
Q2 &
Q3).
split.
rewrite B2R_FF2B,
Q1.
auto.
split.
rewrite is_finite_FF2B.
auto.
rewrite Bsign_FF2B.
auto.
*
rewrite B2FF_FF2B.
auto.
Qed.
End Extra_ops.
Conversions between two FP formats
Section Conversions.
Variable prec1 emax1 prec2 emax2 :
Z.
Context (
prec1_gt_0_ :
Prec_gt_0 prec1) (
prec2_gt_0_ :
Prec_gt_0 prec2).
Let emin1 := (3 -
emax1 -
prec1)%
Z.
Let fexp1 :=
FLT_exp emin1 prec1.
Let emin2 := (3 -
emax2 -
prec2)%
Z.
Let fexp2 :=
FLT_exp emin2 prec2.
Hypothesis Hmax1 : (
prec1 <
emax1)%
Z.
Hypothesis Hmax2 : (
prec2 <
emax2)%
Z.
Let binary_float1 :=
binary_float prec1 emax1.
Let binary_float2 :=
binary_float prec2 emax2.
Definition Bconv (
conv_nan:
bool ->
nan_pl prec1 ->
bool *
nan_pl prec2) (
md:
mode) (
f:
binary_float1) :
binary_float2 :=
match f with
|
B754_nan _ _ s pl =>
let '(
s,
pl) :=
conv_nan s pl in B754_nan _ _ s pl
|
B754_infinity _ _ s =>
B754_infinity _ _ s
|
B754_zero _ _ s =>
B754_zero _ _ s
|
B754_finite _ _ s m e _ =>
binary_normalize _ _ _ Hmax2 md (
cond_Zopp s (
Zpos m))
e s
end.
Theorem Bconv_correct:
forall conv_nan m f,
is_finite _ _ f =
true ->
if Rlt_bool (
Rabs (
round radix2 fexp2 (
round_mode m) (
B2R _ _ f))) (
bpow radix2 emax2)
then
B2R _ _ (
Bconv conv_nan m f) =
round radix2 fexp2 (
round_mode m) (
B2R _ _ f)
/\
is_finite _ _ (
Bconv conv_nan m f) =
true
/\
Bsign _ _ (
Bconv conv_nan m f) =
Bsign _ _ f
else
B2FF _ _ (
Bconv conv_nan m f) =
binary_overflow prec2 emax2 m (
Bsign _ _ f).
Proof.
Converting a finite FP number to higher or equal precision preserves its value.
Theorem Bconv_widen_exact:
(
prec2 >=
prec1)%
Z -> (
emax2 >=
emax1)%
Z ->
forall conv_nan m f,
is_finite _ _ f =
true ->
B2R _ _ (
Bconv conv_nan m f) =
B2R _ _ f
/\
is_finite _ _ (
Bconv conv_nan m f) =
true
/\
Bsign _ _ (
Bconv conv_nan m f) =
Bsign _ _ f.
Proof.
Conversion from integers and change of format
Theorem Bconv_BofZ:
forall conv_nan n,
integer_representable prec1 emax1 n ->
Bconv conv_nan mode_NE (
BofZ prec1 emax1 _ Hmax1 n) =
BofZ prec2 emax2 _ Hmax2 n.
Proof.
Change of format (to higher precision) and conversion to integer.
Theorem ZofB_Bconv:
prec2 >=
prec1 ->
emax2 >=
emax1 ->
forall conv_nan m f n,
ZofB _ _ f =
Some n ->
ZofB _ _ (
Bconv conv_nan m f) =
Some n.
Proof.
Theorem ZofB_range_Bconv:
forall min1 max1 min2 max2,
prec2 >=
prec1 ->
emax2 >=
emax1 ->
min2 <=
min1 ->
max1 <=
max2 ->
forall conv_nan m f n,
ZofB_range _ _ f min1 max1 =
Some n ->
ZofB_range _ _ (
Bconv conv_nan m f)
min2 max2 =
Some n.
Proof.
Change of format (to higher precision) and comparison.
Theorem Bcompare_Bconv_widen:
prec2 >=
prec1 ->
emax2 >=
emax1 ->
forall conv_nan m x y,
Bcompare _ _ (
Bconv conv_nan m x) (
Bconv conv_nan m y) =
Bcompare _ _ x y.
Proof.
End Conversions.
Section Compose_Conversions.
Variable prec1 emax1 prec2 emax2 :
Z.
Context (
prec1_gt_0_ :
Prec_gt_0 prec1) (
prec2_gt_0_ :
Prec_gt_0 prec2).
Let emin1 := (3 -
emax1 -
prec1)%
Z.
Let fexp1 :=
FLT_exp emin1 prec1.
Let emin2 := (3 -
emax2 -
prec2)%
Z.
Let fexp2 :=
FLT_exp emin2 prec2.
Hypothesis Hmax1 : (
prec1 <
emax1)%
Z.
Hypothesis Hmax2 : (
prec2 <
emax2)%
Z.
Let binary_float1 :=
binary_float prec1 emax1.
Let binary_float2 :=
binary_float prec2 emax2.
Converting to a higher precision then down to the original format
is the identity.
Theorem Bconv_narrow_widen:
prec2 >=
prec1 ->
emax2 >=
emax1 ->
forall narrow_nan widen_nan m f,
is_nan _ _ f =
false ->
Bconv prec2 emax2 prec1 emax1 _ Hmax1 narrow_nan m (
Bconv prec1 emax1 prec2 emax2 _ Hmax2 widen_nan m f) =
f.
Proof.
End Compose_Conversions.